In [None]:
import tensorflow as tf
import numpy as np
import csv
import re
import pandas as pd
import matplotlib.pyplot as plt 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Load the labelled tweet dataset (Colab-style absolute path) and preview the first rows.
df = pd.read_csv(filepath_or_buffer='/content/stock_data.csv')
df.head(n=5)

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [None]:
# Only the last expression of a cell is rendered, so a preceding `df.head()`
# would be evaluated and discarded; show the (rows, columns) shape on its own.
df.shape

(5791, 2)

In [None]:

# Pull the raw tweet texts out of the frame as an independent NumPy array.
sentences = df['Text'].to_numpy(copy=True)

In [None]:
# Display the raw tweet texts (NumPy abbreviates the middle of a long array).
sentences

array(['Kickers on my watchlist XIDE TIT SOQ PNK CPW BPZ AJ  trade method 1 or method 2, see prev posts',
       'user: AAP MOVIE. 55% return for the FEA/GEED indicator just 15 trades for the year.  AWESOME.  ',
       "user I'd be afraid to short AMZN - they are looking like a near-monopoly in eBooks and infrastructure-as-a-service",
       ...,
       'Workers at Bajaj Auto have agreed to a 10% wage cut for the period between April 15 and till the lockdown is lifted. https://t.co/RgvrKPliNd',
       '#Sharemarket LIVE: Sensex off day’s high, up 600 points, #Nifty tests 9,200, #TCS, private bank stocks lead\nhttps://t.co/3xgtLroKUI',
       "#Sensex, #Nifty climb off day's highs, still up 2%; Key factors driving D-Street higher today https://t.co/jVQcousFp6"],
      dtype=object)

In [None]:
def clean_text(text):
    """Normalise a tweet to lowercase letters and spaces.

    Steps, in order: lowercase, drop @mentions, drop http(s) URLs, drop
    ``www.`` hosts, drop dot-less "www...com" runs, then replace every
    remaining non-letter character with a single space. Whitespace is not
    collapsed, so removed spans leave runs of spaces behind.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The cleaned string containing only ``a-z`` and spaces.
    """
    text = text.lower()
    # @mentions
    text = re.sub(r'@[a-zA-Z0-9_]+', '', text)
    # Full URL up to the next whitespace, so query strings, '-' and '_' do not
    # survive as stray word fragments.
    text = re.sub(r'https?://\S+', '', text)
    # The dot is escaped: an unescaped '.' would also delete ordinary words
    # that merely contain "www" (e.g. "awwward").
    text = re.sub(r'www\.[^ ]+', '', text)
    # Alphanumeric runs containing "www" followed later by "com" (URLs written without dots).
    text = re.sub(r'[a-zA-Z0-9]*www[a-zA-Z0-9]*com[a-zA-Z0-9]*', '', text)
    # Everything that is not a letter (digits, punctuation, newlines) becomes a space.
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    return text

In [None]:
# Clean every raw tweet; a comprehension avoids leaving a stray loop
# variable (`sentence`) behind in the kernel namespace.
cleared_sentences = [clean_text(raw_sentence) for raw_sentence in sentences]

In [None]:
# Preview a handful of cleaned tweets instead of dumping the whole list into the output.
cleared_sentences[:10]

['kickers on my watchlist xide tit soq pnk cpw bpz aj  trade method   or method    see prev posts',
 'user  aap movie      return for the fea geed indicator just    trades for the year   awesome   ',
 'user i d be afraid to short amzn   they are looking like a near monopoly in ebooks and infrastructure as a service',
 'mnta over        ',
 'oi  over        ',
 'pgnx  over       ',
 'aap   user if so then the current downtrend will break  otherwise just a short term correction in med term downtrend ',
 'monday s relative weakness  nyx win tie tap ice int bmc aon c chk biib  ',
 'goog   ower trend line channel test   volume support    ',
 'aap will watch tomorrow for ong entry ',
 'i m assuming fcx opens tomorrow above the       trigger buy  still very much like this setup   ',
 'it really worries me how everyone expects the market to rally now usually exact opposite happens every time we shall see soon bac spx jpm',
 'aap gamco s arry haverty   apple is extremely cheap  great video    '

In [None]:
# Word-level tokenizer capped with num_words=5000; the Embedding layer of the
# model further down uses the same 5000 as its input dimension.
tokenizer = Tokenizer(num_words=5000)

In [None]:
# Build the vocabulary from the cleaned tweets.
# NOTE(review): this fits on all rows, including those later held out for validation.
tokenizer.fit_on_texts(cleared_sentences)

In [None]:
# Mapping from word to integer id as learned by the fitted tokenizer.
word_index = tokenizer.word_index
# Show only the first entries rather than the entire vocabulary, which would flood the output.
dict(list(word_index.items())[:20])

{'the': 1,
 'to': 2,
 'a': 3,
 'on': 4,
 'in': 5,
 'of': 6,
 'for': 7,
 'aap': 8,
 'and': 9,
 'is': 10,
 'user': 11,
 'it': 12,
 'i': 13,
 'at': 14,
 's': 15,
 'this': 16,
 'short': 17,
 'up': 18,
 'will': 19,
 'from': 20,
 'over': 21,
 'here': 22,
 'with': 23,
 'today': 24,
 'be': 25,
 'day': 26,
 'that': 27,
 'out': 28,
 'as': 29,
 'volume': 30,
 'like': 31,
 'are': 32,
 'but': 33,
 'long': 34,
 'if': 35,
 'now': 36,
 'not': 37,
 'you': 38,
 't': 39,
 'good': 40,
 'stock': 41,
 'has': 42,
 'my': 43,
 'goog': 44,
 'more': 45,
 'some': 46,
 'above': 47,
 'new': 48,
 'watch': 49,
 'bac': 50,
 'down': 51,
 'stop': 52,
 'still': 53,
 'have': 54,
 'nice': 55,
 'we': 56,
 'back': 57,
 'buy': 58,
 'after': 59,
 'next': 60,
 'move': 61,
 'market': 62,
 'coronavirus': 63,
 'higher': 64,
 'time': 65,
 'by': 66,
 'off': 67,
 'all': 68,
 'no': 69,
 'so': 70,
 'see': 71,
 'just': 72,
 'an': 73,
 'ong': 74,
 'week': 75,
 'one': 76,
 'sensex': 77,
 'or': 78,
 'triangle': 79,
 'm': 80,
 'trade': 81,


In [None]:
# Convert each cleaned tweet into a list of integer token ids using the fitted tokenizer.
sequence = tokenizer.texts_to_sequences(cleared_sentences)

In [None]:
# Pad all id lists to one common length (Keras defaults spelled out:
# zeros are added, and overlong rows cut, at the front).
padded_sequence = pad_sequences(sequences=sequence, padding='pre', truncating='pre')

In [None]:
# (number of tweets, padded sequence length); the second value is the model's input length.
padded_sequence.shape

(5791, 32)

In [None]:
# One-hot encode the sentiment column, one column per distinct Sentiment value.
# The dtype is pinned because pandas >= 2.0 returns boolean dummy columns by
# default, while the loss expects a numeric one-hot matrix.
labels = pd.get_dummies(df['Sentiment'], dtype='uint8').values

In [None]:
# Inspect the one-hot label matrix (one row per tweet).
labels

array([[0, 1],
       [0, 1],
       [0, 1],
       ...,
       [0, 1],
       [0, 1],
       [0, 1]], dtype=uint8)

In [None]:
# Sequential hold-out split of the labels: first 5000 rows train, the rest validate.
# NOTE(review): rows are taken in file order without shuffling - confirm that is intended.
training_labels, testing_labels = labels[:5000], labels[5000:]

In [None]:
# Same 5000-row boundary as the label split, so features and labels stay aligned.
training_padded, testing_padded = padded_sequence[:5000], padded_sequence[5000:]

In [None]:
# Stacked recurrent classifier: embedding -> bidirectional LSTM -> LSTM -> softmax.
model = tf.keras.models.Sequential([
    # 5000 matches the num_words cap passed to the Tokenizer.
    tf.keras.layers.Embedding(5000, 128, input_length=padded_sequence.shape[1]),
    # NOTE(review): a 0.9 rate drops 90% of the embedding channels - confirm this is intended.
    tf.keras.layers.SpatialDropout1D(0.9),
    # return_sequences=True keeps the per-timestep outputs for the next LSTM.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100, return_sequences=True)),
    # NOTE(review): same very high drop rate as above.
    tf.keras.layers.Dropout(0.9),
    tf.keras.layers.LSTM(116, dropout=0.2, recurrent_dropout=0.2),
    # One softmax unit per one-hot label column. A hard-coded 4 does not match
    # the two-column `labels` matrix and makes categorical_crossentropy fail
    # with a shape mismatch.
    tf.keras.layers.Dense(labels.shape[1], activation='softmax'),
])

In [None]:
# Configure training: Adam optimiser, categorical cross-entropy on one-hot labels, accuracy metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 32 epochs, validating on the held-out rows after each epoch.
# Left as the cell's last expression so the returned History object is displayed.
model.fit(
    x=training_padded,
    y=training_labels,
    epochs=32,
    validation_data=(testing_padded, testing_labels),
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f38938695c0>

In [None]:
# Score a single raw tweet with the trained model.
twt = ['Air India Suspends Contracts Of Around 200 Pilots Amid COVID-19: Report\nhttps://t.co/ruL87QaEfb']
# Apply the same cleaning the training text went through, so URL fragments
# and digits are not fed to the tokenizer.
twt_clean = [clean_text(raw_tweet) for raw_tweet in twt]
# Vectorize the tweet with the pre-fitted tokenizer instance.
twt_sequence = tokenizer.texts_to_sequences(twt_clean)
# Pad to the exact sequence length the model was built with; a hard-coded
# length can disagree with the training matrix.
twt_padded = pad_sequences(twt_sequence, maxlen=padded_sequence.shape[1], dtype='int32', value=0)
print(twt_padded)
sentiment = model.predict(twt_padded, batch_size=1, verbose=2)[0]
# argmax is an integer class index, so a `0 < idx < 1` test can never be true
# and "negative" was unreachable. Index 1 is the column that is set for
# Sentiment == 1 rows; index 0 is the other class.
# NOTE(review): assumes the other Sentiment value denotes negative - confirm against the dataset.
predicted_class = np.argmax(sentiment)
if predicted_class == 0:
    print("negative")
else:
    print("positive")

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0 1357  482 1129    6  209  130  616  379 1955   39  562]]
1/1 - 0s
positive


In [None]:
# Index of the highest-probability class for the tweet scored above.
np.argmax(sentiment)

1

In [None]:


# Row 5545 of the raw data is the tweet scored above; the index is past the
# 5000-row training cut, so it belongs to the validation slice.
sentences[5545]

'Air India Suspends Contracts Of Around 200 Pilots Amid COVID-19: Report\nhttps://t.co/ruL87QaEfb'