In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import re
import string
import requests
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [3]:
train_path = r"/content/drive/MyDrive/TweetData/Corona_NLP_train.csv"
test_path = r"/content/drive/MyDrive/TweetData/Corona_NLP_test.csv"

In [4]:
df_train = pd.read_csv(train_path, encoding='latin1')
df_train = df_train.iloc[:,-2:]
df_train.Sentiment = df_train.Sentiment.replace('Extremely Positive', 'Positive')
df_train.Sentiment = df_train.Sentiment.replace('Extremely Negative', 'Negative')


df_test = pd.read_csv(test_path, encoding='latin1')
df_test = df_test.iloc[:,-2:]
df_test.Sentiment = df_test.Sentiment.replace('Extremely Positive', 'Positive')
df_test.Sentiment = df_test.Sentiment.replace('Extremely Negative', 'Negative')

In [5]:
# X_train = df_train.OriginalTweet
# X_test = df_test.OriginalTweet

# y_train = df_train.Sentiment
# y_test = df_test.Sentiment

df = df_train.append(df_test, ignore_index=True)
df.head(2)

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive


In [6]:
df['Sentiment'].value_counts()

Positive    19592
Negative    17031
Neutral      8332
Name: Sentiment, dtype: int64

In [7]:
df.tail()

Unnamed: 0,OriginalTweet,Sentiment
44950,Meanwhile In A Supermarket in Israel -- People...,Positive
44951,Did you panic buy a lot of non-perishable item...,Negative
44952,Asst Prof of Economics @cconces was on @NBCPhi...,Neutral
44953,Gov need to do somethings instead of biar je r...,Negative
44954,I and @ForestandPaper members are committed to...,Positive


In [8]:
pos_data = df[df['Sentiment']=='Positive'].sample(n=8332)
neg_data = df[df['Sentiment']=='Negative'].sample(n=8332)
neu_data = df[df['Sentiment']=='Neutral']

data = pd.concat([pos_data,neg_data,neu_data],axis=0,ignore_index=True)

data

Unnamed: 0,OriginalTweet,Sentiment
0,#JantaCurfew #Covid_19india\r\r\nEven though o...,Positive
1,While we all take the necessary steps to maint...,Positive
2,"I walked into a supermarket today, some ppl la...",Positive
3,Coronavirus impact on real estate will vary by...,Positive
4,Some folks are worried about not being able to...,Positive
...,...,...
24991,The financial &amp; economic impact of COVID-1...,Neutral
24992,Waiting in a line in 1 meter distance from eac...,Neutral
24993,In light of the ongoing and rapidly evolving C...,Neutral
24994,You never eaten the pigs cat dog or food from ...,Neutral


In [9]:
data = data.sample(frac=1, axis=0).reset_index(drop=True)
data

Unnamed: 0,OriginalTweet,Sentiment
0,?NEVER will forget this active betrayal of Ame...,Neutral
1,I was watching TV and. An @ESPN as can be on a...,Neutral
2,"Oil prices skids after Saudi-Russia talks, sto...",Neutral
3,I'm getting anxious when there's 2 other peopl...,Positive
4,Can you help My Mam is living with late stage ...,Positive
...,...,...
24991,Oil prices volatile at multi-year lows amid #c...,Negative
24992,Found this at @Target \r\r\n\r\r\nYou see some...,Neutral
24993,"HUL reduces prices of Lifebuoy sanitizers, Liq...",Positive
24994,@FirstSouthYorks how about letting pensioners ...,Positive


In [10]:
data.Sentiment.value_counts()

Neutral     8332
Positive    8332
Negative    8332
Name: Sentiment, dtype: int64

In [11]:
num_classes = 3
print(len(data['Sentiment'].unique()))

3


In [12]:
class_names = ['Neutral','Positive','Negative']
print(data['Sentiment'].unique())

['Neutral' 'Positive' 'Negative']


In [13]:
data['Sentiment'] = data['Sentiment'].replace({'Neutral':0, 'Positive':1, 'Negative':2})
data

Unnamed: 0,OriginalTweet,Sentiment
0,?NEVER will forget this active betrayal of Ame...,0
1,I was watching TV and. An @ESPN as can be on a...,0
2,"Oil prices skids after Saudi-Russia talks, sto...",0
3,I'm getting anxious when there's 2 other peopl...,1
4,Can you help My Mam is living with late stage ...,1
...,...,...
24991,Oil prices volatile at multi-year lows amid #c...,2
24992,Found this at @Target \r\r\n\r\r\nYou see some...,0
24993,"HUL reduces prices of Lifebuoy sanitizers, Liq...",1
24994,@FirstSouthYorks how about letting pensioners ...,1


In [14]:
# import nltk
# from nltk.corpus import stopwords

# nltk.download('stopwords')

In [15]:
# import nltk
# nltk.download('wordnet')

In [16]:
from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
  
# lemmatizer = WordNetLemmatizer()
  
def clean_text(data):
    
    # remove hashtags and @usernames
    data = re.sub(r'http\S+', '', data)
    data = re.sub('[^a-zA-Z]', ' ', data)
    
    # Lowering Text
    data = data.lower()

    # Removing Stop-words
    # stopwords_dict = {word: 1 for word in stopwords.words("english")}
    # data = " ".join([word for word in data.split() if word not in stopwords_dict])
    
    # lemmatizing words
    # data = lemmatizer.lemmatize(data)
    
    # tokenization of sentences
    data = word_tokenize(data)
    
    return data

In [17]:
from sklearn.model_selection import train_test_split

X = data['OriginalTweet']
y = data['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [18]:
import nltk
nltk.download('punkt')
texts = [' '.join(clean_text(text)) for text in df.OriginalTweet]

texts_train = [' '.join(clean_text(text)) for text in X_train]
texts_test = [' '.join(clean_text(text)) for text in X_test]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
# preparing input to our model
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

sequence_train = tokenizer.texts_to_sequences(texts_train)
sequence_test = tokenizer.texts_to_sequences(texts_test)

index_of_words = tokenizer.word_index

# vacab size is number of unique words + reserved 0 index for padding
vocab_size = len(index_of_words) + 1

print('Number of unique words: {}'.format(len(index_of_words)))

Number of unique words: 54647


In [20]:
print(len(texts_train))
print(len(texts_test))

19996
5000


In [21]:
print(texts[0])
print(len(texts[0])) # this is number of characters NOT IMP

menyrbie phil gahan chrisitv and and
36


In [22]:
lst = []

for i in texts:
  lst.append(len(i.split()))
val = max(lst)                             # Number of Words IMP
val

65

In [23]:
# Max input length (max number of words) 
max_seq_len = val+3

from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train_pad = pad_sequences(sequence_train, maxlen = max_seq_len)
X_test_pad = pad_sequences(sequence_test, maxlen = max_seq_len )

X_train_pad

array([[    0,     0,     0, ...,  1045,     7,   443],
       [    0,     0,     0, ...,     1,   781,   207],
       [    0,     0,     0, ...,    27, 14904,   882],
       ...,
       [    0,     0,     0, ...,    14,    81,   181],
       [    0,     0,     0, ...,    65,     1,   569],
       [    0,     0,     0, ...,   314,   124,     8]], dtype=int32)

In [24]:
from tensorflow.keras.utils import to_categorical

y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [25]:
y_train

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.]], dtype=float32)

In [26]:
print(len(y_train))
print(len(y_test))

19996
5000


In [27]:
!nvidia-smi

Thu Jun  2 03:41:07 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   66C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# GRU

In [28]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Dropout, Embedding, GRU
from tensorflow.keras.callbacks import EarlyStopping

embedding_vector_features=128
max_seq_len = val+3

# embedding_vector_features=40
model=Sequential()
model.add(Embedding(vocab_size,embedding_vector_features,input_length=max_seq_len))
model.add(Dropout(0.5))

model.add(GRU(64,return_sequences=True))
model.add(Dropout(0.5))

model.add(GRU(32,return_sequences=True))
model.add(Dropout(0.5))

model.add(GRU(10))
model.add(Dropout(0.5))

model.add(Dense(3,activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 68, 128)           6994944   
                                                                 
 dropout (Dropout)           (None, 68, 128)           0         
                                                                 
 gru (GRU)                   (None, 68, 64)            37248     
                                                                 
 dropout_1 (Dropout)         (None, 68, 64)            0         
                                                                 
 gru_1 (GRU)                 (None, 68, 32)            9408      
                                                                 
 dropout_2 (Dropout)         (None, 68, 32)            0         
                                                                 
 gru_2 (GRU)                 (None, 10)                1

In [29]:
early_stop = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='min')

model.fit(X_train_pad, y_train,epochs=200,batch_size=32,validation_data=(X_test_pad, y_test),callbacks=[early_stop])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 8: early stopping


<keras.callbacks.History at 0x7f4b9009a590>

In [30]:
model.save('/content/drive/MyDrive/TweetData/FINAL_GRU_output.h5')

In [31]:
predictions = model.predict(X_test_pad)
predictions = np.argmax(predictions, axis=1)
predictions = [class_names[pred] for pred in predictions]

In [32]:
import time

message = ['stocks price dose not changed today']

seq = tokenizer.texts_to_sequences(message)
padded = pad_sequences(seq, maxlen=max_seq_len)

start_time = time.time()
pred = model.predict(padded)

print('Message: ' + str(message))
print('predicted: {} ({:.2f} seconds)'.format(class_names[np.argmax(pred)], (time.time() - start_time)))

Message: ['stocks price dose not changed today']
predicted: Neutral (0.05 seconds)


In [33]:
!python --version

Python 3.7.13


________________________________________________________________________________

# LSTM

In [34]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Dropout, Embedding, GRU
from tensorflow.keras.callbacks import EarlyStopping

embedding_vector_features=128
max_seq_len = val+3

# embedding_vector_features=40
model2=Sequential()
model2.add(Embedding(vocab_size,embedding_vector_features,input_length=max_seq_len))
model.add(Dropout(0.5))

model2.add(LSTM(64,return_sequences=True))
model.add(Dropout(0.5))

model2.add(LSTM(32,return_sequences=True))
model.add(Dropout(0.5))

model2.add(LSTM(10))
model.add(Dropout(0.5))

model2.add(Dense(3,activation='softmax'))
model2.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model2.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 68, 128)           6994944   
                                                                 
 lstm (LSTM)                 (None, 68, 64)            49408     
                                                                 
 lstm_1 (LSTM)               (None, 68, 32)            12416     
                                                                 
 lstm_2 (LSTM)               (None, 10)                1720      
                                                                 
 dense_1 (Dense)             (None, 3)                 33        
                                                                 
Total params: 7,058,521
Trainable params: 7,058,521
Non-trainable params: 0
_________________________________________________________________
None


In [35]:
early_stop = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='min')

model2.fit(X_train_pad, y_train,epochs=200,batch_size=32,validation_data=(X_test_pad, y_test),callbacks=[early_stop])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 7: early stopping


<keras.callbacks.History at 0x7f4b173af790>

In [36]:
model2.save('/content/drive/MyDrive/TweetData/FINAL_LSTM_output.h5')

In [37]:
predictions = model2.predict(X_test_pad)
predictions = np.argmax(predictions, axis=1)
predictions = [class_names[pred] for pred in predictions]

In [38]:
import time

message = ['stocks price dose not changed today']

seq = tokenizer.texts_to_sequences(message)
padded = pad_sequences(seq, maxlen=max_seq_len)

start_time = time.time()
pred = model2.predict(padded)

print('Message: ' + str(message))
print('predicted: {} ({:.2f} seconds)'.format(class_names[np.argmax(pred)], (time.time() - start_time)))

Message: ['stocks price dose not changed today']
predicted: Neutral (0.05 seconds)


_______________________________________________________________________________

# Bidirectional LSTM

In [39]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Dropout, Embedding, GRU
from tensorflow.keras.callbacks import EarlyStopping

embedding_vector_features=128
max_seq_len = val+3

# embedding_vector_features=40
model3=Sequential()
model3.add(Embedding(vocab_size,embedding_vector_features,input_length=max_seq_len))
model.add(Dropout(0.5))

model3.add(Bidirectional(LSTM(64,return_sequences=True)))
model.add(Dropout(0.5))

model3.add(Bidirectional(LSTM(32,return_sequences=True)))
model.add(Dropout(0.5))

model3.add(Bidirectional(LSTM(10)))
model.add(Dropout(0.5))

model3.add(Dense(3,activation='softmax'))
model3.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model3.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 68, 128)           6994944   
                                                                 
 bidirectional (Bidirectiona  (None, 68, 128)          98816     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 68, 64)           41216     
 nal)                                                            
                                                                 
 bidirectional_2 (Bidirectio  (None, 20)               6000      
 nal)                                                            
                                                                 
 dense_2 (Dense)             (None, 3)                 63        
                                                      

In [40]:
early_stop = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='min')

model3.fit(X_train_pad, y_train,epochs=200,batch_size=32,validation_data=(X_test_pad, y_test),callbacks=[early_stop])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 7: early stopping


<keras.callbacks.History at 0x7f4b1586f9d0>

In [41]:
model3.save('/content/drive/MyDrive/TweetData/FINAL_BI_LSTM_output.h5')

In [42]:
predictions = model3.predict(X_test_pad)
predictions = np.argmax(predictions, axis=1)
predictions = [class_names[pred] for pred in predictions]

In [43]:
import time

message = ['stocks price dose not changed today']

seq = tokenizer.texts_to_sequences(message)
padded = pad_sequences(seq, maxlen=max_seq_len)

start_time = time.time()
pred = model3.predict(padded)

print('Message: ' + str(message))
print('predicted: {} ({:.2f} seconds)'.format(class_names[np.argmax(pred)], (time.time() - start_time)))

Message: ['stocks price dose not changed today']
predicted: Neutral (0.05 seconds)


_______________________________________________________________________________

# Bidirectional GRU

In [44]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Dropout, Embedding, GRU
from tensorflow.keras.callbacks import EarlyStopping

embedding_vector_features=128
max_seq_len = val+3

# embedding_vector_features=40
model4=Sequential()
model4.add(Embedding(vocab_size,embedding_vector_features,input_length=max_seq_len))
model.add(Dropout(0.5))

model4.add(Bidirectional(GRU(64,return_sequences=True)))
model.add(Dropout(0.5))

model4.add(Bidirectional(GRU(32,return_sequences=True)))
model.add(Dropout(0.5))

model4.add(Bidirectional(GRU(10)))
model.add(Dropout(0.5))

model4.add(Dense(3,activation='softmax'))
model4.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model4.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 68, 128)           6994944   
                                                                 
 bidirectional_3 (Bidirectio  (None, 68, 128)          74496     
 nal)                                                            
                                                                 
 bidirectional_4 (Bidirectio  (None, 68, 64)           31104     
 nal)                                                            
                                                                 
 bidirectional_5 (Bidirectio  (None, 20)               4560      
 nal)                                                            
                                                                 
 dense_3 (Dense)             (None, 3)                 63        
                                                      

In [45]:
early_stop = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='min')

model4.fit(X_train_pad, y_train,epochs=200,batch_size=32,validation_data=(X_test_pad, y_test),callbacks=[early_stop])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 7: early stopping


<keras.callbacks.History at 0x7f4b15124510>

In [46]:
model4.save('/content/drive/MyDrive/TweetData/FINAL_BI_GRU_output.h5')

In [47]:
predictions = model4.predict(X_test_pad)
predictions = np.argmax(predictions, axis=1)
predictions = [class_names[pred] for pred in predictions]

In [48]:
import time

message = ['stocks price dose not changed today']

seq = tokenizer.texts_to_sequences(message)
padded = pad_sequences(seq, maxlen=max_seq_len)

start_time = time.time()
pred = model4.predict(padded)

print('Message: ' + str(message))
print('predicted: {} ({:.2f} seconds)'.format(class_names[np.argmax(pred)], (time.time() - start_time)))

Message: ['stocks price dose not changed today']
predicted: Neutral (0.05 seconds)
