# Importing important libraries and loading data 

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
import matplotlib.pyplot as plt  
from sklearn.model_selection import train_test_split
from string import punctuation
import re
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
import sklearn.metrics
from keras.preprocessing.sequence import pad_sequences
import tensorflow_addons.metrics
from nltk.corpus import stopwords

%matplotlib inline

msg=pd.read_csv("ps5_tweets_text.csv")
label=pd.read_csv("ps5_tweets_labels_as_numbers.csv")


Using TensorFlow backend.


# Data Exploration

In [2]:
#looking at tweets to see what all needs to be done for cleaning 
msg['Tweet'].iloc[0]

'https://t.co/UpjxfOgQs8\r\r\n\r\r\nGaisss! Please read this,and please limit yourself to go outside and please,please..always wash your hands,always use the hand sanitizer. \r\r\n\r\r\nAnd please get ready to stock up the food.'

In [3]:
#Counts of all classes to check imbalance
label['Label'].value_counts()

3    10282
1     8930
2     6930
4     5953
0     4946
Name: Label, dtype: int64

# Train-test split

In [4]:
#joining data to form train and test set
full_data=pd.concat([msg,label[['Label']]],axis=1,join='inner')

In [5]:
#splitting data into train,validation and test sets
X_train, X_test, y_train, y_test = train_test_split( full_data[['Tweet']], full_data[['Label']], test_size=0.20, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

# Data Preprocessing

In [6]:
#cleaning train set tweets by removing URLs, usernames and '#', and converting all words to lowercase
train_data=[]
for i in range(len(X_train)):

    temp = X_train['Tweet'].iloc[i].lower() # converting text to lower-case
    temp = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '',temp) # removing URLs
    temp = re.sub('@[^\s]+', '', temp) # removing usernames
    temp = re.sub(r'#([^\s]+)', '', temp) # removing the # in #hashtag
    temp = word_tokenize(temp)        #splitting string into list of tokens
    train_data.append(temp) 

In [7]:
#removing stop words from tweets
stopwords=stopwords.words('english')+list(punctuation)
for i in range(len(train_data)):  
    train_data[i] = [word for word in  train_data[i] if word not in stopwords]

# Calculating vocabulary

In [8]:
#making vocab of all words in training data
all_words=[]
for i in range(len(train_data)):
    for j in range(len(train_data[i])):
        all_words.append(train_data[i][j])

wordlist = nltk.FreqDist(all_words)
word_features = wordlist.keys()


In [9]:
#vocab size
len(word_features)

28427

# Tokenizing data

In [10]:
#using tensorflow tokenizer to vectorize words and fitting it to train set to form embedding
tokenizer=Tokenizer(num_words=len(word_features))
tokenizer.fit_on_texts(train_data)

In [11]:
#using tokenizer converting train data to one hot vectors
one_hot_res=tokenizer.texts_to_matrix(train_data,mode='binary')

In [12]:
one_hot_res.shape

(22224, 28427)

# Training models


 **Model 1 - Multinomial Naive Bayes classifier**

In [13]:
clf = MultinomialNB()
clf.fit(one_hot_res, y_train['Label'])

MultinomialNB()

In [14]:
#accuracy for train set
clf.score(one_hot_res,y_train)

0.765703743700504

In [15]:
#applying same transformations to validation and test set
val_data=[]
for i in range(len(X_val)):

    temp = X_val['Tweet'].iloc[i].lower() # converting text to lower-case
    temp = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '',temp) # removing URLs
    temp = re.sub('@[^\s]+', '', temp) # removing usernames
    temp = re.sub(r'#([^\s]+)', '', temp) # removing the # in #hashtag
    temp = word_tokenize(temp) #splitting string into list of tokens
    val_data.append(temp)
    
test_data=[]
for i in range(len(X_test)):

    temp = X_test['Tweet'].iloc[i].lower() # converting text to lower-case
    temp = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '',temp) # removing URLs
    temp = re.sub('@[^\s]+', '', temp) # removing usernames
    temp = re.sub(r'#([^\s]+)', '', temp) # removing the # in #hashtag
    temp = word_tokenize(temp) #splitting string into list of tokens
    test_data.append(temp) 

In [16]:
for i in range(len(val_data)):  
    val_data[i] = [word for word in  val_data[i] if word not in stopwords]

for i in range(len(test_data)):  
    test_data[i] = [word for word in  test_data[i] if word not in stopwords]

In [17]:
one_hot_val=tokenizer.texts_to_matrix(val_data,mode='binary')
one_hot_test=tokenizer.texts_to_matrix(test_data,mode='binary')

In [18]:
#predictions of Multinomial Naive Bayes Classifier models with validation and test set
y_pred_val1=clf.predict(one_hot_val)
y_pred_test1=clf.predict(one_hot_test)

In [19]:
#Macro F1 score for validation set
f1macro_MNB_val=f1_score(y_val, y_pred_val1, average='macro')
f1macro_MNB_val

0.44377646382609964

In [20]:
#Micro F1 score for validation set
f1micro_MNB_val=f1_score(y_val, y_pred_val1, average='micro')
f1micro_MNB_val

0.4619330453563715

In [21]:
#accuracy for validation set
acc_MNB_val=clf.score(one_hot_val,y_val)
acc_MNB_val

0.4619330453563715

In [22]:
#Macro F1 score for test set
f1macro_MNB_test=f1_score(y_test, y_pred_test1, average='macro')
f1macro_MNB_test

0.44208616742657725

In [23]:
#Micro F1 score for test set
f1micro_MNB_test=f1_score(y_test, y_pred_test1, average='micro')
f1micro_MNB_test

0.45836145228775815

In [24]:
#accuracy for test set
acc_MNB_test=clf.score(one_hot_test,y_test)
acc_MNB_test

0.45836145228775815

**Model 2 - Complement Naive Bayes Classifier**

In [25]:
clf2 = sklearn.naive_bayes.ComplementNB()
clf2.fit(one_hot_res, y_train['Label'])

ComplementNB()

In [26]:
#accuracy for train set 
clf2.score(one_hot_res,y_train)

0.8103851691864651

In [27]:
#predictions of Complement Naive Bayes Classifier models with validation and test set
y_pred_val2=clf2.predict(one_hot_val)
y_pred_test2=clf2.predict(one_hot_test)

In [28]:
#Macro F1 score for validation set
f1macro_CNB_val=f1_score(y_val, y_pred_val2, average='macro')
f1macro_CNB_val

0.4532259942099035

In [29]:
#Micro F1 score for validation set
f1micro_CNB_val=f1_score(y_val, y_pred_val2, average='micro')
f1micro_CNB_val

0.44681425485961124

In [30]:
#accuracy for validation set
acc_CNB_val=clf.score(one_hot_val,y_val)
acc_CNB_val

0.4619330453563715

In [31]:
#Macro F1 score for test set
f1macro_CNB_test=f1_score(y_test, y_pred_test2, average='macro')
f1macro_CNB_test

0.45424342055810707

In [32]:
#Micro F1 score for test set
f1micro_CNB_test=f1_score(y_test, y_pred_test2, average='micro')
f1micro_CNB_test

0.44715886084491835

In [33]:
#accuracy for test set
acc_CNB_test=clf.score(one_hot_test,y_test)
acc_CNB_test

0.45836145228775815

**Model 3 - RNN architecture 1**

In [34]:
#Train test split for RNN, no validation set because cross validation split will be done later
X_train, X_test, y_train, y_test = train_test_split( full_data[['Tweet']], full_data[['Label']], test_size=0.20, random_state=42)

In [35]:
#applying same transformations for new train set and test set
train_data=[]
for i in range(len(X_train)):

    temp = X_train['Tweet'].iloc[i].lower() # convert text to lower-case
    temp = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '',temp) # remove URLs
    temp = re.sub('@[^\s]+', '', temp) # remove usernames
    temp = re.sub(r'#([^\s]+)', '', temp) # remove the # in #hashtag
    temp = word_tokenize(temp)  #splitting string into list of tokens
    train_data.append(temp)

test_data=[]
for i in range(len(X_test)):

    temp = X_test['Tweet'].iloc[i].lower() # convert text to lower-case
    temp = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '',temp) # remove URLs
    temp = re.sub('@[^\s]+', '', temp) # remove usernames
    temp = re.sub(r'#([^\s]+)', '', temp) # remove the # in #hashtag
    temp = word_tokenize(temp)  #splitting string into list of tokens
    test_data.append(temp)


In [36]:
for i in range(len(train_data)):  
    train_data[i] = [word for word in  train_data[i] if word not in stopwords]

for i in range(len(test_data)):  
    test_data[i] = [word for word in  test_data[i] if word not in stopwords]

# Calculating vocab again

In [37]:
all_words=[]
for i in range(len(train_data)):
    for j in range(len(train_data[i])):
        all_words.append(train_data[i][j])

wordlist = nltk.FreqDist(all_words)
word_features = wordlist.keys()
len(word_features)

33276

# Tokenizing data

In [38]:
tokenizer=Tokenizer(num_words=len(word_features))
tokenizer.fit_on_texts(train_data)

In [39]:
#converting train and test data to sequence of numbers
train_data_final=tokenizer.texts_to_sequences(train_data)
test_data_final=tokenizer.texts_to_sequences(test_data)

# Padding train and test set

In [40]:
#making length of train and test data instances uniform by padding with zeros
maxlen = 100
X_train = pad_sequences(train_data_final, padding='post', maxlen=maxlen)
X_test = pad_sequences(test_data_final, padding='post', maxlen=maxlen)

In [41]:
#converting target labels into one hot vectors
y_train_one_hot=tf.one_hot(y_train['Label'],5)
y_test_one_hot=tf.one_hot(y_test['Label'],5)

# Training the model 

In [42]:
#Trying the RNN architecture with one LSTM layer and one Dense layer
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=len(word_features),
        output_dim=250,
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100)),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(5,activation='softmax')
    
])

In [43]:
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy',tensorflow_addons.metrics.F1Score(num_classes=5,average='macro',name='F1_macro'),
              tensorflow_addons.metrics.F1Score(num_classes=5,average='micro',name='F1_micro')])

In [44]:
model.fit(X_train, y_train_one_hot,epochs=4,validation_split=0.2)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x27987f89ec8>

# Evaluating model

In [45]:
model1_loss,model1_acc,model1_f1macro,model1_f1micro=model.evaluate(X_test,y_test_one_hot)



In [46]:
#predicting labels for test set 
y_test_pred=np.argmax(model.predict(X_test),axis=-1)

In [47]:
y_test_pred

array([4, 2, 3, ..., 2, 1, 1], dtype=int64)

In [48]:
#confusion matrix
tf.math.confusion_matrix(y_test['Label'], y_test_pred, num_classes=5, weights=None, dtype=tf.int32)

<tf.Tensor: shape=(5, 5), dtype=int32, numpy=
array([[ 580,  340,   24,   26,    5],
       [ 145, 1123,  230,  262,   10],
       [   3,  152, 1060,  212,    5],
       [  15,  253,  162, 1463,  136],
       [   3,   24,    8,  394,  774]])>

**Model 2 - RNN architecture 2**

# Training the model

In [49]:
#Second RNN model with two LSTM layers and a dropout layer
model2 = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=len(word_features),
        output_dim=250,
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100,return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100)),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(5,activation='softmax')
    
])

In [50]:
model2.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy',tensorflow_addons.metrics.F1Score(num_classes=5,average='macro',name='F1_macro'),
              tensorflow_addons.metrics.F1Score(num_classes=5,average='micro',name='F1_micro')])

In [51]:
model2.fit(X_train, y_train_one_hot,epochs=3,validation_split=0.2)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x279893abc48>

# Evaluating the model

In [52]:
model2_loss,model2_acc,model2_f1macro,model2_f1micro=model2.evaluate(X_test,y_test_one_hot)



In [53]:
#predicting labels for test set 
y_test_pred2=np.argmax(model2.predict(X_test),axis=-1)

In [54]:
#confusion matrix
tf.math.confusion_matrix(y_test['Label'], y_test_pred2, num_classes=5, weights=None, dtype=tf.int32)

<tf.Tensor: shape=(5, 5), dtype=int32, numpy=
array([[ 535,  398,   12,   26,    4],
       [  92, 1209,  234,  229,    6],
       [   1,  179, 1079,  171,    2],
       [  13,  213,  243, 1449,  111],
       [   2,   22,   12,  382,  785]])>

# Comparing models


In [55]:
cols=['Accuracy','F1_macro','F1_micro']
index=['Multinomial Naive Bayes','Complement Naive Bayes','RNN model 1','RNN model 2']
acc=[acc_MNB_test,acc_CNB_test,model1_acc,model2_acc]
f1macro=[f1macro_MNB_test,f1macro_CNB_test,model1_f1macro,model2_f1macro]
f1micro=[f1micro_MNB_test,f1micro_CNB_test,model1_f1micro,model2_f1micro]
df=pd.DataFrame([acc,f1macro,f1micro],columns=index,index=cols)
df

Unnamed: 0,Multinomial Naive Bayes,Complement Naive Bayes,RNN model 1,RNN model 2
Accuracy,0.458361,0.458361,0.674855,0.682548
F1_macro,0.442086,0.454243,0.681447,0.687096
F1_micro,0.458361,0.447159,0.674855,0.682548
