In [5]:
# Mount Google Drive at /content/drive (the dataset files are read from there
# further down) and silence Python warnings.

import warnings
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): this hides every warning for the rest of the session,
# including deprecation warnings from the libraries used in this notebook.
warnings.filterwarnings("ignore")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Using TensorFlow backend.

In [6]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU,SimpleRNN
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from tensorflow.keras.layers import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

Configuring TPU's
For this version of the notebook we will be using TPUs, as we have to build a BERT model.

In [7]:
# Choose the tf.distribute strategy that matches the available hardware.
try:
    # The resolver needs no arguments when the TPU_NAME environment variable
    # is set; a ValueError here is taken to mean "no TPU available".
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # TensorFlow's default strategy: works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [15]:
# Read the three tab-separated splits from the mounted Drive in one pass.
train, validation, test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        "/content/drive/MyDrive/train.tsv",
        "/content/drive/MyDrive/dev.tsv",
        "/content/drive/MyDrive/test_task2.tsv",
    )
)

In [16]:
# The label column header carries a trailing space ('label '); rename it to 'label'.
train = train.rename(columns={'label ': 'label'})
validation = validation.rename(columns={'label ': 'label'})

In [17]:
# Remove the rows whose label contains "others" or "disgust", then show the
# class balance of what is left.
for unwanted in ("others", "disgust"):
    train = train[~train.label.str.contains(unwanted)]
train['label'].value_counts()

joy          1270
sadness       706
anger         600
surprise      241
fear           67
Name: label, dtype: int64

In [18]:
# Same filtering for the dev split: drop "others" and "disgust", then show
# the remaining class balance.
for unwanted in ("others", "disgust"):
    validation = validation[~validation.label.str.contains(unwanted)]
validation['label'].value_counts()

joy          185
sadness      103
anger         87
surprise      35
fear          10
Name: label, dtype: int64

In [19]:
# (rows, columns) of the training frame after the class filtering
train.shape

(2884, 3)

We will check the maximum number of words present in a single tweet; this will help us choose the padding length later.

In [20]:
# Largest whitespace-separated word count over all training tweets.
train['tweet'].astype(str).str.split().str.len().max()

59

In [25]:
# Integer id for each of the five emotion classes kept in the dataset.
# The keys keep the trailing space that the raw label strings are written with.
label_map = {
    'joy ': 0,
    'sadness ': 1,
    'anger ': 2,
    'surprise ': 3,
    'fear ': 4,
}

# Compare labels with surrounding whitespace stripped on both sides, so a label
# written without the trailing space is still mapped instead of silently
# becoming NaN (Series.map yields NaN for any key it cannot find).
train['label'] = train['label'].str.strip().map(
    {name.strip(): class_id for name, class_id in label_map.items()}
)
# Fail loudly here rather than feed NaN targets into training later.
assert train['label'].notna().all(), "train contains labels missing from label_map"
train

Unnamed: 0,id,tweet,label
0,1,El Atlético resignado a perder HASHTAG 😔 http...,1
1,2,Leer proporciona una mejor visión del mundo 🤓 ...,0
2,3,Amo a Arya Stark por encima de todas las cosas...,0
4,5,Solo siento que hayamos perdido 24 escaños de ...,1
5,6,Solo con ver con la intensidad que agitan las ...,0
...,...,...,...
5879,5880,IMPOSIBLE DE VER: Melisandre podría estar ya e...,3
5882,5883,¿Sabías que de las dos coronaciones que se cel...,3
5883,5884,Mientras reflexionamos sobre lo duro que es es...,2
5884,5885,La fachada de la catedral de Notre Dame fue “s...,1


In [26]:
# Integer id for each of the five emotion classes kept in the dataset.
# The keys keep the trailing space that the raw label strings are written with.
label_map = {
    'joy ': 0,
    'sadness ': 1,
    'anger ': 2,
    'surprise ': 3,
    'fear ': 4,
}

# Compare labels with surrounding whitespace stripped on both sides, so a label
# written without the trailing space is still mapped instead of silently
# becoming NaN (Series.map yields NaN for any key it cannot find).
validation['label'] = validation['label'].str.strip().map(
    {name.strip(): class_id for name, class_id in label_map.items()}
)
# Fail loudly here rather than carry NaN targets forward.
assert validation['label'].notna().all(), "validation contains labels missing from label_map"
validation

Unnamed: 0,id,tweet,label
2,3,"La literatura nos hace más empáticos, dispuest...",0
3,4,"Para mi, estas son las 4 mejores escenas de es...",3
4,5,Lo que acabo de ver es puro oro. Historia de l...,0
9,10,"Quien crea que eso es mano del Liverpool, o le...",2
10,11,"Ya tengo el ron, hielo y la Coca-Cola, solo fa...",0
...,...,...,...
846,847,"HASHTAG no más dilatación, no más mentiras, ba...",2
848,849,¿Quién les iba a decir a los HASHTAG de HASHTA...,0
849,850,Gracias a dios ocurrió lo menos malo HASHTAG,0
850,851,Messi .. El mejor de todos los tiempos 🔟 🇦🇷. U...,0


Data Preparation

Writing a function for getting auc score for validation

In [41]:
def roc_auc(predictions,target):
    '''
    Return the ROC AUC of `predictions` measured against the labels `target`.

    Two prediction layouts are accepted:

    * One score per sample, shaped (n,) or (n, 1) -- e.g. the output of a
      single sigmoid unit. Label 1 is the positive class and every other
      label counts as negative (the original behaviour of this function).
    * One probability column per class, shaped (n, n_classes) with more than
      one column -- e.g. a softmax output. The macro-averaged one-vs-rest AUC
      is returned. scikit-learn expects each row to sum to 1 and the columns
      to follow the sorted class labels found in `target`.

    Parameters
    ----------
    predictions : array-like of model scores / probabilities.
    target : array-like of true class labels, one per sample.

    Returns
    -------
    float : the area under the ROC curve.
    '''
    scores = np.asarray(predictions)

    # Multi-column input: treat as per-class probabilities.
    if scores.ndim == 2 and scores.shape[1] > 1:
        return metrics.roc_auc_score(target, scores, multi_class='ovr')

    # Single score per sample: flatten (n, 1) to (n,) and score class 1 vs rest.
    fpr, tpr, _ = metrics.roc_curve(target, scores.ravel(), pos_label=1)
    return metrics.auc(fpr, tpr)

In [28]:
# Hold out 20% of the labelled tweets for validation. The split is shuffled,
# stratified on the label, and seeded so it is reproducible.
tweets = train.tweet.values
labels = train.label.values
xtrain, xvalid, ytrain, yvalid = train_test_split(
    tweets,
    labels,
    test_size=0.2,
    shuffle=True,
    stratify=labels,
    random_state=42,
)

Simple RNN

Basic Overview
What is a RNN?

A Recurrent Neural Network (RNN) is a type of neural network in which the output from the previous step is fed as input to the current step. In traditional neural networks all inputs and outputs are independent of each other, but in tasks such as predicting the next word of a sentence the previous words are required, so the network needs a way to remember them. RNNs address this with a hidden state that carries information from one step to the next.


Why RNN's?

https://www.quora.com/Why-do-we-use-an-RNN-instead-of-a-simple-neural-network

Code Implementation

In [29]:
# Tokenise with the Keras tokenizer (no vocabulary cap) and pad every
# sequence to a fixed length.
max_len = 140
token = text.Tokenizer(num_words=None)

# The vocabulary is built from the training and the validation texts together.
token.fit_on_texts([*xtrain, *xvalid])
word_index = token.word_index

# Texts -> lists of integer word ids.
xtrain_seq, xvalid_seq = (
    token.texts_to_sequences(split) for split in (xtrain, xvalid)
)

# Zero-pad the id sequences to max_len.
xtrain_pad, xvalid_pad = (
    sequence.pad_sequences(seqs, maxlen=max_len)
    for seqs in (xtrain_seq, xvalid_seq)
)

In [33]:
%%time
with strategy.scope():
    # Baseline: embeddings learned from scratch -> SimpleRNN(100) -> one sigmoid unit.
    # NOTE(review): ytrain holds five class ids (0-4, see label_map) while this
    # head is a single sigmoid unit trained with binary cross-entropy -- confirm
    # this is intended; a 5-way softmax head is the usual setup for such labels.
    model = Sequential([
        Embedding(len(word_index) + 1, 300, input_length=max_len),
        SimpleRNN(100),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 140, 300)          3219900   
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 100)               40100     
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 3,260,101
Trainable params: 3,260,101
Non-trainable params: 0
_________________________________________________________________
CPU times: user 164 ms, sys: 7.39 ms, total: 172 ms
Wall time: 208 ms


In [35]:
# Train for 7 epochs on the padded training sequences (no validation data is passed to fit).
model.fit(xtrain_pad, ytrain, epochs=7, batch_size=64*strategy.num_replicas_in_sync) # base batch of 64 scaled by the number of replicas in sync

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x7f472693be90>

In [43]:
# Predict on the held-out split and report the AUC.
scores = model.predict(xvalid_pad)
# AUC is a value between 0 and 1, not a percentage, so no '%' sign is printed.
print("Auc: %.2f" % (roc_auc(scores,yvalid)))


Auc: 0.58%


In [44]:
# Start the per-model results list with the SimpleRNN validation AUC.
scores_model = [{'Model': 'SimpleRNN', 'AUC_Score': roc_auc(scores, yvalid)}]

Code Explanation

In an RNN we input a sentence word by word. We represent every word as a one-hot vector of dimension: number of words in the vocabulary + 1.
The Keras Tokenizer takes all the unique words in the corpus and forms a dictionary with the words as keys and their number of occurrences as values. It then sorts the dictionary in descending order of counts and assigns index 1 to the first word, index 2 to the second, and so on.

Word Embeddings

While building our simple RNN model we talked about using word embeddings. So what are word embeddings, and how do we get them? Here is the answer:

The latest approach to getting word embeddings is to use pretrained GloVe or fastText vectors. Without going into too much detail, I will explain how to create sentence vectors and how we can build a machine learning model on top of them. I am a fan of GloVe, word2vec and fastText; in this notebook I'll be using the GloVe vectors. You can download the GloVe vectors from http://www-nlp.stanford.edu/data/glove.840B.300d.zip, or you can search for GloVe in the datasets on Kaggle and add the file.

In [45]:
# Load the GloVe vectors into a dictionary: token -> numpy vector of coefficients.
# NOTE(review): this is a Kaggle input path, while this notebook mounts Google
# Drive; the recorded run of this cell ended in FileNotFoundError, which leaves
# embeddings_index empty. Point the path at wherever the file actually lives.

embeddings_index = {}
# The context manager closes the file even if a line fails to parse.
with open('/kaggle/input/glove840b300dtxt/glove.840B.300d.txt','r',encoding='utf-8') as f:
    for line in tqdm(f):
        # Split from the right so that exactly the last 300 fields are taken as
        # the vector; a token that itself contains spaces then stays intact
        # instead of breaking the float conversion.
        values = line.rstrip('\n').rsplit(' ', 300)
        word = values[0]
        coefs = np.asarray([float(val) for val in values[1:]])
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

FileNotFoundError: ignored

LSTM's

Basic Overview
Simple RNNs were certainly better than classical ML algorithms and gave state-of-the-art results, but they failed to capture the long-term dependencies present in sentences. So in 1997 LSTMs were introduced to counter these drawbacks.

Code Implementation
We have already tokenized and padded our text for input to the LSTM.

In [46]:
# Build the (vocabulary size + 1, 300) embedding matrix for the words in the dataset.
# Row i receives the pretrained vector of the word whose tokenizer index is i;
# words absent from embeddings_index keep their all-zero row.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
n_found = 0
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        n_found += 1

# Report coverage explicitly. print is used rather than warnings.warn because
# warnings are globally silenced earlier in this notebook.
print('Pretrained vectors found for %d of %d words.' % (n_found, len(word_index)))
if n_found == 0:
    # The models below load this matrix with trainable=False, so an all-zero
    # matrix gives every token the same (zero) representation.
    print('WARNING: embedding_matrix is all zeros - check that the GloVe file was loaded.')

100%|██████████| 10732/10732 [00:00<00:00, 985814.38it/s]


In [47]:
%%time
with strategy.scope():
    # Frozen embeddings taken from embedding_matrix -> LSTM(100) with input and
    # recurrent dropout -> one sigmoid unit.
    # NOTE(review): ytrain holds five class ids (0-4, see label_map) while this
    # head is a single sigmoid unit trained with binary cross-entropy -- confirm
    # this is intended; a 5-way softmax head is the usual setup for such labels.
    model = Sequential([
        Embedding(
            len(word_index) + 1,
            300,
            weights=[embedding_matrix],
            input_length=max_len,
            trainable=False,
        ),
        LSTM(100, dropout=0.3, recurrent_dropout=0.3),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 140, 300)          3219900   
                                                                 
 lstm (LSTM)                 (None, 100)               160400    
                                                                 
 dense_2 (Dense)             (None, 1)                 101       
                                                                 
Total params: 3,380,401
Trainable params: 160,501
Non-trainable params: 3,219,900
_________________________________________________________________
CPU times: user 221 ms, sys: 3.1 ms, total: 224 ms
Wall time: 216 ms


In [49]:
# Train the LSTM for 7 epochs; the base batch of 64 is scaled by the number of replicas in sync.
model.fit(xtrain_pad, ytrain, epochs=7, batch_size=64*strategy.num_replicas_in_sync)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x7f47269d1790>

In [50]:
# Predict on the held-out split and report the AUC.
scores = model.predict(xvalid_pad)
# AUC is a value between 0 and 1, not a percentage, so no '%' sign is printed.
print("Auc: %.2f" % (roc_auc(scores,yvalid)))

Auc: 0.51%


In [51]:
# Record the LSTM's validation AUC in the results list.
scores_model.append({'Model': 'LSTM','AUC_Score': roc_auc(scores,yvalid)})

GRU's

Basic Overview
Introduced by Cho et al. in 2014, the GRU (Gated Recurrent Unit) aims to solve the vanishing gradient problem that comes with a standard recurrent neural network. GRUs are a variation on the LSTM: both are designed similarly, and GRUs were designed to be simpler and faster than LSTMs. In most cases the two produce equally good results, so there is no clear winner.

In [52]:
%%time
with strategy.scope():
    # Frozen embeddings taken from embedding_matrix -> spatial dropout ->
    # GRU(300) -> one sigmoid unit (a single dense layer).
    # NOTE(review): ytrain holds five class ids (0-4, see label_map) while this
    # head is a single sigmoid unit trained with binary cross-entropy -- confirm
    # this is intended; a 5-way softmax head is the usual setup for such labels.
    model = Sequential([
        Embedding(
            len(word_index) + 1,
            300,
            weights=[embedding_matrix],
            input_length=max_len,
            trainable=False,
        ),
        SpatialDropout1D(0.3),
        GRU(300),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 140, 300)          3219900   
                                                                 
 spatial_dropout1d (SpatialD  (None, 140, 300)         0         
 ropout1D)                                                       
                                                                 
 gru (GRU)                   (None, 300)               540900    
                                                                 
 dense_3 (Dense)             (None, 1)                 301       
                                                                 
Total params: 3,761,101
Trainable params: 541,201
Non-trainable params: 3,219,900
_________________________________________________________________
CPU times: user 237 ms, sys: 14.2 ms, total: 251 ms
Wall time: 231 ms


In [54]:
# Train the GRU for 7 epochs; the base batch of 64 is scaled by the number of replicas in sync.
model.fit(xtrain_pad, ytrain, epochs=7, batch_size=64*strategy.num_replicas_in_sync)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x7f472605bf90>

In [55]:
# Predict on the held-out split and report the AUC.
scores = model.predict(xvalid_pad)
# AUC is a value between 0 and 1, not a percentage, so no '%' sign is printed.
print("Auc: %.2f" % (roc_auc(scores,yvalid)))

Auc: 0.50%


In [56]:
# Record the GRU's validation AUC in the results list.
scores_model.append({'Model': 'GRU','AUC_Score': roc_auc(scores,yvalid)})

In [57]:
# Display the results collected so far (one dict per model).
scores_model

[{'Model': 'SimpleRNN', 'AUC_Score': 0.5789657752618909},
 {'Model': 'LSTM', 'AUC_Score': 0.5095809746893096},
 {'Model': 'GRU', 'AUC_Score': 0.5}]

Bi-Directional RNN's

Code Implementation

In [58]:
%%time
with strategy.scope():
    # Frozen embeddings taken from embedding_matrix -> bidirectional LSTM(300)
    # with input and recurrent dropout -> one sigmoid unit.
    # NOTE(review): ytrain holds five class ids (0-4, see label_map) while this
    # head is a single sigmoid unit trained with binary cross-entropy -- confirm
    # this is intended; a 5-way softmax head is the usual setup for such labels.
    model = Sequential([
        Embedding(
            len(word_index) + 1,
            300,
            weights=[embedding_matrix],
            input_length=max_len,
            trainable=False,
        ),
        Bidirectional(LSTM(300, dropout=0.3, recurrent_dropout=0.3)),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 140, 300)          3219900   
                                                                 
 bidirectional (Bidirectiona  (None, 600)              1442400   
 l)                                                              
                                                                 
 dense_4 (Dense)             (None, 1)                 601       
                                                                 
Total params: 4,662,901
Trainable params: 1,443,001
Non-trainable params: 3,219,900
_________________________________________________________________
CPU times: user 390 ms, sys: 9.7 ms, total: 399 ms
Wall time: 375 ms


In [59]:
# Train the bidirectional LSTM for 5 epochs; the base batch of 64 is scaled by the number of replicas in sync.
model.fit(xtrain_pad, ytrain, epochs=5, batch_size=64*strategy.num_replicas_in_sync)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f4721e59e90>

In [60]:
# Predict on the held-out split and report the AUC.
scores = model.predict(xvalid_pad)
# AUC is a value between 0 and 1, not a percentage, so no '%' sign is printed.
print("Auc: %.2f" % (roc_auc(scores,yvalid)))

Auc: 0.50%


In [61]:
# Record the bidirectional LSTM's validation AUC in the results list.
scores_model.append({'Model': 'Bi-directional LSTM','AUC_Score': roc_auc(scores,yvalid)})

In [62]:
# Tabulate the collected AUC scores, best model first, and shade the table
# with a blue gradient.
results = pd.DataFrame(scores_model)
results = results.sort_values('AUC_Score', ascending=False)
results.style.background_gradient(cmap='Blues')

Unnamed: 0,Model,AUC_Score
0,SimpleRNN,0.578966
1,LSTM,0.509581
3,Bi-directional LSTM,0.501147
2,GRU,0.5


In [63]:
# Funnel-area chart of the validation AUC obtained by each model.
# The title now describes what is plotted (AUC per model); it previously
# referred to a sentiment distribution, which this figure does not show.
fig = go.Figure(go.Funnelarea(
    text=results.Model,
    values=results.AUC_Score,
    title={"position": "top center", "text": "Funnel-Chart of AUC Score by Model"}
))
fig.show()