In [1]:
#Importing data

import warnings
from google.colab import drive
drive.mount('/content/drive')
warnings.filterwarnings("ignore")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Using TensorFlow backend.

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU,SimpleRNN
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from tensorflow.keras.layers import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

In [3]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [4]:
train = pd.read_csv("/content/drive/MyDrive/train.tsv", sep='\t')
validation = pd.read_csv("/content/drive/MyDrive/dev.tsv", sep='\t')
test = pd.read_csv("/content/drive/MyDrive/test_task2.tsv", sep='\t')

In [5]:
train = train.rename({'label ': 'label'}, axis=1) #Rename the column label
validation = validation.rename({'label ': 'label'}, axis=1) #Rename the column label

In [6]:
train = train[~train.label.str.contains("others")]
train = train[~train.label.str.contains("disgust")]
train['label'].value_counts()

joy          1270
sadness       706
anger         600
surprise      241
fear           67
Name: label, dtype: int64

In [None]:
validation = validation[~validation.label.str.contains("others")]
validation = validation[~validation.label.str.contains("disgust")]
validation['label'].value_counts()

joy          185
sadness      103
anger         87
surprise      35
fear          10
Name: label, dtype: int64

# ***Preprocessing***

In [7]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
import nltk

In [9]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
import re,string

In [11]:
def strip_links(text):
    link_regex    = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links         = re.findall(link_regex, text)
    for link in links:
        text = text.replace(link[0], ', ')    
    return text

In [12]:
def strip_all_entities(text):
    entity_prefixes = ['@','#', '¿', '¡']
    for separator in  string.punctuation:
        if separator not in entity_prefixes :
            text = text.replace(separator,' ')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return ' '.join(words)

In [13]:
train['tweet'] = train['tweet'].apply(lambda x: strip_links(x))

In [14]:
train['tweet'] = train['tweet'].apply(lambda x: strip_all_entities(x))

In [15]:
train['tweet'] = train['tweet'].str.replace(r'HASHTAG', '', regex=True)
train['tweet'] = train['tweet'].str.replace(r'USER', '', regex=True)

In [16]:
def convert_to_lower(text):
    return text.lower()

In [17]:
train['tweet'] = train['tweet'].apply(lambda x: convert_to_lower(x))

In [18]:
def remove_numbers(text):
    number_pattern = r'\d+'
    without_number = re.sub(pattern=number_pattern, repl=" ", string=text)
    return without_number

In [19]:
train['tweet'] = train['tweet'].apply(lambda x: remove_numbers(x))

In [20]:
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

In [21]:
train['tweet'] = train['tweet'].apply(lambda x: remove_punctuation(x))

In [22]:
from nltk import word_tokenize #method that will perform text tokenization.

In [23]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
stopword_es = nltk.corpus.stopwords.words('spanish')
stopword = stopword_es

def remove_stopwords(text):
    removed = []
    tokens = word_tokenize(text)
    for i in range(len(tokens)):
        if tokens[i] not in stopword:
            removed.append(tokens[i])
    return " ".join(removed)

In [25]:
train['tweet'] = train['tweet'].apply(lambda x: remove_stopwords(x))

In [26]:
def remove_extra_white_spaces(text):
    single_char_pattern = r'\s+[a-zA-Z]\s+'
    without_sc = re.sub(pattern=single_char_pattern, repl=" ", string=text)
    return without_sc

In [27]:
train['tweet'] = train['tweet'].apply(lambda x: remove_extra_white_spaces(x))

In [28]:
pip install emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [29]:
import emoji

In [30]:
def remove_emoji(string):
    emoji_pattern = re.compile("["
                    u"\U0001F600-\U0001F64F" #emoticons
                    u"\U0001F300-\U0001F5FF" #symbols and pictographs
                    u"\U0001F680-\U0001F6FF" #transport and map symbols
                    u"\U0001F1E0-\U0001F1FF" #flags (ios)
                    u"\U00002702-\U000027B0" 
                    u"\U000024C2-\U0001F251" 
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [31]:
train['tweet'] = train['tweet'].apply(lambda x: remove_emoji(x))

In [32]:
pip install clean-text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [33]:
from cleantext import clean
train['tweet'] = train['tweet'].apply(lambda x: clean(x, no_emoji = True))



In [34]:
label_map = {
    'joy ': 0,
    'sadness ': 1,
    'anger ': 2,
    'surprise ': 3,
    'fear ': 4,
}

train['label'] = train['label'].map(label_map)
train

Unnamed: 0,id,tweet,label
0,1,atletico resignado perder,1
1,2,leer proporciona mejor vision mundo,0
2,3,amo arya stark encima todas cosas gameofthrones,0
4,5,solo siento perdido escanos cordura,1
5,6,solo ver intensidad agitan banderas ve quedado...,0
...,...,...,...
5879,5880,imposible ver melisandre podria invernalia teo...,3
5882,5883,dos coronaciones celebraron napoleon unico cas...,3
5883,5884,mientras reflexionamos duro medio nueva guerra...,2
5884,5885,"fachada catedral notre dame "" salvada "" podra ...",1


Data Preparation

In [35]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


In [36]:
xtrain, xvalid, ytrain, yvalid = train_test_split(train.tweet.values, train.label.values, 
                                                  stratify=train.label.values, 
                                                  random_state=42, 
                                                  test_size=0.3, shuffle=True)

# ***Simple RNN***

In [39]:
# using keras tokenizer here
token = text.Tokenizer(num_words=None)
max_len = 140

token.fit_on_texts(list(xtrain) + list(xvalid))
xtrain_seq = token.texts_to_sequences(xtrain)
xvalid_seq = token.texts_to_sequences(xvalid)

#zero pad the sequences
xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = sequence.pad_sequences(xvalid_seq, maxlen=max_len)

word_index = token.word_index

In [None]:
%%time
with strategy.scope():
    # A simpleRNN without any pretrained embeddings and one dense layer
    model = Sequential()
    model.add(Embedding(len(word_index) + 1,
                     300,
                     input_length=max_len))
    model.add(SimpleRNN(100))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1_m,precision_m, recall_m])
    
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 140, 300)          2391900   
                                                                 
 simple_rnn (SimpleRNN)      (None, 100)               40100     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 2,432,101
Trainable params: 2,432,101
Non-trainable params: 0
_________________________________________________________________
CPU times: user 209 ms, sys: 50.7 ms, total: 259 ms
Wall time: 477 ms


In [None]:
model.fit(xtrain_pad, ytrain, epochs=5, batch_size=64*strategy.num_replicas_in_sync) #Multiplying by Strategy to run on TPU's

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7ff501a7c9d0>

LSTM's

In [None]:
%%time
with strategy.scope():
    
    # A simple LSTM with glove embeddings and one dense layer
    model = Sequential()
    model.add(Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))

    model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1_m,precision_m, recall_m])
    
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 140, 300)          2391900   
                                                                 
 lstm (LSTM)                 (None, 100)               160400    
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 2,552,401
Trainable params: 160,501
Non-trainable params: 2,391,900
_________________________________________________________________
CPU times: user 256 ms, sys: 4.8 ms, total: 261 ms
Wall time: 253 ms


In [None]:
model.fit(xtrain_pad, ytrain, epochs=5, batch_size=64*strategy.num_replicas_in_sync)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7ff4feb20b90>

GRU's

In [None]:
%%time
with strategy.scope():
    # GRU with glove embeddings and two dense layers
     model = Sequential()
     model.add(Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
     model.add(SpatialDropout1D(0.3))
     model.add(GRU(300))
     model.add(Dense(1, activation='sigmoid'))

     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1_m,precision_m, recall_m]) 
    
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 140, 300)          2391900   
                                                                 
 spatial_dropout1d (SpatialD  (None, 140, 300)         0         
 ropout1D)                                                       
                                                                 
 gru (GRU)                   (None, 300)               540900    
                                                                 
 dense_2 (Dense)             (None, 1)                 301       
                                                                 
Total params: 2,933,101
Trainable params: 541,201
Non-trainable params: 2,391,900
_________________________________________________________________
CPU times: user 266 ms, sys: 5.07 ms, total: 271 ms
Wall time: 254 ms


In [None]:
model.fit(xtrain_pad, ytrain, epochs=5, batch_size=64*strategy.num_replicas_in_sync)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7ff4ff56d050>

Bi-Directional RNN's

In [41]:
# create an embedding matrix for the words we have in the dataset
embedding_matrix = np.zeros((len(word_index) + 1, 300))

In [42]:
%%time
with strategy.scope():
    # A simple bidirectional LSTM with glove embeddings and one dense layer
    model = Sequential()
    model.add(Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
    model.add(Bidirectional(LSTM(300, dropout=0.3, recurrent_dropout=0.3)))

    model.add(Dense(1,activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy',f1_m,precision_m, recall_m]) 
    
    
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 140, 300)          2391900   
                                                                 
 bidirectional (Bidirectiona  (None, 600)              1442400   
 l)                                                              
                                                                 
 dense (Dense)               (None, 1)                 601       
                                                                 
Total params: 3,834,901
Trainable params: 1,443,001
Non-trainable params: 2,391,900
_________________________________________________________________
CPU times: user 662 ms, sys: 28.5 ms, total: 690 ms
Wall time: 826 ms


In [43]:
model.fit(xtrain_pad, ytrain, epochs=5, batch_size=64*strategy.num_replicas_in_sync)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fd7504accd0>