# Deep learning for sentiment analysis
**Task**
- Preprocessing data
- Load pretrained embedding
- Build model
- Optimize model with lr_scheduler, early stopping

# Download data

## Dataset

In [1]:
!gdown --id 1WcbL14nk0kJ1L89Dyy8qW9rl6KM3g5lw
!gdown --id 1qOS5qiFOiDYiLUSWXk77jLqvbfLt5Mqn

Downloading...
From: https://drive.google.com/uc?id=1WcbL14nk0kJ1L89Dyy8qW9rl6KM3g5lw
To: /content/test.tsv
100% 3.37M/3.37M [00:00<00:00, 109MB/s]
Downloading...
From: https://drive.google.com/uc?id=1qOS5qiFOiDYiLUSWXk77jLqvbfLt5Mqn
To: /content/train.tsv
100% 8.48M/8.48M [00:00<00:00, 32.2MB/s]


## Word Embedding [Source](https://fasttext.cc/docs/en/english-vectors.html)

In [2]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
!unzip crawl-300d-2M.vec.zip

--2021-11-03 13:54:43--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.74.142, 104.22.75.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1523785255 (1.4G) [application/zip]
Saving to: ‘crawl-300d-2M.vec.zip’


2021-11-03 13:55:35 (28.2 MB/s) - ‘crawl-300d-2M.vec.zip’ saved [1523785255/1523785255]

Archive:  crawl-300d-2M.vec.zip
  inflating: crawl-300d-2M.vec       


# Import requirements

In [3]:
import os
import tqdm
import numpy as np  
import pandas as pd 

import tensorflow as tf
import keras
from keras.models import Model
from keras.layers import *
from keras.callbacks import EarlyStopping
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [4]:
# To get reproducible results
np.random.seed(0)
tf.random.set_seed(0)

# Preprocessing data

In [5]:
test_file = 'test.tsv'
test = pd.read_csv(test_file, delimiter='\t').fillna('')
x_test = test.values[:, 2]

train_file = 'train.tsv'
train = pd.read_csv(train_file, delimiter='\t').fillna('')
x_train = train.values[:, 2]
y_train = train.values[:, 3]

print('x_test count: {}'.format(len(x_test)))
print('x_train count: {}'.format(len(x_train)))
print('y_train count: {}'.format(len(y_train)))

x_test count: 66292
x_train count: 156060
y_train count: 156060


In [6]:
max_length = 60
max_features = 20000

x_all = []
x_all.extend(x_test)
x_all.extend(x_train)

np_x_train = None  # ndarray, token sequences
np_x_test = None # ndarray, token sequences
np_y_train = None # # ndarray, label index

tk = Tokenizer(num_words=max_features, lower=True, filters='\n\t')
# PREPROCESS DATA
# START CODE HERE

tk.fit_on_texts(x_all)
x_train_seq = tk.texts_to_sequences(x_train)
x_test_seq = tk.texts_to_sequences(x_test)

np_x_train = pad_sequences(x_train_seq, maxlen=max_length,  padding='post')
np_x_test = pad_sequences(x_test_seq, maxlen=max_length,  padding='post')
np_y_train = to_categorical(y_train)

# END CODE HERE

    #                                                           #
    #  viết các câu lệnh để in ra EXPECTED OUTPUT như bên dưới  #
    #                                                           # 
print ('np_x_train shape: {}'.format(np_x_train.shape))
print ('np_x_test shape: {}'.format(np_x_test.shape))
print ('np_y_train shape: {}'.format(np_y_train.shape))
## EXPECTED OUTPUT:
# np_x_train shape: (156060, 60)
# np_x_test shape: (66292, 60)
# np_y_train shape: (156060, 5)

np_x_train shape: (156060, 60)
np_x_test shape: (66292, 60)
np_y_train shape: (156060, 5)


# Load pretrained embedding metrix

In [7]:
word_dict = tk.word_index
embedding_dim = 300
embeddings_index = {}

with open('crawl-300d-2M.vec', 'r') as f:
    lines = f.readlines()

max_features = min(max_features, len(word_dict) + 1)
embedding_matrix = np.zeros((max_features, embedding_dim))

## LOAD EMBEDDING METRIX TO embedding_matrix
# START CODE HERE

for i in tqdm.tqdm(range(len(lines))):
    values = lines[i].rstrip().rsplit(' ')
    word = values[0]
    embeddings_index[word] = np.asarray(values[1:], dtype='float32')

for word, i in word_dict.items():
    if i >= max_features:
        break

    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# END CODE HERE
print('Embedding matrix: {}'.format(embedding_matrix.shape))
## EXPECTED OUTPUT:
# Embedding matrix: (19479, 300)

100%|██████████| 1999996/1999996 [02:27<00:00, 13525.31it/s]

Embedding matrix: (19479, 300)





# Build model

In [8]:
def one_input_classifier(index, input_length, max_features, class_num, embedding_dim, embedding_matrix):
    inputs = Input(shape=(input_length,), name='input_1')
    embeddings = Embedding(max_features, embedding_dim,
                           weights=[embedding_matrix], input_length=input_length,
                           trainable=False, name='embedding_1')(inputs)

    ## DEFINE YOUR MODEL (Functional API)
    # START CODE HERE
    x = SpatialDropout1D(0.3, name='spatial_dropout1d_1')(embeddings)

    x = Bidirectional(CuDNNLSTM(128, name='lstm_1', return_sequences=True), name='bidirectional_1')(x)
    #                                                                                              #
    #  các bạn viết tiếp đoạn này, tham khảo https://www.kaggle.com/antmarakis/bi-lstm-conv-layer  #
    #                                                                                              # 
    ## END CODE HERE
    x = Dropout(0.25, name='dropout_1')(x)
    x = Conv1D(128, 5, activation='relu', name='conv1d_1')(x)
    x = Conv1D(128, 3, activation='relu', name='conv1d_2')(x)
    x = Conv1D(128, 1, activation='relu', name='conv1d_3')(x)
    x = Dropout(0.25, name='dropout_2')(x)

    x = GlobalMaxPooling1D(name='global_maxpool1d_1')(x)
    x = Dense(32, activation='relu', name='dense_1')(x)
    x = Dropout(0.25, name='dropout_5')(x)

    preds = Dense(class_num, activation='softmax', name='preds')(x)

    model = Model(inputs=inputs, outputs=preds, name='model_{}'.format(index))
    model.compile(optimizer='nadam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


class_num = np_y_train.shape[1]
epochs = 32
batch_size = 1024
validation_split = 0.2
classifier_num = 1

print('Classes: {}'.format(class_num))
print('Epochs: {}'.format(epochs))
print('Batch size: {}'.format(batch_size))
print('Validation split: {:.1}'.format(validation_split))
print('Classifiers: {}'.format(classifier_num))

# EXPECTED OUTPUT: 
# Classes: 5
# Epochs: 32
# Batch size: 1024
# Validation split: 0.2
# Classifiers: 10

Classes: 5
Epochs: 32
Batch size: 1024
Validation split: 0.2
Classifiers: 1


# Training model

In [9]:
classifiers = []
model_index = list(range(classifier_num))  # for ensemble only, kfold split - do it yourself

for i in model_index:
    classifier = one_input_classifier(i, max_length, max_features, class_num, embedding_dim, embedding_matrix)

    if i == 0:
        classifier.summary()

    hist = classifier.fit(np_x_train, np_y_train, validation_split=validation_split, shuffle=True,
                          callbacks=[], epochs=epochs, batch_size=batch_size, verbose=1)
    classifier.trainable = False
    classifiers.append(classifier)

    print('min loss ({}): {:.4}'.format(i, min(hist.history['loss'])))
    print('min val_loss ({}): {:.4}'.format(i, min(hist.history['val_loss'])))
    print('max accuracy ({}): {:.4}'.format(i, max(hist.history['accuracy'])))
    print('max val_accuracy ({}): {:.4}'.format(i, max(hist.history['val_accuracy'])))

# EXPECTED VALID ACC FOR SINGLE MODEL >= 0.64

Model: "model_0"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 60)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 60, 300)           5843700   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 60, 300)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 60, 256)           440320    
_________________________________________________________________
dropout_1 (Dropout)          (None, 60, 256)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 56, 128)           163968    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 54, 128)           4928

## Optimize with lr_scheduler and early stopping

In [10]:
lr_scheduler = None
early_stopping = None
# DEFINE YOUR lr_scheduler AND early_stopping callbacks
# START CODE HERE
def scheduler(epoch, lr):
    # EXPERIMENT YOURSELF
    decay_rate = 0.1
    decay_step = 90
    if epoch % decay_step == 0 and epoch:
        return lr * decay_rate
    return lr


lr_scheduler = keras.callbacks.LearningRateScheduler(scheduler, verbose=1)

early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=1,
                               mode='min', baseline=None, restore_best_weights=True)
# END CODE HERE

In [11]:
classifiers = []
model_index = list(range(classifier_num))  # for ensemble, kfold split - do it yourself

for i in model_index:
    classifier = one_input_classifier(i, max_length, max_features, class_num, embedding_dim, embedding_matrix)

    if i == 0:
        classifier.summary()

    hist = classifier.fit(np_x_train, np_y_train, validation_split=validation_split, shuffle=True,
                          callbacks=[early_stopping, lr_scheduler], epochs=epochs, batch_size=batch_size, verbose=1)
    classifier.trainable = False
    classifiers.append(classifier)

    print('min loss ({}): {:.4}'.format(i, min(hist.history['loss'])))
    print('min val_loss ({}): {:.4}'.format(i, min(hist.history['val_loss'])))
    print('max accuracy ({}): {:.4}'.format(i, max(hist.history['accuracy'])))
    print('max val_accuracy ({}): {:.4}'.format(i, max(hist.history['val_accuracy'])))


Model: "model_0"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 60)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 60, 300)           5843700   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 60, 300)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 60, 256)           440320    
_________________________________________________________________
dropout_1 (Dropout)          (None, 60, 256)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 56, 128)           163968    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 54, 128)           4928

# Inference

In [12]:
y_pred_list = []

for i in range(classifier_num):
    y_pred = classifiers[i].predict(np_x_test, batch_size=1024, verbose=1)
    y_pred_list.append(y_pred)



In [13]:
test_num = np_x_test.shape[0]
y_pred_class = np.ndarray(shape=(test_num,), dtype=np.int32)

for i in range(test_num):
    votes = []

    for j in range(classifier_num):
        vote = y_pred_list[j][i].argmax(axis=0).astype(int)
        votes.append(vote)

    vote_final = max(set(votes), key=votes.count)
    y_pred_class[i] = vote_final

mapping = {phrase: sentiment for _, _, phrase, sentiment in train.values}

# Overlapping
for i, phrase in enumerate(test.Phrase.values):
    if phrase in mapping:
        y_pred_class[i] = mapping[phrase]

test['Sentiment'] = y_pred_class
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,156061,8545,An intermittently pleasing but mostly routine ...,3
1,156062,8545,An intermittently pleasing but mostly routine ...,3
2,156063,8545,An,2
3,156064,8545,intermittently pleasing but mostly routine effort,3
4,156065,8545,intermittently pleasing but mostly routine,3
