**Notebook Objective:**

The objective of this notebook is to look at the different pretrained embeddings provided with the dataset and to see how useful they are in the model-building process.

First let us import the necessary modules and read the input data.

In [2]:
# Request TensorFlow 2.x through a notebook line magic.
# NOTE(review): this magic appears to be specific to the hosted runtime the notebook was run on - confirm before running elsewhere.
%tensorflow_version 2.x

In [3]:
#!pip install --upgrade keras

In [4]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.engine import InputSpec, Layer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,  Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D,GlobalMaxPooling1D,GlobalAveragePooling1D
from keras.models import Model
from keras.optimizers import Adam
from keras import initializers, regularizers, constraints, optimizers, layers
from tensorflow.python.keras.layers import CuDNNGRU,CuDNNLSTM

In [5]:
from keras.layers import SpatialDropout1D,Attention,concatenate,BatchNormalization,Reshape,Lambda
from keras import backend as K

In [6]:
# Notebook-wide configuration values read by the preprocessing, embedding
# loading and model-building cells.

# Size of each word vector.
embed_size = 300
# Number of unique words to keep (i.e. number of rows in the embedding matrix).
max_features = 50000
# Maximum number of words of a question that are used.
maxlen = 100

In [6]:
#Different Models - Model 1 ,2 and 3

In [7]:
def load_and_preprocess():
  """Read the training CSV and convert the questions to padded id sequences.

  The frame is split 90/10 into a training and a validation part with a
  fixed random_state, missing questions are replaced by "_na_", a Tokenizer
  limited to `max_features` words is fitted on the training questions only,
  and both parts are converted to integer sequences padded to `maxlen`.

  Returns:
    Tuple (train_X, val_X, train_y, val_y, word_index) where the X values
    are the padded sequences, the y values come from the 'target' column and
    word_index is the fitted tokenizer's word -> index mapping.
  """
  full_df = pd.read_csv("/content/sample_data/train.csv")
  print("Train shape : ",full_df.shape)

  # Hold out 10% of the rows for validation.
  train_part, val_part = train_test_split(full_df, test_size=0.1, random_state=2018)

  # Replace missing questions by a placeholder token.
  train_text = train_part["question_text"].fillna("_na_").values
  val_text = val_part["question_text"].fillna("_na_").values

  # The vocabulary is learned from the training questions only.
  tokenizer = Tokenizer(num_words=max_features)
  tokenizer.fit_on_texts(list(train_text))

  # words -> integer ids -> fixed-length sequences
  train_X = pad_sequences(tokenizer.texts_to_sequences(train_text), maxlen=maxlen)
  val_X = pad_sequences(tokenizer.texts_to_sequences(val_text), maxlen=maxlen)

  train_y = train_part['target'].values
  val_y = val_part['target'].values

  return train_X, val_X, train_y, val_y, tokenizer.word_index

In [8]:
def LSTM_GRU(embedding_matrix,spatialdropout=0.2, rnn_units=64, weight_decay=0.07):
  """Build and compile a BiLSTM -> BiGRU binary classifier on frozen embeddings.

  Architecture: frozen Embedding -> SpatialDropout1D -> Bidirectional
  CuDNNLSTM -> Bidirectional CuDNNGRU; the outputs of both recurrent layers
  are concatenated, global-max-pooled over time and fed to a single sigmoid
  unit. The Keras session is cleared before the graph is built.

  Args:
    embedding_matrix: 2-D array (vocabulary rows x vector size) used as the
      fixed weights of the Embedding layer.
    spatialdropout: rate of the SpatialDropout1D layer.
    rnn_units: number of units of each recurrent layer (per direction).
    weight_decay: accepted for interface compatibility; not used by the model.

  Returns:
    The compiled model (binary cross-entropy loss, 'adam' optimizer,
    accuracy metric).
  """
  K.clear_session()

  # Size the Embedding layer from the matrix that is actually loaded into it,
  # so a matrix with fewer rows than `max_features` does not cause a weight
  # shape mismatch.
  vocab_size, emb_dim = np.shape(embedding_matrix)

  x_input = Input(shape=(maxlen,))

  emb = Embedding(vocab_size, emb_dim, weights=[embedding_matrix], trainable=False, name='Embedding')(x_input)

  x = SpatialDropout1D(spatialdropout, seed=1024)(emb)

  rnn1 = Bidirectional(CuDNNLSTM(rnn_units, return_sequences=True, kernel_initializer=initializers.glorot_uniform(seed=111100), recurrent_initializer=initializers.Orthogonal(gain=1.0, seed=123000)))(x)

  # The GRU is stacked on top of the LSTM output.
  rnn2 = Bidirectional(CuDNNGRU(rnn_units, return_sequences=True, kernel_initializer=initializers.glorot_uniform(seed=111100), recurrent_initializer=initializers.Orthogonal(gain=1.0, seed=123000)))(rnn1)

  # Both recurrent outputs contribute to the pooled representation.
  x = concatenate([rnn1, rnn2])
  x = GlobalMaxPooling1D()(x)
  x_output = Dense(1, activation='sigmoid', kernel_initializer=initializers.glorot_uniform(seed=111100))(x)

  model = Model(inputs=x_input, outputs=x_output)
  model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
  return model

The preprocessing steps (implemented in `load_and_preprocess` above) are as follows:
 * Split the training dataset into train and validation samples. Cross-validation is a time-consuming process, so let us do a simple train/validation split.
 * Fill up the missing values in the text column with '_na_'
 * Tokenize the text column and convert the texts to integer sequences
 * Pad the sequences as needed - if the number of words in the text is greater than `maxlen`, truncate it to `maxlen`; if it is smaller than `maxlen`, fill the remaining positions with zeros.

In [9]:
# Download the GloVe 840B 300-d archive (reported as 2.0G in the wget log below).
!wget 'https://nlp.stanford.edu/data/glove.840B.300d.zip'

--2020-11-23 18:17:29--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip [following]
--2020-11-23 18:17:30--  http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176768927 (2.0G) [application/zip]
Saving to: ‘glove.840B.300d.zip’


2020-11-23 18:34:25 (2.04 MB/s) - ‘glove.840B.300d.zip’ saved [2176768927/2176768927]



In [10]:
# Extract the archive into the working directory (yields glove.840B.300d.txt).
!unzip 'glove.840B.300d.zip'

Archive:  glove.840B.300d.zip
  inflating: glove.840B.300d.txt     


In [11]:
# Remove the archive now that its contents have been extracted.
!rm 'glove.840B.300d.zip'

In [10]:
def load_glove(word_index, embedding_file='glove.840B.300d.txt', max_words=None):
  """Build an embedding matrix for `word_index` from a GloVe text file.

  Every line of the file is split on single spaces: the first field is the
  word, the remaining fields are the components of its vector.

  Args:
    word_index: dict mapping word -> integer index; row `i` of the result
      holds the vector of the word whose index is `i`.
    embedding_file: path of the GloVe text file.
    max_words: upper bound on the number of rows of the matrix; defaults to
      the notebook-level `max_features`.

  Returns:
    Array of shape (min(max_words, len(word_index) + 1), vector_size). Rows
    of words without a pretrained vector keep a random draw from a normal
    distribution with the mean/std of all pretrained values.

  Raises:
    ValueError: if no vector could be read from `embedding_file`.
  """
  if max_words is None:
    max_words = max_features

  def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')

  # Context manager so the (large) file is closed deterministically; explicit
  # encoding so parsing does not depend on the platform default.
  with open(embedding_file, encoding='utf-8') as emb_file:
    embeddings_index = dict(get_coefs(*line.rstrip().split(" ")) for line in emb_file)

  if not embeddings_index:
    raise ValueError("no embedding vectors found in {0}".format(embedding_file))

  # np.stack needs a real sequence; a dict view is rejected by newer NumPy.
  all_embs = np.stack(list(embeddings_index.values()))
  emb_mean,emb_std = all_embs.mean(), all_embs.std()
  embed_size = all_embs.shape[1]

  # The "+ 1" leaves room for index len(word_index), which a 1-based
  # word_index (index 0 unused) contains; without it a vocabulary smaller
  # than max_words would index past the end of the matrix.
  nb_words = min(max_words, len(word_index) + 1)
  embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
  for word, i in word_index.items():
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector
  return embedding_matrix

We have four different types of pretrained embeddings; this notebook uses the GloVe and the wiki-news FastText vectors.
 * GoogleNews-vectors-negative300 - https://code.google.com/archive/p/word2vec/
 * glove.840B.300d - https://nlp.stanford.edu/projects/glove/
 * paragram_300_sl999 - https://cogcomp.org/page/resource_view/106
 * wiki-news-300d-1M - https://fasttext.cc/docs/en/english-vectors.html
 

**Glove Embeddings:**

In this section, let us use the Glove embeddings and rebuild the various models.

In [13]:
# Build the padded train/validation sequences and the tokenizer vocabulary,
# then look up a GloVe vector for every vocabulary word.
(train_X, val_X, train_y, val_y, word_index) = load_and_preprocess()
emb_matrix1 = load_glove(word_index)

Train shape :  (1317095, 3)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [14]:
# Train the BiLSTM -> BiGRU model on the GloVe matrix for two epochs,
# monitoring the held-out validation split.
model = LSTM_GRU(emb_matrix1)
model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f8f673e1080>

In [15]:
# Validation probabilities of the GloVe model, then the F1 score for every
# decision threshold from 0.10 to 0.50 in steps of 0.01.
pred_glove_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for raw_thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(raw_thresh, 2)
    val_labels = (pred_glove_val_y>thresh).astype(int)
    score = metrics.f1_score(val_y, val_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, score))

F1 score at threshold 0.1 is 0.5825047300833299
F1 score at threshold 0.11 is 0.5925589461948219
F1 score at threshold 0.12 is 0.6020723197293296
F1 score at threshold 0.13 is 0.6093446654299151
F1 score at threshold 0.14 is 0.6174184197812815
F1 score at threshold 0.15 is 0.6237415544319657
F1 score at threshold 0.16 is 0.6283012864221101
F1 score at threshold 0.17 is 0.633581090439038
F1 score at threshold 0.18 is 0.6382918817456593
F1 score at threshold 0.19 is 0.6416424522459867
F1 score at threshold 0.2 is 0.6449712713051036
F1 score at threshold 0.21 is 0.6477895148669797
F1 score at threshold 0.22 is 0.6503043198574892
F1 score at threshold 0.23 is 0.6519298245614035
F1 score at threshold 0.24 is 0.6538344337227019
F1 score at threshold 0.25 is 0.6557090759669217
F1 score at threshold 0.26 is 0.6586328693122793
F1 score at threshold 0.27 is 0.6602553996531609
F1 score at threshold 0.28 is 0.6617772108843537
F1 score at threshold 0.29 is 0.6627632144391921
F1 score at threshold 0

Results seem to be better than the model without pretrained embeddings.

In [16]:
# Download the wiki-news 300-d FastText archive (reported as 650M in the wget log below).
!wget 'https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip'

--2020-11-23 18:47:50--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.74.142, 104.22.75.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 681808098 (650M) [application/zip]
Saving to: ‘wiki-news-300d-1M.vec.zip’


2020-11-23 18:49:13 (8.03 MB/s) - ‘wiki-news-300d-1M.vec.zip’ saved [681808098/681808098]



In [17]:
# Extract the archive into the working directory (yields wiki-news-300d-1M.vec).
!unzip 'wiki-news-300d-1M.vec.zip'

Archive:  wiki-news-300d-1M.vec.zip
  inflating: wiki-news-300d-1M.vec   


In [18]:
# Remove the archive now that its contents have been extracted.
!rm 'wiki-news-300d-1M.vec.zip'

**Wiki News FastText Embeddings:**

Now let us use the FastText embeddings trained on Wiki News corpus in place of Glove embeddings and rebuild the model.

In [11]:
def load_fasttext(word_index, embedding_file='wiki-news-300d-1M.vec', max_words=None):
  """Build an embedding matrix for `word_index` from a FastText .vec file.

  Every line longer than 100 characters is split on single spaces: the first
  field is the word, the remaining fields are the components of its vector.
  Shorter lines are skipped.

  Args:
    word_index: dict mapping word -> integer index; row `i` of the result
      holds the vector of the word whose index is `i`.
    embedding_file: path of the .vec text file.
    max_words: upper bound on the number of rows of the matrix; defaults to
      the notebook-level `max_features`.

  Returns:
    Array of shape (min(max_words, len(word_index) + 1), vector_size). Rows
    of words without a pretrained vector keep a random draw from a normal
    distribution with the mean/std of all pretrained values.

  Raises:
    ValueError: if no vector could be read from `embedding_file`.
  """
  if max_words is None:
    max_words = max_features

  def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')

  # Context manager so the file is closed deterministically; explicit
  # encoding so parsing does not depend on the platform default.
  with open(embedding_file, encoding='utf-8') as emb_file:
    # The length filter drops short lines.
    # NOTE(review): presumably this is meant to skip the "<count> <dim>"
    # header line of the .vec format - confirm.
    embeddings_index = dict(
        get_coefs(*line.rstrip().split(" "))
        for line in emb_file if len(line)>100)

  if not embeddings_index:
    raise ValueError("no embedding vectors found in {0}".format(embedding_file))

  # np.stack needs a real sequence; a dict view is rejected by newer NumPy.
  all_embs = np.stack(list(embeddings_index.values()))
  emb_mean,emb_std = all_embs.mean(), all_embs.std()
  embed_size = all_embs.shape[1]

  # The "+ 1" leaves room for index len(word_index), which a 1-based
  # word_index (index 0 unused) contains; without it a vocabulary smaller
  # than max_words would index past the end of the matrix.
  nb_words = min(max_words, len(word_index) + 1)
  embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
  for word, i in word_index.items():
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector
  return embedding_matrix

In [20]:
# Rebuild the padded train/validation sequences and the vocabulary (the split
# uses a fixed random_state), then look up a FastText vector for every word.
(train_X, val_X, train_y, val_y, word_index) = load_and_preprocess()
emb_matrix2 = load_fasttext(word_index)

Train shape :  (1317095, 3)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [21]:
# Train the BiLSTM -> BiGRU model on the FastText matrix for two epochs,
# monitoring the held-out validation split.
model = LSTM_GRU(emb_matrix2)
model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f8e4d2a2278>

In [22]:
# Validation probabilities of the FastText model, then the F1 score for every
# decision threshold from 0.10 to 0.50 in steps of 0.01.
pred_fasttext_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for raw_thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(raw_thresh, 2)
    val_labels = (pred_fasttext_val_y>thresh).astype(int)
    score = metrics.f1_score(val_y, val_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, score))

F1 score at threshold 0.1 is 0.5857629371210235
F1 score at threshold 0.11 is 0.5957192292903831
F1 score at threshold 0.12 is 0.6051592524348514
F1 score at threshold 0.13 is 0.6134649910233393
F1 score at threshold 0.14 is 0.6199816681943172
F1 score at threshold 0.15 is 0.624959087296021
F1 score at threshold 0.16 is 0.6309915581628273
F1 score at threshold 0.17 is 0.6348189144577143
F1 score at threshold 0.18 is 0.638430057689463
F1 score at threshold 0.19 is 0.6424552943540285
F1 score at threshold 0.2 is 0.6452730236348818
F1 score at threshold 0.21 is 0.6478829550741871
F1 score at threshold 0.22 is 0.6506719865602688
F1 score at threshold 0.23 is 0.6516506922257721
F1 score at threshold 0.24 is 0.6533340520726646
F1 score at threshold 0.25 is 0.655598033861278
F1 score at threshold 0.26 is 0.6551705079312441
F1 score at threshold 0.27 is 0.6562464985994397
F1 score at threshold 0.28 is 0.6566859476012249
F1 score at threshold 0.29 is 0.6571428571428571
F1 score at threshold 0.3


 * The performance of the different pretrained embeddings is almost the same.
 
**Final Blend:**

Though the results of the models with different pre-trained embeddings are similar, there is a good chance that they capture different types of information from the data. So let us blend these two models by taking a weighted average of their predictions (0.67 for GloVe and 0.33 for FastText).

In [23]:
# Weighted blend of the two models' validation probabilities (GloVe 0.67,
# FastText 0.33), scored with F1 over thresholds 0.10 .. 0.50.
pred_val_y = 0.67 * pred_glove_val_y + 0.33 * pred_fasttext_val_y
for raw_thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(raw_thresh, 2)
    val_labels = (pred_val_y>thresh).astype(int)
    score = metrics.f1_score(val_y, val_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, score))

F1 score at threshold 0.1 is 0.5857536913840662
F1 score at threshold 0.11 is 0.5966211717709721
F1 score at threshold 0.12 is 0.6064323748668797
F1 score at threshold 0.13 is 0.6146277753591641
F1 score at threshold 0.14 is 0.6209527188858335
F1 score at threshold 0.15 is 0.6283777677361049
F1 score at threshold 0.16 is 0.6326774549466715
F1 score at threshold 0.17 is 0.6375023368853991
F1 score at threshold 0.18 is 0.641462835431014
F1 score at threshold 0.19 is 0.6469509841759937
F1 score at threshold 0.2 is 0.6489871807417557
F1 score at threshold 0.21 is 0.65228602598433
F1 score at threshold 0.22 is 0.6554325955734406
F1 score at threshold 0.23 is 0.6576208936951642
F1 score at threshold 0.24 is 0.6600475747233427
F1 score at threshold 0.25 is 0.6627529091099696
F1 score at threshold 0.26 is 0.663236074270557
F1 score at threshold 0.27 is 0.6644144144144144
F1 score at threshold 0.28 is 0.664930272939389
F1 score at threshold 0.29 is 0.6669229079727651
F1 score at threshold 0.3 i

The result seems to be better than the individual pre-trained models, so let us
create a submission file using this model blend. Next, let us try the two embeddings with a different model architecture.

In [8]:
def BiLSTM_CNN(embedding_matrix,spatialdropout=0.2, rnn_units=128, filters=[100, 80, 30, 12], weight_decay=0.10):
  """Build and compile a BiLSTM + parallel Conv1D binary classifier.

  Architecture: frozen Embedding -> SpatialDropout1D -> Bidirectional
  CuDNNLSTM -> one Conv1D branch per entry of `filters`, each followed by
  global max pooling -> concatenation -> Dense(200, relu) -> Dropout ->
  BatchNormalization -> single sigmoid unit. The Keras session is cleared
  before the graph is built.

  Args:
    embedding_matrix: 2-D array (vocabulary rows x vector size) used as the
      fixed weights of the Embedding layer.
    spatialdropout: rate of the SpatialDropout1D layer.
    rnn_units: number of LSTM units per direction.
    filters: number of filters of each Conv1D branch; one branch is created
      per entry. The sequence is only read, never modified.
    weight_decay: accepted for interface compatibility; not used by the model.

  Returns:
    The compiled model (binary cross-entropy loss, 'adam' optimizer,
    accuracy metric).

  Raises:
    ValueError: if `filters` is empty.
  """
  if len(filters) == 0:
    raise ValueError("filters must contain at least one entry")

  K.clear_session()

  # Size the Embedding layer from the matrix that is actually loaded into it,
  # so a matrix with fewer rows than `max_features` does not cause a weight
  # shape mismatch.
  vocab_size, emb_dim = np.shape(embedding_matrix)

  x_input = Input(shape=(maxlen,))

  emb = Embedding(vocab_size, emb_dim, weights=[embedding_matrix], trainable=False, name='Embedding')(x_input)

  x = SpatialDropout1D(rate=spatialdropout, seed=10000)(emb)

  rnn = Bidirectional(CuDNNLSTM(rnn_units, return_sequences=True, kernel_initializer=initializers.glorot_uniform(seed=123000), recurrent_initializer=initializers.Orthogonal(gain=1.0, seed=123000)))(x)

  # One convolution branch per filter count. Initializer seeds follow the
  # original sequence 110000, 120000, 130000, 140000, ...
  # NOTE(review): every branch uses kernel_size=1, so the branches differ only
  # in filter count and seed - confirm this is intended.
  conv_branches = [
      Conv1D(filters=n_filters, activation='relu', kernel_size=1, padding='same',
             kernel_initializer=initializers.glorot_uniform(seed=110000 + 10000 * branch_no))(rnn)
      for branch_no, n_filters in enumerate(filters)
  ]
  pooled = [GlobalMaxPooling1D()(branch) for branch in conv_branches]

  # A single branch is used directly instead of being passed to concatenate.
  c = concatenate(pooled) if len(pooled) > 1 else pooled[0]
  x = Dense(200, activation='relu', kernel_initializer=initializers.glorot_uniform(seed=111000))(c)
  x = Dropout(0.2, seed=10000)(x)
  x = BatchNormalization()(x)
  x_output = Dense(1, activation='sigmoid', kernel_initializer=initializers.glorot_uniform(seed=110000))(x)

  model = Model(inputs=x_input, outputs=x_output)
  model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
  return model

In [12]:
# Prepare the padded sequences and the vocabulary, load the GloVe vectors
# for it and build the BiLSTM + CNN model on top of them.
(train_X, val_X, train_y, val_y, word_index) = load_and_preprocess()
emb_matrix3 = load_glove(word_index)
model = BiLSTM_CNN(emb_matrix3)

Train shape :  (1317095, 3)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [13]:
# Two training epochs, monitoring the held-out validation split.
model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f1b147e4f98>

In [14]:
# Validation probabilities of the current model (GloVe embeddings), then the
# F1 score for every decision threshold from 0.10 to 0.50 in steps of 0.01.
pred_glove_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for raw_thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(raw_thresh, 2)
    val_labels = (pred_glove_val_y>thresh).astype(int)
    score = metrics.f1_score(val_y, val_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, score))

F1 score at threshold 0.1 is 0.562726890595743
F1 score at threshold 0.11 is 0.5728383458646616
F1 score at threshold 0.12 is 0.5814586502281643
F1 score at threshold 0.13 is 0.5904403339335407
F1 score at threshold 0.14 is 0.5984895898527142
F1 score at threshold 0.15 is 0.6049534814563066
F1 score at threshold 0.16 is 0.611545945945946
F1 score at threshold 0.17 is 0.6182746029651138
F1 score at threshold 0.18 is 0.624151482672383
F1 score at threshold 0.19 is 0.629167611703334
F1 score at threshold 0.2 is 0.6344155246941966
F1 score at threshold 0.21 is 0.6390146036485793
F1 score at threshold 0.22 is 0.6428132532967813
F1 score at threshold 0.23 is 0.6471123020240204
F1 score at threshold 0.24 is 0.6497142303593917
F1 score at threshold 0.25 is 0.6524697704019189
F1 score at threshold 0.26 is 0.6549093431090854
F1 score at threshold 0.27 is 0.6571928946710033
F1 score at threshold 0.28 is 0.6592082616179001
F1 score at threshold 0.29 is 0.6621282694374776
F1 score at threshold 0.3 

In [15]:
# Prepare the padded sequences and the vocabulary, load the FastText vectors
# for it and build the BiLSTM + CNN model on top of them.
(train_X, val_X, train_y, val_y, word_index) = load_and_preprocess()
emb_matrix4 = load_fasttext(word_index)
model = BiLSTM_CNN(emb_matrix4)

Train shape :  (1317095, 3)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [16]:
# Two training epochs, monitoring the held-out validation split.
model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f18fb30ea20>

In [17]:
# Validation probabilities of the current model (FastText embeddings), then
# the F1 score for every decision threshold from 0.10 to 0.50 in steps of 0.01.
pred_fasttext_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for raw_thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(raw_thresh, 2)
    val_labels = (pred_fasttext_val_y>thresh).astype(int)
    score = metrics.f1_score(val_y, val_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, score))

F1 score at threshold 0.1 is 0.6119603110753553
F1 score at threshold 0.11 is 0.620252583237658
F1 score at threshold 0.12 is 0.6271514122695336
F1 score at threshold 0.13 is 0.6334636359249072
F1 score at threshold 0.14 is 0.6378010935421901
F1 score at threshold 0.15 is 0.6433792025807752
F1 score at threshold 0.16 is 0.6482673394778847
F1 score at threshold 0.17 is 0.6506137865911238
F1 score at threshold 0.18 is 0.6525967086984399
F1 score at threshold 0.19 is 0.6531788763800511
F1 score at threshold 0.2 is 0.6536628615759725
F1 score at threshold 0.21 is 0.6546336510801513
F1 score at threshold 0.22 is 0.6551013397457918
F1 score at threshold 0.23 is 0.6544186046511629
F1 score at threshold 0.24 is 0.6533648170011805
F1 score at threshold 0.25 is 0.6527286886718985
F1 score at threshold 0.26 is 0.6524908869987849
F1 score at threshold 0.27 is 0.6512716300264796
F1 score at threshold 0.28 is 0.6483180428134557
F1 score at threshold 0.29 is 0.6455240381578116
F1 score at threshold 0