In [0]:
!pip install -U -q PyDrive
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
import pandas as pd

f_ = drive.CreateFile({'id': '1UJDXBrLmpfFL1C9mAdxi-klHs48gtHTv'})
f_.GetContentFile('train.csv')
df = pd.read_csv('train.csv')

In [0]:
# Resampling and train-validation split
from sklearn.model_selection import train_test_split


train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

In [0]:
import re
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

from nltk.corpus import stopwords
stopwords = stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [0]:
# Preprocessing
def filter_numerical(sentence):
  return re.sub(r'\b\d+(?:\.\d+)?\s+', '', sentence)

def remove_punctuation(sentence):
    return re.sub(r'[^\w\s]', '', sentence)

def filter_stopwords(sentence):
  return [word for word in sentence if word not in stopwords]

def lemmatize_text(sentence):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in sentence]

def stem_text(sentence):
    ps = PorterStemmer()
    return [ps.stem(word) for word in sentence]

def preprocess_text(df):
    df['processed'] = df['question_text'].apply(filter_numerical)
    df['processed'] = df['processed'].apply(remove_punctuation)
    df['processed'] = df['processed'].apply(word_tokenize)
    df['processed'] = df['processed'].apply(filter_stopwords)
    df['processed'] = df['processed'].apply(lemmatize_text)
    df['processed'] = df['processed'].apply(stem_text)
    return df

train_df = preprocess_text(train_df)
val_df = preprocess_text(val_df)

In [0]:
train_df['question_len'] = train_df.processed.apply(len)
val_df['question_len'] = val_df.processed.apply(len)
# Keep only adequate length
train_df = train_df[(train_df.question_len > 3) & (train_df.question_len < 80)]
val_df = val_df[(val_df.question_len > 3) & (val_df.question_len < 80)]

In [0]:
# Some initial parameters
embed_size = 100
max_features = 50000
maxlen = 80

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

train_X = list(train_df.processed.values)
val_X = list(val_df.processed.values)

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_X)
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)

train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)

Using TensorFlow backend.


In [0]:
train_y = train_df['target'].values
val_y = val_df['target'].values

## Learn embeddings

In [0]:
import numpy as np
from sklearn.metrics import f1_score
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [0]:
# LSTM

inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size)(inp)
x = Bidirectional(layers.LSTM(64))(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)

model_lstm = Model(inputs=inp, outputs=x)

model_lstm.compile(loss='binary_crossentropy',
              optimizer=optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [0]:
model_lstm.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 80)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 80, 100)           5000000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               84480     
_________________________________________________________________
dense_1 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 5,086,561
Trainable params: 5,086,561
Non-trainable params: 0
_________________________________________________

In [0]:
model_lstm.fit(train_X, train_y, batch_size=64, epochs=1, validation_data=(val_X, val_y))

Train on 1183821 samples, validate on 62316 samples
Epoch 1/1


<keras.callbacks.History at 0x7f6f79fa2f60>

In [0]:
prob_lstm = model_lstm.predict([val_X], batch_size=1024, verbose=1)
thresholds = dict()
for thresh in np.arange(0.1, 0.901, 0.01):
    thresh = np.round(thresh, 2)
    f_sc = f1_score(val_y, (prob_lstm > thresh).astype(int))
    thresholds[f_sc] = thresh
    print("F1 score at threshold {0} is {1}".format(thresh, f_sc))

F1 score at threshold 0.1 is 0.5297642206832154
F1 score at threshold 0.11 is 0.5392469687300574
F1 score at threshold 0.12 is 0.5449052906596996
F1 score at threshold 0.13 is 0.5510866849862603
F1 score at threshold 0.14 is 0.5562925170068027
F1 score at threshold 0.15 is 0.5611087050671286
F1 score at threshold 0.16 is 0.5659679408138101
F1 score at threshold 0.17 is 0.572223219092051
F1 score at threshold 0.18 is 0.5753349740224227
F1 score at threshold 0.19 is 0.5797369883311725
F1 score at threshold 0.2 is 0.583607791474546
F1 score at threshold 0.21 is 0.5868446139180171
F1 score at threshold 0.22 is 0.5903567984570878
F1 score at threshold 0.23 is 0.5956406998338384
F1 score at threshold 0.24 is 0.5995047052996534
F1 score at threshold 0.25 is 0.6029323157260494
F1 score at threshold 0.26 is 0.6044950676294111
F1 score at threshold 0.27 is 0.6069774621796851
F1 score at threshold 0.28 is 0.6095138525875589
F1 score at threshold 0.29 is 0.612184249628529
F1 score at threshold 0.3

In [0]:
# Max F1 score
max(thresholds.keys())

0.6196467736349994

In [0]:
# Save the model
model_lstm.save_weights("model_lstm.h5")

from google.colab import files
files.download("model_lstm.h5")

In [0]:
# GRU network

inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size)(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.25)(x)
x = Dense(1, activation="sigmoid")(x)
model_gru = Model(inputs=inp, outputs=x)
model_gru.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model_gru.summary())

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 80)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 80, 100)           5000000   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 80, 128)           63744     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 17  

In [0]:
model_gru.fit(train_X, train_y, batch_size=512, epochs=2, validation_data=(val_X, val_y))

Train on 1183821 samples, validate on 62316 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f6f7a5b90f0>

In [0]:
prob_gru = model_gru.predict([val_X], batch_size=1024, verbose=1)
thresholds = dict()
for thresh in np.arange(0.1, 0.99, 0.01):
    thresh = np.round(thresh, 2)
    f_sc = f1_score(val_y, (prob_gru > thresh).astype(int))
    thresholds[f_sc] = thresh
    print("F1 score at threshold {0} is {1}".format(thresh, f_sc))

F1 score at threshold 0.1 is 0.5691815746262207
F1 score at threshold 0.11 is 0.5765175011076651
F1 score at threshold 0.12 is 0.582035494386092
F1 score at threshold 0.13 is 0.5879095523765575
F1 score at threshold 0.14 is 0.5940203083866115
F1 score at threshold 0.15 is 0.5989867125513814
F1 score at threshold 0.16 is 0.602971162248762
F1 score at threshold 0.17 is 0.6055588409225311
F1 score at threshold 0.18 is 0.6095200079832351
F1 score at threshold 0.19 is 0.6128444084278768
F1 score at threshold 0.2 is 0.61693175987686
F1 score at threshold 0.21 is 0.6206466368645389
F1 score at threshold 0.22 is 0.6229508196721312
F1 score at threshold 0.23 is 0.6238726790450929
F1 score at threshold 0.24 is 0.6250536250536252
F1 score at threshold 0.25 is 0.6273446817738263
F1 score at threshold 0.26 is 0.6298936286873561
F1 score at threshold 0.27 is 0.6303312285366124
F1 score at threshold 0.28 is 0.6325226941611565
F1 score at threshold 0.29 is 0.6332274171584202
F1 score at threshold 0.3 

In [0]:
# Max F1 score
max(thresholds.keys())

0.6393422505828935

In [0]:
# Save the model
model_gru.save_weights("model_gru.h5")

from google.colab import files
files.download("model_gru.h5")

## Use pretrained embeddings



In [0]:
import gensim 
import numpy as np

In [0]:
documents = train_df.processed.to_list()

w2v_model = gensim.models.Word2Vec (documents, size=embed_size, window=12, min_count=5, workers=10)
w2v_model.train(documents,total_examples=len(documents),epochs=10)

(76740014, 89349820)

In [0]:
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in tokenizer.word_index.items():
    if i >= max_features:
        continue
    try:
      embedding_vector = w2v_model.wv[word]
      embedding_matrix[i] = embedding_vector
    except KeyError:
      continue

In [0]:
# Sanity check
np.array_equal(embedding_matrix[4], w2v_model.wv['how'])

True

In [0]:
# GRU network

inp = Input(shape=(maxlen, ))
x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
gru_w2v = Model(inputs=inp, outputs=x)
gru_w2v.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(gru_w2v.summary())







Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 80)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 80, 100)           5000000   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 80, 128)           63744     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)               

In [0]:
gru_w2v.fit(train_X, train_y, batch_size=512, epochs=2, validation_data=(val_X, val_y))



Train on 1121542 samples, validate on 124595 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f967f372ac8>

In [0]:
prob_gru_emb = gru_w2v.predict([val_X], batch_size=1024, verbose=1)
thresholds = dict()
for thresh in np.arange(0.1, 0.99, 0.01):
    thresh = np.round(thresh, 2)
    f_sc = f1_score(val_y, (prob_gru_emb > thresh).astype(int))
    thresholds[f_sc] = thresh
    print("F1 score at threshold {0} is {1}".format(thresh, f_sc))

F1 score at threshold 0.1 is 0.5383955049164977
F1 score at threshold 0.11 is 0.5493369656664397
F1 score at threshold 0.12 is 0.55848313223344
F1 score at threshold 0.13 is 0.5672818791946309
F1 score at threshold 0.14 is 0.574535491052316
F1 score at threshold 0.15 is 0.5809660950386176
F1 score at threshold 0.16 is 0.5886490095704429
F1 score at threshold 0.17 is 0.5944744363289934
F1 score at threshold 0.18 is 0.5989857076994006
F1 score at threshold 0.19 is 0.6038638281909405
F1 score at threshold 0.2 is 0.6101533479378989
F1 score at threshold 0.21 is 0.6142781290727422
F1 score at threshold 0.22 is 0.6182672080622279
F1 score at threshold 0.23 is 0.6215800188688614
F1 score at threshold 0.24 is 0.6246980676328503
F1 score at threshold 0.25 is 0.6281606851549755
F1 score at threshold 0.26 is 0.6310243297690997
F1 score at threshold 0.27 is 0.6335215391049771
F1 score at threshold 0.28 is 0.6362001481011319
F1 score at threshold 0.29 is 0.6389335046578862
F1 score at threshold 0.3

In [0]:
max(thresholds.keys())

0.6469031233456856

In [0]:
# LSTM

inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(inp)

x = Bidirectional(layers.LSTM(64))(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.25)(x)
x = Dense(1, activation="sigmoid")(x)

lstm_w2v = Model(inputs=inp, outputs=x)

lstm_w2v.compile(loss='binary_crossentropy',
              optimizer=optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [0]:
lstm_w2v.fit(train_X, train_y, batch_size=512, epochs=1, validation_data=(val_X, val_y))

Train on 1121542 samples, validate on 124595 samples
Epoch 1/1


<keras.callbacks.History at 0x7f967f2cda58>

In [0]:
prob_lstm_w2v_emb = lstm_w2v.predict([val_X], batch_size=1024, verbose=1)
thresholds = dict()
for thresh in np.arange(0.1, 0.99, 0.01):
    thresh = np.round(thresh, 2)
    f_sc = f1_score(val_y, (prob_lstm_w2v_emb > thresh).astype(int))
    thresholds[f_sc] = thresh
    print("F1 score at threshold {0} is {1}".format(thresh, f_sc))

F1 score at threshold 0.1 is 0.47211032976036044
F1 score at threshold 0.11 is 0.48255691028261327
F1 score at threshold 0.12 is 0.49225523270577143
F1 score at threshold 0.13 is 0.5021086023511849
F1 score at threshold 0.14 is 0.5105991490014183
F1 score at threshold 0.15 is 0.5180121783539581
F1 score at threshold 0.16 is 0.5249164081698424
F1 score at threshold 0.17 is 0.5317519149987645
F1 score at threshold 0.18 is 0.5376063336983071
F1 score at threshold 0.19 is 0.5433988220626801
F1 score at threshold 0.2 is 0.5480508103372755
F1 score at threshold 0.21 is 0.552891270372851
F1 score at threshold 0.22 is 0.556789224608664
F1 score at threshold 0.23 is 0.5606861381548448
F1 score at threshold 0.24 is 0.5659431597862581
F1 score at threshold 0.25 is 0.5698888514651398
F1 score at threshold 0.26 is 0.5736813751897742
F1 score at threshold 0.27 is 0.577547564498456
F1 score at threshold 0.28 is 0.5817923906986169
F1 score at threshold 0.29 is 0.5845313546658423
F1 score at threshold 

In [0]:
# Max F1 score
max(thresholds.keys())

0.5933524825605252

In [0]:
# Bonus content 
from keras.models import Model, Sequential
from keras.layers import Flatten

model_ptw2v = Sequential()
e = Embedding(max_features, embed_size, weights=[embedding_matrix], input_length=maxlen, trainable=False)
model_ptw2v.add(e)
model_ptw2v.add(Flatten())
model_ptw2v.add(Dense(256, activation='relu'))
model_ptw2v.add(Dense(1, activation='sigmoid'))
model_ptw2v.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_ptw2v.fit(train_X, train_y, validation_data=(val_X, val_y), epochs=5, batch_size=32, verbose=2)
