In [31]:
import os
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the labelled training set and the unlabelled test set from ./data.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/train.csv", "./data/test.csv")
)
# Report (rows, columns) for each frame as a quick sanity check.
print("Train shape : ", train_df.shape)
print("Test shape : ", test_df.shape)

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


In [3]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None".
test_df.info()
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375806 entries, 0 to 375805
Data columns (total 2 columns):
qid              375806 non-null object
question_text    375806 non-null object
dtypes: object(2)
memory usage: 5.7+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1306122 entries, 0 to 1306121
Data columns (total 3 columns):
qid              1306122 non-null object
question_text    1306122 non-null object
target           1306122 non-null int64
dtypes: int64(1), object(2)
memory usage: 29.9+ MB
None


In [4]:
# Hold out 10% of the labelled rows as a validation set (seeded split).
# NOTE: train_df is rebound to the remaining 90%, so re-running this cell on
# its own would split the already-reduced frame again.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.1,
    random_state=0,
)

In [5]:
embed_size = 300 # how big is each word vector
max_features = 50000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a question to use

## fill up the missing values
# A missing question would reach the tokenizer as a float NaN instead of a
# string, so replace it with a placeholder token before extracting the values.
train_X = train_df["question_text"].fillna("_na_").values
val_X = val_df["question_text"].fillna("_na_").values
test_X = test_df["question_text"].fillna("_na_").values

## Tokenize the sentences
# The vocabulary is fitted on the training split only and then reused to
# encode the validation and test questions.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

## Pad the sentences 
# Every sequence is brought to exactly `maxlen` token ids.
train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

## Get the target values
train_y = train_df['target'].values
val_y = val_df['target'].values

In [6]:
# Baseline network: the embedding layer is trained from scratch (no
# pretrained vectors are passed in).
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size)(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
# Single sigmoid unit: the model outputs one score in [0, 1] per question.
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so it is not
# wrapped in print() (which would emit an extra "None" line).
model.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 1

In [7]:
# Train the baseline model for two epochs, evaluating on the held-out split
# after each epoch.
model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

Train on 1175509 samples, validate on 130613 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f63fbcaae80>

In [8]:
# Score the validation split with the baseline model, then print the F1 score
# obtained at each decision threshold from 0.10 to 0.50 in steps of 0.01.
pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    hard_labels = (pred_noemb_val_y > thresh).astype(int)
    f1_at_thresh = metrics.f1_score(val_y, hard_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, f1_at_thresh))



F1 score at threshold 0.1 is 0.5784884957257687
F1 score at threshold 0.11 is 0.5863810518982933
F1 score at threshold 0.12 is 0.5949192507896961
F1 score at threshold 0.13 is 0.6006983810258039
F1 score at threshold 0.14 is 0.6072934525740235
F1 score at threshold 0.15 is 0.611993028404541
F1 score at threshold 0.16 is 0.6165132336018412
F1 score at threshold 0.17 is 0.6212504869497467
F1 score at threshold 0.18 is 0.6255807057428091
F1 score at threshold 0.19 is 0.6304914744232699
F1 score at threshold 0.2 is 0.6342382722966433
F1 score at threshold 0.21 is 0.6366310574392321
F1 score at threshold 0.22 is 0.6385158087274627
F1 score at threshold 0.23 is 0.6405422579961872
F1 score at threshold 0.24 is 0.6418639609169485
F1 score at threshold 0.25 is 0.6437605167453727
F1 score at threshold 0.26 is 0.6444517760914991
F1 score at threshold 0.27 is 0.6446666666666666
F1 score at threshold 0.28 is 0.645878981965279
F1 score at threshold 0.29 is 0.6474257201179405
F1 score at threshold 0.

In [9]:
# Score the test questions with the baseline model (closing parenthesis
# restored; without it the cell is a SyntaxError).
pred_noemb_test_y = model.predict([test_X], batch_size=1024, verbose=1)



In [10]:
# Drop the references to the baseline model and its graph tensors, run a
# garbage-collection pass, then wait 10 seconds before the next cell
# (presumably to let memory be released -- confirm).
del model, inp, x
import gc
gc.collect()
time.sleep(10)

In [11]:
# List the data directory to see which embedding folders are available.
!ls ./data

glove.840B.300d			sample_submission.csv  wiki-news-300d-1M
GoogleNews-vectors-negative300	test.csv
paragram_300_sl999		train.csv


In [13]:
EMBEDDING_FILE = './data/glove.840B.300d/glove.840B.300d.txt'

def get_coefs(word, *arr):
    """Turn the fields of one embedding line into ``(word, float32 vector)``."""
    return word, np.asarray(arr, dtype='float32')

# Parse the file inside a context manager so the handle is closed once the
# dictionary has been built; the encoding is stated explicitly instead of
# relying on the platform default.
with open(EMBEDDING_FILE, encoding="utf8") as embedding_fh:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_fh)

# np.stack expects a sequence of arrays, so materialise the dict view first.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Tokenizer word indices start at 1 (0 is reserved), so a vocabulary smaller
# than max_features needs len(word_index) + 1 rows to hold its last word.
nb_words = min(max_features, len(word_index) + 1)
# Rows for words without a pretrained vector keep a random draw that matches
# the mean/std of the pretrained vectors.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

# Same architecture as the baseline, but the embedding layer is initialised
# from the pretrained matrix; its row count must match the matrix.
inp = Input(shape=(maxlen,))
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so no print() wrapper.
model.summary()



  """


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 128)          140544    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17  

In [14]:
# Two training epochs for the GloVe-initialised model, with the held-out
# split evaluated after each epoch.
model.fit(
    x=train_X,
    y=train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

Train on 1175509 samples, validate on 130613 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f636842f278>

In [15]:
# Validation scores of the GloVe model, followed by the F1 score at each
# candidate decision threshold (0.10 .. 0.50, step 0.01).
pred_glove_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for thresh in (np.round(raw, 2) for raw in np.arange(0.1, 0.501, 0.01)):
    glove_labels = (pred_glove_val_y > thresh).astype(int)
    print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, glove_labels)))



F1 score at threshold 0.1 is 0.5982450103234687
F1 score at threshold 0.11 is 0.6058021978021978
F1 score at threshold 0.12 is 0.6122869955156951
F1 score at threshold 0.13 is 0.6190606725146198
F1 score at threshold 0.14 is 0.6244133636912784
F1 score at threshold 0.15 is 0.6288654926585147
F1 score at threshold 0.16 is 0.633236040001914
F1 score at threshold 0.17 is 0.6362710384634039
F1 score at threshold 0.18 is 0.6395914158031724
F1 score at threshold 0.19 is 0.64464454504751
F1 score at threshold 0.2 is 0.6480931029270997
F1 score at threshold 0.21 is 0.6511556867936056
F1 score at threshold 0.22 is 0.6542219479584491
F1 score at threshold 0.23 is 0.6572691807542262
F1 score at threshold 0.24 is 0.6594145777497503
F1 score at threshold 0.25 is 0.6611807177744744
F1 score at threshold 0.26 is 0.6623077335334155
F1 score at threshold 0.27 is 0.6636756756756756
F1 score at threshold 0.28 is 0.6651376146788991
F1 score at threshold 0.29 is 0.6668504935752495
F1 score at threshold 0.3

In [16]:
# Score the test questions with the GloVe model; kept for the final blend.
pred_glove_test_y = model.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)



In [17]:
# Release the GloVe lookup tables and the model built on them, collect
# garbage, then wait 10 seconds before loading the next embedding set.
del word_index, embeddings_index, all_embs, embedding_matrix
del model, inp, x
import gc
gc.collect()
time.sleep(10)

In [18]:
EMBEDDING_FILE = './data/wiki-news-300d-1M/wiki-news-300d-1M.vec'

def get_coefs(word, *arr):
    """Turn the fields of one embedding line into ``(word, float32 vector)``."""
    return word, np.asarray(arr, dtype='float32')

# Parse the file inside a context manager so the handle is closed afterwards.
# Lines of 100 characters or fewer are skipped (presumably the short header
# line of the .vec format -- confirm).
with open(EMBEDDING_FILE, encoding="utf8") as embedding_fh:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_fh if len(o)>100)

# np.stack expects a sequence of arrays, so materialise the dict view first.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Tokenizer word indices start at 1 (0 is reserved), so a vocabulary smaller
# than max_features needs len(word_index) + 1 rows to hold its last word.
nb_words = min(max_features, len(word_index) + 1)
# Words without a pretrained vector keep a random row drawn with the same
# mean/std as the pretrained vectors.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

# Embedding layer initialised from the pretrained matrix; its row count must
# match the matrix.
inp = Input(shape=(maxlen,))
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  """


In [20]:
# Fit the wiki-news (fastText) initialised model: 2 epochs, batches of 512,
# with validation on the held-out split.
fit_options = dict(batch_size=512, epochs=2, validation_data=(val_X, val_y))
model.fit(train_X, train_y, **fit_options)

Train on 1175509 samples, validate on 130613 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f62d1401390>

In [21]:
# Validation scores of the fastText model and the F1 score they yield at each
# decision threshold between 0.10 and 0.50.
pred_fasttext_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    fasttext_f1 = metrics.f1_score(
        val_y,
        (pred_fasttext_val_y > thresh).astype(int),
    )
    print("F1 score at threshold {0} is {1}".format(thresh, fasttext_f1))

F1 score at threshold 0.1 is 0.5801220575414124
F1 score at threshold 0.11 is 0.5863948592953689
F1 score at threshold 0.12 is 0.5924925735889819
F1 score at threshold 0.13 is 0.5969576538303413
F1 score at threshold 0.14 is 0.6010458605210792
F1 score at threshold 0.15 is 0.6039325842696629
F1 score at threshold 0.16 is 0.6073273614863264
F1 score at threshold 0.17 is 0.6104779235705392
F1 score at threshold 0.18 is 0.6134185303514377
F1 score at threshold 0.19 is 0.6155800996969993
F1 score at threshold 0.2 is 0.6185658589547451
F1 score at threshold 0.21 is 0.6194531600179292
F1 score at threshold 0.22 is 0.6216596343178622
F1 score at threshold 0.23 is 0.6228860759493671
F1 score at threshold 0.24 is 0.6239207070965105
F1 score at threshold 0.25 is 0.62617304320924
F1 score at threshold 0.26 is 0.6269432745801488
F1 score at threshold 0.27 is 0.629808702964053
F1 score at threshold 0.28 is 0.6294706723891274
F1 score at threshold 0.29 is 0.6300608909304563
F1 score at threshold 0.3

In [22]:
# Test-set scores from the fastText model; kept for the final blend.
pred_fasttext_test_y = model.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)



In [23]:
# Free the fastText tables and model before the next embedding set is read:
# delete the names, collect garbage, then wait 10 seconds.
del word_index, embeddings_index
del all_embs, embedding_matrix
del model, inp, x
import gc
gc.collect()
time.sleep(10)

In [24]:
EMBEDDING_FILE = './data/paragram_300_sl999/paragram_300_sl999.txt'

def get_coefs(word, *arr):
    """Turn the fields of one embedding line into ``(word, float32 vector)``."""
    return word, np.asarray(arr, dtype='float32')

# Parse the file inside a context manager so the handle is closed afterwards.
# errors='ignore' drops undecodable bytes instead of raising, and lines of 100
# characters or fewer are skipped.
with open(EMBEDDING_FILE, encoding="utf8", errors='ignore') as embedding_fh:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_fh if len(o)>100)

# np.stack expects a sequence of arrays, so materialise the dict view first.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Tokenizer word indices start at 1 (0 is reserved), so a vocabulary smaller
# than max_features needs len(word_index) + 1 rows to hold its last word.
nb_words = min(max_features, len(word_index) + 1)
# Words without a pretrained vector keep a random row drawn with the same
# mean/std as the pretrained vectors.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

# Embedding layer initialised from the pretrained matrix; its row count must
# match the matrix.
inp = Input(shape=(maxlen,))
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  """


In [25]:
# Train the paragram-initialised model with the same schedule as the others
# (2 epochs, batch size 512, validation on the held-out split).
held_out = (val_X, val_y)
model.fit(train_X, train_y, batch_size=512, epochs=2, validation_data=held_out)

Train on 1175509 samples, validate on 130613 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f62aeab1b70>

In [26]:
# Validation scores of the paragram model, then one F1 line per decision
# threshold in 0.10 .. 0.50 (step 0.01).
pred_paragram_val_y = model.predict([val_X], batch_size=1024, verbose=1)
report_line = "F1 score at threshold {0} is {1}"
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    paragram_labels = (pred_paragram_val_y > thresh).astype(int)
    print(report_line.format(thresh, metrics.f1_score(val_y, paragram_labels)))

F1 score at threshold 0.1 is 0.6101603314718069
F1 score at threshold 0.11 is 0.6189838955285867
F1 score at threshold 0.12 is 0.6257888292361308
F1 score at threshold 0.13 is 0.6310949324647955
F1 score at threshold 0.14 is 0.6368028107158542
F1 score at threshold 0.15 is 0.6426689205986179
F1 score at threshold 0.16 is 0.646829613885937
F1 score at threshold 0.17 is 0.6516218578111345
F1 score at threshold 0.18 is 0.6550268410903215
F1 score at threshold 0.19 is 0.6576457529467731
F1 score at threshold 0.2 is 0.6604481612522782
F1 score at threshold 0.21 is 0.662868923611111
F1 score at threshold 0.22 is 0.6647998682259924
F1 score at threshold 0.23 is 0.6664443703456707
F1 score at threshold 0.24 is 0.6685033172157877
F1 score at threshold 0.25 is 0.6708795900939367
F1 score at threshold 0.26 is 0.6713045478065889
F1 score at threshold 0.27 is 0.6720873504472065
F1 score at threshold 0.28 is 0.6722955145118734
F1 score at threshold 0.29 is 0.6719895907262834
F1 score at threshold 0.

In [27]:
# Test-set scores from the paragram model; kept for the final blend.
pred_paragram_test_y = model.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)



In [28]:
# Final cleanup: remove the paragram tables and model from the namespace,
# collect garbage, then wait 10 seconds.
del word_index
del embeddings_index, all_embs, embedding_matrix
del model, inp, x
import gc
gc.collect()
time.sleep(10)

In [29]:
# Blend the three pretrained-embedding models with near-equal weights and
# print the F1 score of the blended validation scores at each threshold.
pred_val_y = (
    0.33 * pred_glove_val_y
    + 0.33 * pred_fasttext_val_y
    + 0.34 * pred_paragram_val_y
)
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    blended_labels = (pred_val_y > thresh).astype(int)
    blended_f1 = metrics.f1_score(val_y, blended_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, blended_f1))

F1 score at threshold 0.1 is 0.5955171528787749
F1 score at threshold 0.11 is 0.6028532786527638
F1 score at threshold 0.12 is 0.6095119455443342
F1 score at threshold 0.13 is 0.6152523195293053
F1 score at threshold 0.14 is 0.6210985980234429
F1 score at threshold 0.15 is 0.6263408264154463
F1 score at threshold 0.16 is 0.6321773735076748
F1 score at threshold 0.17 is 0.6362719192889743
F1 score at threshold 0.18 is 0.6400740307812195
F1 score at threshold 0.19 is 0.6434920007900454
F1 score at threshold 0.2 is 0.6476180944755805
F1 score at threshold 0.21 is 0.6507060788581264
F1 score at threshold 0.22 is 0.653393387245368
F1 score at threshold 0.23 is 0.6562435286808863
F1 score at threshold 0.24 is 0.6592635273165365
F1 score at threshold 0.25 is 0.6614806583055513
F1 score at threshold 0.26 is 0.6643117105544851
F1 score at threshold 0.27 is 0.6666666666666667
F1 score at threshold 0.28 is 0.6684164479440069
F1 score at threshold 0.29 is 0.6697242636901144
F1 score at threshold 0

In [30]:
# Apply the same blend weights to the test scores, binarise at 0.35 and write
# the qid/prediction pairs to submission.csv.
pred_test_y = (
    0.33 * pred_glove_test_y
    + 0.33 * pred_fasttext_test_y
    + 0.34 * pred_paragram_test_y
)
pred_test_y = (pred_test_y > 0.35).astype(int)
out_df = pd.DataFrame({"qid": test_df["qid"].values})
out_df["prediction"] = pred_test_y
out_df.to_csv("submission.csv", index=False)