In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['test.csv', 'train.csv', 'embeddings', 'sample_submission.csv']


In [2]:
## some config values 
embed_size = 300 # how big is each word vector
max_features = 95000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 70 # max number of words in a question to use

**Load packages and data**

In [3]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.optimizers import Adam
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.


In [4]:
def load_and_prec():
    train_df = pd.read_csv("../input/train.csv")
    test_df = pd.read_csv("../input/test.csv")
    print("Train shape : ",train_df.shape)
    print("Test shape : ",test_df.shape)
    
    ## split to train and val
    train_df, val_df = train_test_split(train_df, test_size=0.08, random_state=2018)


    ## fill up the missing values
    train_X = train_df["question_text"].fillna("_##_").values
    val_X = val_df["question_text"].fillna("_##_").values
    test_X = test_df["question_text"].fillna("_##_").values

    ## Tokenize the sentences
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(train_X))
    train_X = tokenizer.texts_to_sequences(train_X)
    val_X = tokenizer.texts_to_sequences(val_X)
    test_X = tokenizer.texts_to_sequences(test_X)

    ## Pad the sentences 
    train_X = pad_sequences(train_X, maxlen=maxlen)
    val_X = pad_sequences(val_X, maxlen=maxlen)
    test_X = pad_sequences(test_X, maxlen=maxlen)

    ## Get the target values
    train_y = train_df['target'].values
    val_y = val_df['target'].values  
    
    #shuffling the data
    np.random.seed(2018)
    trn_idx = np.random.permutation(len(train_X))
    val_idx = np.random.permutation(len(val_X))

    train_X = train_X[trn_idx]
    val_X = val_X[val_idx]
    train_y = train_y[trn_idx]
    val_y = val_y[val_idx]    
    
    return train_X, val_X, test_X, train_y, val_y, tokenizer.word_index

**Load embeddings**

In [5]:
def load_glove(word_index):
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
            
    return embedding_matrix 
    
def load_fasttext(word_index):    
    EMBEDDING_FILE = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE) if len(o)>100)

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector

    return embedding_matrix

def load_para(word_index):
    EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE, encoding="utf8", errors='ignore') if len(o)>100)

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    
    return embedding_matrix

**CNN Model**

In [6]:
# https://www.kaggle.com/yekenot/2dcnn-textclassifier
def model_cnn(embedding_matrix):
    filter_sizes = [1,2,3,5]
    num_filters = 36

    inp = Input(shape=(maxlen,))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = Reshape((maxlen, embed_size, 1))(x)

    maxpool_pool = []
    for i in range(len(filter_sizes)):
        conv = Conv2D(num_filters, kernel_size=(filter_sizes[i], embed_size),
                                     kernel_initializer='he_normal', activation='elu')(x)
        maxpool_pool.append(MaxPool2D(pool_size=(maxlen - filter_sizes[i] + 1, 1))(conv))

    z = Concatenate(axis=1)(maxpool_pool)   
    z = Flatten()(z)
    z = Dropout(0.1)(z)

    outp = Dense(1, activation="sigmoid")(z)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return model

**Attention layer**

In [7]:
#Defining a class for attention layer to get realtion between many words

class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        if mask is not None:
            a *= K.cast(mask, K.floatx())

        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

**LSTM models**

In [9]:
def model_lstm_atten(embedding_matrix):
    inp = Input(shape=(maxlen,))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(inp)
    x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
    x = Attention(maxlen)(x)
    x = Dense(64, activation="relu")(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return model

In [None]:
"""def model_gru_srk_atten(embedding_matrix):
    inp = Input(shape=(maxlen,))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    x = Attention(maxlen)(x) # New
    x = Dense(16, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return model    
   """

In [None]:
"""def model_lstm_du(embedding_matrix):
    inp = Input(shape=(maxlen,))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    conc = Dense(64, activation="relu")(conc)
    conc = Dropout(0.1)(conc)
    outp = Dense(1, activation="sigmoid")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model"""

**Train and predict**

In [10]:
# https://www.kaggle.com/strideradu/word2vec-and-gensim-go-go-go
def train_pred(model, epochs=2):
    for e in range(epochs):
        model.fit(train_X, train_y, batch_size=512, epochs=1, validation_data=(val_X, val_y))
        pred_val_y = model.predict([val_X], batch_size=1024, verbose=0)

        best_thresh = 0.5
        best_score = 0.0
        for thresh in np.arange(0.1, 0.501, 0.01):
            thresh = np.round(thresh, 2)
            score = metrics.f1_score(val_y, (pred_val_y > thresh).astype(int))
            if score > best_score:
                best_thresh = thresh
                best_score = score

        print("Val F1 Score: {:.4f}".format(best_score))

    pred_test_y = model.predict([test_X], batch_size=1024, verbose=0)
    return pred_val_y, pred_test_y, best_score

**Main part: load, train, pred and blend**

In [11]:
train_X, val_X, test_X, train_y, val_y, word_index = load_and_prec()
embedding_matrix_1 = load_glove(word_index)
#embedding_matrix_2 = load_fasttext(word_index)
embedding_matrix_3 = load_para(word_index)

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


In [12]:
## Simple average: http://aclweb.org/anthology/N18-2031

# We have presented an argument for averaging as
# a valid meta-embedding technique, and found experimental
# performance to be close to, or in some cases 
# better than that of concatenation, with the
# additional benefit of reduced dimensionality  


## Unweighted DME in https://arxiv.org/pdf/1804.07983.pdf

# “The downside of concatenating embeddings and 
#  giving that as input to an RNN encoder, however,
#  is that the network then quickly becomes inefficient
#  as we combine more and more embeddings.”
  
# embedding_matrix = np.mean([embedding_matrix_1, embedding_matrix_2, embedding_matrix_3], axis = 0)
embedding_matrix = np.mean([embedding_matrix_1, embedding_matrix_3], axis = 0)
np.shape(embedding_matrix)

(95000, 300)

In [14]:
"""outputs = []
pred_val_y, pred_test_y, best_score = train_pred(model_gru_srk_atten(embedding_matrix), epochs = 2)
outputs.append([pred_val_y, pred_test_y, best_score, 'gru atten srk'])"""

"outputs = []\npred_val_y, pred_test_y, best_score = train_pred(model_gru_srk_atten(embedding_matrix), epochs = 2)\noutputs.append([pred_val_y, pred_test_y, best_score, 'gru atten srk'])"

In [15]:
pred_val_y, pred_test_y, best_score = train_pred(model_cnn(embedding_matrix), epochs = 2)
outputs.append([pred_val_y, pred_test_y, best_score, '2d CNN'])

Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6643
Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6695


In [16]:
pred_val_y, pred_test_y, best_score = train_pred(model_cnn(embedding_matrix_1), epochs = 2) # GloVe only
outputs.append([pred_val_y, pred_test_y, best_score, '2d CNN GloVe'])

Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6678
Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6680


In [None]:
"""pred_val_y, pred_test_y, best_score = train_pred(model_lstm_du(embedding_matrix), epochs = 2)
outputs.append([pred_val_y, pred_test_y, best_score, 'LSTM DU'])"""

In [17]:
pred_val_y, pred_test_y, best_score = train_pred(model_lstm_atten(embedding_matrix), epochs = 3)
outputs.append([pred_val_y, pred_test_y, best_score, '2 LSTM w/ attention'])

Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6690
Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6783
Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6832


In [18]:
pred_val_y, pred_test_y, best_score = train_pred(model_lstm_atten(embedding_matrix_1), epochs = 3) # Only GloVe
outputs.append([pred_val_y, pred_test_y, best_score, '2 LSTM w/ attention GloVe'])

Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6631
Train on 1201632 samples, validate on 104490 samples
Epoch 1/1

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3265, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-18-8434cb51f813>", line 1, in <module>
    pred_val_y, pred_test_y, best_score = train_pred(model_lstm_atten(embedding_matrix_1), epochs = 3) # Only GloVe
  File "<ipython-input-10-a9353a76097f>", line 4, in train_pred
    model.fit(train_X, train_y, batch_size=512, epochs=1, validation_data=(val_X, val_y))
  File "/opt/conda/lib/python3.6/site-packages/Keras-2.2.4-py3.6.egg/keras/engine/training.py", line 1039, in fit
    validation_steps=validation_steps)
  File "/opt/conda/lib/python3.6/site-packages/Keras-2.2.4-py3.6.egg/keras/engine/training_arrays.py", line 200, in fit_loop
    outs = fit_function(ins_batch)
  File "/opt/conda/lib/python3.6/site-packages/Keras-2.2.4-py3.6.egg/keras/backend/tensorflow_backend.py", line 2715, in __call__
    return self._call(inputs)
 

KeyboardInterrupt: 

In [19]:
pred_val_y, pred_test_y, best_score = train_pred(model_lstm_atten(embedding_matrix_3), epochs = 3) # Only Para
outputs.append([pred_val_y, pred_test_y, best_score, '2 LSTM w/ attention Para'])

Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6604
Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6729
Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6778


In [20]:
outputs.sort(key=lambda x: x[2]) # Sort the output by val f1 score


In [21]:
outputs

[[array([[4.5948844e-05],
         [4.1317258e-02],
         [8.4310435e-02],
         ...,
         [1.0650881e-03],
         [9.6307969e-01],
         [1.0727404e-04]], dtype=float32), array([[8.8262618e-01],
         [1.6214610e-04],
         [1.6754982e-04],
         ...,
         [6.1862316e-04],
         [9.3250179e-05],
         [1.8865798e-01]], dtype=float32), 0.6679542005793903, '2d CNN GloVe'],
 [array([[6.4782755e-05],
         [1.8799713e-02],
         [2.8227042e-02],
         ...,
         [6.8146613e-04],
         [9.4687712e-01],
         [3.4153380e-04]], dtype=float32), array([[8.7733281e-01],
         [1.2056006e-04],
         [2.2014033e-04],
         ...,
         [8.3534885e-04],
         [7.5849915e-05],
         [1.5771902e-01]], dtype=float32), 0.6695408493209745, '2d CNN'],
 [array([[1.8342795e-04],
         [1.8595459e-02],
         [2.7180471e-02],
         ...,
         [2.1833259e-04],
         [8.7674659e-01],
         [1.1036347e-03]], dtype=float32), a

In [22]:
weights = [i for i in range(1, len(outputs) + 1)]
weights = [float(i) / sum(weights) for i in weights] 
#print(weights)

[0.1, 0.2, 0.3, 0.4]


In [23]:
for output in outputs:
    print(output[2], output[3])

0.6679542005793903 2d CNN GloVe
0.6695408493209745 2d CNN
0.6778132371305675 2 LSTM w/ attention Para
0.6831733938905575 2 LSTM w/ attention


In [24]:
# pred_val_y = np.sum([outputs[i][0] * weights[i] for i in range(len(outputs))], axis = 0)
pred_val_y = np.mean([outputs[i][0] for i in range(len(outputs))], axis = 0) # to avoid overfitting, just take average

thresholds = []
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    res = metrics.f1_score(val_y, (pred_val_y > thresh).astype(int))
    thresholds.append([thresh, res])
    print("F1 score at threshold {0} is {1}".format(thresh, res))
    
thresholds.sort(key=lambda x: x[1], reverse=True)
best_thresh = thresholds[0][0]
print("Best threshold: ", best_thresh)

F1 score at threshold 0.1 is 0.6103997909589758
F1 score at threshold 0.11 is 0.622338999410156
F1 score at threshold 0.12 is 0.630447892111178
F1 score at threshold 0.13 is 0.637969588550984
F1 score at threshold 0.14 is 0.6456603344329428
F1 score at threshold 0.15 is 0.6523754345307069
F1 score at threshold 0.16 is 0.6579785352046231
F1 score at threshold 0.17 is 0.6635093819315389
F1 score at threshold 0.18 is 0.6699212790626715
F1 score at threshold 0.19 is 0.6738000989609103
F1 score at threshold 0.2 is 0.6772334293948127
F1 score at threshold 0.21 is 0.6801321138211381
F1 score at threshold 0.22 is 0.6834629474901985
F1 score at threshold 0.23 is 0.6862082412582868
F1 score at threshold 0.24 is 0.6882963352160778
F1 score at threshold 0.25 is 0.690094997674882
F1 score at threshold 0.26 is 0.6904234615126502
F1 score at threshold 0.27 is 0.6909510242843576
F1 score at threshold 0.28 is 0.6933955787450227
F1 score at threshold 0.29 is 0.6948259120543765
F1 score at threshold 0.3 

In [25]:
# pred_test_y = np.sum([outputs[i][1] * weights[i] for i in range(len(outputs))], axis = 0)
pred_test_y = np.mean([outputs[i][1] for i in range(len(outputs))], axis = 0)

pred_test_y = (pred_test_y > best_thresh).astype(int)
test_df = pd.read_csv("../input/test.csv", usecols=["qid"])
out_df = pd.DataFrame({"qid":test_df["qid"].values})
out_df['prediction'] = pred_test_y
#out_df.to_csv("submission.csv", index=False)