In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# Import libraries for word tokenizing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Vocabulary cap: passed as `num_words` when the Tokenizer is constructed.
MAX_NB_WORDS = 20000

Reading the data

In [None]:
# Read only the CSV columns at positions 1 and 2; the first of them becomes
# the model input X (used as strings below), the second the target y.
train_data = pd.read_csv('../input/train.csv', usecols=[1, 2])
X, y = train_data.iloc[:, 0], train_data.iloc[:, 1]

# Padding length: 70th percentile of the per-row string length, truncated to int.
# NOTE(review): this counts characters, while pad_sequences(maxlen=SEQ_LEN)
# counts tokens -- confirm that a character-based length is intended.
text_lengths = X.str.len()
SEQ_LEN = int(text_lengths.quantile(0.7))

Tokenizing and padding the data

In [None]:
# Fit the tokenizer on the raw training texts and keep its word -> index map.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(X)
word_index = tokenizer.word_index

# Encode every text as a list of word indices, then pad/truncate to SEQ_LEN.
# NOTE: X is rebound from raw texts to the padded index matrix here, so this
# cell cannot be re-run without first re-running the data-loading cell.
sequences = tokenizer.texts_to_sequences(X)
X = pad_sequences(sequences, maxlen=SEQ_LEN)

X[0]

Splitting the data into training and validation sets

In [None]:
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG so later random draws are repeatable.
np.random.seed(42)

# Hold out 20% of the samples for validation; random_state pins the shuffle.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

Loading the pre-trained GloVe word vectors and building the embedding matrix

In [None]:
# Parse the GloVe text file into a {token: 300-d float32 vector} lookup.
embeddings_index = {}
glove_path = os.path.join("../input/embeddings/glove.840B.300d", 'glove.840B.300d.txt')
# `with` guarantees the file is closed even if parsing raises; the encoding is
# explicit so the result does not depend on the platform's default.
with open(glove_path, encoding='utf8') as f:
    for line in f:
        # Split from the right on single spaces: the last 300 fields are the
        # vector and everything before them is the token, kept verbatim.
        # (Some tokens contain spaces themselves; concatenating their pieces
        # would turn e.g. ". . ." into "..." and clobber the real entry.)
        values = line.rstrip('\r\n').rsplit(' ', 300)
        if len(values) != 301:
            # Blank or malformed line: no token + 300 components to read.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# One 300-d row per tokenizer index (plus one extra row, since the matrix has
# len(word_index) + 1 rows). Rows that are never assigned stay all-zero, which
# is how words missing from the embedding index are represented.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    if word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]

Creating an F1 metric function, intended for evaluating the F1 score after each epoch

In [None]:
from sklearn.metrics import f1_score


def my_metric(y_true, y_pred):
    """Return the F1 score of ``y_pred`` against ``y_true``.

    Thin wrapper that forwards both arguments unchanged to ``f1_score``.
    """
    score = f1_score(y_true, y_pred)
    return score

Now creating the LSTM

In [None]:
from keras.layers import Embedding
from keras.layers import Input, Dense, Dropout, Embedding, CuDNNLSTM, Flatten, Bidirectional, Dropout, GlobalMaxPool1D
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense
from keras.models import Model
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.optimizers import Adam

def create_model(params_list, num_of_sampling, X, y, num_of_epoches=20):
    """Build, compile and fit the bidirectional-LSTM binary classifier.

    The network is: frozen pre-trained embedding -> two bidirectional
    CuDNNLSTM layers -> global max pooling -> two ReLU Dense layers (with
    dropout after the first) -> a single sigmoid output unit.

    Reads the module-level names ``SEQ_LEN``, ``word_index``,
    ``embedding_matrix``, ``X_valid`` and ``y_valid``.

    Args:
        params_list: Sequence whose first four entries are, in order, the
            unit counts of the first LSTM, the second LSTM, the first Dense
            and the second Dense layer. Any further entries are ignored.
        num_of_sampling: Number of leading samples to take from ``X``/``y``
            and from the validation data. ``None`` or a negative value means
            "use everything".
        X: Padded input sequences to train on.
        y: Targets aligned with ``X``.
        num_of_epoches: Maximum number of epochs; early stopping on
            ``val_loss`` may end training sooner.

    Returns:
        The fitted Keras ``Model``.
    """
    # A negative count used to be applied literally (``X[:-1]``), silently
    # dropping the last sample; map it to "no limit" instead.
    if num_of_sampling is None or num_of_sampling < 0:
        sample_limit = None
    else:
        sample_limit = num_of_sampling

    inputs = Input(shape=(SEQ_LEN, ))

    # Embedding initialised from the pre-computed embedding_matrix and frozen.
    # To learn embeddings from random initial weights instead, drop the
    # `weights=` and `trainable=False` arguments.
    embedding_layer = Embedding(len(word_index) + 1,
                                300,
                                weights=[embedding_matrix],
                                input_length=SEQ_LEN,
                                trainable=False)(inputs)

    x = Bidirectional(CuDNNLSTM(params_list[0], return_sequences=True))(embedding_layer)
    x = Bidirectional(CuDNNLSTM(params_list[1], return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dense(params_list[2], activation='relu')(x)
    x = Dropout(0.2)(x)
    x = Dense(params_list[3], activation='relu')(x)
    predictions = Dense(1, activation='sigmoid')(x)

    adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    model = Model(inputs=[inputs], outputs=predictions)
    # Pass the configured optimizer instance: the string 'adam' would build a
    # default Adam and ignore the learning rate set above.
    model.compile(optimizer=adam,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.summary()

    # Stop as soon as the validation loss fails to improve for one epoch.
    earlystopping = EarlyStopping(monitor='val_loss',
                                  min_delta=0,
                                  patience=1,
                                  verbose=0, mode='auto')
    model.fit(X[:sample_limit], y[:sample_limit],
              validation_data=(X_valid[:sample_limit], y_valid[:sample_limit]),
              batch_size=200, verbose=1,
              shuffle=True, epochs=num_of_epoches, callbacks=[earlystopping])
    return model

Computes the best F1 score for a model, together with the threshold that achieves it

In [None]:

def compute_f1(model):
    """Find the decision threshold that maximises F1 on the validation data.

    Predicts on the module-level ``X_valid`` and scores against ``y_valid``
    using ``f1_score`` for 15 evenly spaced thresholds in [0.1, 0.55].

    Args:
        model: Object whose ``predict(X_valid)`` returns one probability per
            validation sample (the sigmoid output of the network).

    Returns:
        ``(f1_best, best_tresh)``: the highest F1 found and the first
        threshold reaching it. If no threshold scores above 0, the result is
        ``(0, <first threshold>)``.
    """
    probabilities = model.predict(X_valid)
    thresholds = np.linspace(0.1, 0.55, 15)
    f1_best = 0
    best_tresh = thresholds[0]  # always bound, even if every F1 is 0
    for treshold in thresholds:
        # Binarise a fresh copy each time. Overwriting the probabilities in
        # place would leave only 0/1 values after the first pass, so every
        # later threshold would reproduce the first one's labels.
        labels = (probabilities >= treshold).astype(int).ravel()
        f1_temp = f1_score(y_valid, labels)
        if f1_temp > f1_best:
            f1_best = f1_temp
            best_tresh = treshold
    return f1_best, best_tresh

Creating a random grid search to find the best parameters (optional; enable this cell to use it)


def random_grid_search(params_list_of_lists, num_of_searches,
                       num_of_sampling=100000, num_of_epoches=20):
    """Randomly sample hyper-parameter combinations and keep the best one.

    Each trial draws one value from every list in ``params_list_of_lists``,
    trains a model with ``create_model`` on the module-level ``X_train`` /
    ``y_train`` and scores it with ``compute_f1``. A combination that was
    already tried is redrawn and does not count as a trial.

    Args:
        params_list_of_lists: One list of candidate values per entry of the
            ``params_list`` expected by ``create_model``.
        num_of_searches: Number of distinct combinations to evaluate; capped
            at the number of distinct combinations that exist.
        num_of_sampling: Sample count forwarded to ``create_model``.
        num_of_epoches: Epoch limit forwarded to ``create_model``.

    Returns:
        ``(best_params, best_f1, best_tresh)`` for the best trial, or
        ``(None, 0, None)`` when no trial was run.
    """
    # Never ask for more distinct combinations than exist, otherwise the
    # redraw-on-duplicate loop below could not terminate.
    search_space_size = 1
    for param_list in params_list_of_lists:
        search_space_size *= len(set(param_list))
    num_of_searches = min(num_of_searches, search_space_size)

    counter = 0  # number of distinct combinations evaluated so far
    parameters_list = []  # combinations already tried
    best_params, best_f1, best_tresh = None, 0, None
    while counter < num_of_searches:
        final_params = [np.random.choice(param_list, 1)[0] for param_list in params_list_of_lists]
        if final_params in parameters_list:  # already drawn this combination, try again
            continue
        parameters_list.append(final_params)
        print(f"iter num #{counter}")
        model = create_model(final_params, X=X_train, y=y_train,
                             num_of_sampling=num_of_sampling,
                             num_of_epoches=num_of_epoches)
        f1_temp, tresh = compute_f1(model)
        # The first trial is always recorded so the results are defined even
        # when every model scores an F1 of 0.
        if best_params is None or f1_temp > best_f1:
            best_f1 = f1_temp
            best_tresh = tresh
            best_params = final_params
        counter += 1
    return best_params, best_f1, best_tresh

Training the model 

In [None]:
# Fit on the training split only: X / y still contain the validation rows, so
# training on them would leak into the val_loss used for early stopping and
# into the threshold search on X_valid.
# num_of_sampling=len(X_train) selects every training row (and, because the
# validation split is smaller, every validation row); -1 would drop the last one.
# params_list holds the units of the two LSTM layers and the two Dense layers.
model = create_model(params_list=[64, 64, 30, 200],
                     num_of_sampling=len(X_train),
                     X=X_train, y=y_train,
                     num_of_epoches=20)


Preparing the test data for prediction

In [None]:
test_data = pd.read_csv('../input/test.csv')

# Encode the test questions with the tokenizer fitted earlier, then pad them
# to the same length as the training sequences.
X_test = test_data['question_text'].values
sequences = tokenizer.texts_to_sequences(X_test)
word_index = tokenizer.word_index
X_test = pad_sequences(sequences, maxlen=SEQ_LEN)

Finding the best threshold

In [None]:
#f1_best, best_tresh = compute_f1(model)

Now creating the predictions in order to submit the results

In [None]:
# Turn the predicted probabilities into hard labels: below 0.3 -> 0, otherwise 1.
y_pred = np.where(model.predict(X_test) < 0.3, 0, 1).astype(int)

# The submission keeps every test column except the question text, plus the label.
final_submission = test_data.drop(columns=['question_text'])
final_submission['prediction'] = y_pred
final_submission.to_csv("submission.csv", index=False)