In [13]:
import numpy as np
import pandas as pd
from keras import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, Conv1D, MaxPooling1D, CuDNNLSTM, Bidirectional
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import multi_gpu_model
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split


In [2]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")


In [3]:
maxlen = 72

train_df["question_text"] = train_df["question_text"].str.lower()
test_df["question_text"] = test_df["question_text"].str.lower()

X = train_df["question_text"].fillna("_NA").values
X_test = test_df["question_text"].fillna("_NA").values

tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(X))
X = tokenizer.texts_to_sequences(X)
X_test = tokenizer.texts_to_sequences(X_test)

X = pad_sequences(X, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)


In [4]:
Y = train_df['target'].values
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)


## Improvement 1

We can improve the simple Keras architecture by using pre-trained embedding weights from GloVe

In [5]:
voc_size, emb_dim = 50000, 100

embeddings_index = dict()
f = open('glove.840B.300d/glove.840B.300d.txt')
    
for line in f:
    values = line.split()
    word = ''.join(values[:-emb_dim])
    coefs = np.asarray(values[-emb_dim:], dtype='float32')
    embeddings_index[word] = coefs

f.close()

embedding_matrix = np.zeros((voc_size, emb_dim))
for word, index in tokenizer.word_index.items():
    if index > voc_size - 1:
        break
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector


# Improvement 2

We add Dropout, Conv1D, Maxpooling1D, CuDNNLSTM to reduce train time (but some of them decreased accuracy)

Batch size was increased too.

In [40]:
model = Sequential()
model.add(Embedding(voc_size, emb_dim, input_length=maxlen, weights=[embedding_matrix]))
# model.add(Dropout(0.2))
# model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
# model.add(LSTM(10))
model.add(Bidirectional(CuDNNLSTM(32)))
model.add(Dense(1, activation='sigmoid'))
model = multi_gpu_model(model, gpus=2)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, epochs=3, batch_size=256)


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
embedding_18_input (InputLayer) (None, 72)           0                                            
__________________________________________________________________________________________________
lambda_35 (Lambda)              (None, 72)           0           embedding_18_input[0][0]         
__________________________________________________________________________________________________
lambda_36 (Lambda)              (None, 72)           0           embedding_18_input[0][0]         
__________________________________________________________________________________________________
sequential_18 (Sequential)      (None, 1)            5034369     lambda_35[0][0]                  
                                                                 lambda_36[0][0]                  
__________

<keras.callbacks.History at 0x7fb7dc709668>

Time per epoch:

- Simple LSTM: 17 minutes
- Optimized LSTM: 45 seconds (22x faster)

In [41]:
scores = model.evaluate(X_val, y_val, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))


Accuracy: 95.40%


In [42]:
preds = model.predict(X_val)


In [43]:
y_pred = np.where(preds > 0.5, 1, 0)
confusion_matrix(y_val, y_pred)


array([[239076,   5987],
       [  6022,  10140]])

In [44]:
f1_score(y_val, y_pred)

0.6280776735111028

We can in fact reach a better f1-score with less train time with these two improvements, we can also make more in-deep preprocessing by adding TFIDF features, process english words contractions, mispellings and exaggerated word lenghts ("damnnn" instead of "damn"), using the mean of GloVe, Google News and paragram embeddings or try another architecture (GRU).