## BiDirectional LSTM baseline

In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,CuDNNLSTM
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

from nltk import sent_tokenize

Using TensorFlow backend.


We include the GloVe word vectors in our input files. To include these in your kernel, simply click 'input files' at the top of the notebook, and search for 'glove' in the 'datasets' section.

In [2]:
# Location of the pretrained GloVe vectors (plain-text file, read line by line further down).
EMBEDDING_FILE = '../input/glove6b50d/glove.6B.50d.txt'

In [3]:
# Load both competition splits into DataFrames.
train = pd.read_csv('../input/he-accenture/train.csv')
test = pd.read_csv('../input/he-accenture/test.csv')

In [4]:
# Preview the first five training rows to see the available columns.
train.head(n=5)

Unnamed: 0,UID,comment,date,parent_comment,score
0,Tr-1,NC and NH.,2016-10,"Yeah, I get that argument. At this point, I'd ...",2
1,Tr-2,You do know west teams play against west teams...,2016-11,The blazers and Mavericks (The wests 5 and 6 s...,-4
2,Tr-3,"They were underdogs earlier today, but since G...",2016-09,They're favored to win.,3
3,Tr-4,"This meme isn't funny none of the ""new york ni...",2016-10,deadass don't kill my buzz,-8
4,Tr-5,I could use one of those tools.,2016-12,Yep can confirm I saw the tool they use for th...,6


Set some basic config parameters:

In [5]:
# Vocabulary cap: passed to the Tokenizer as `num_words` and used to size the embedding table.
max_features = 20000
# Every token sequence is padded/truncated to this many tokens by `pad_sequences`.
maxlen = 100
# Width of each word vector; has to agree with the vectors stored in EMBEDDING_FILE.
embed_size = 50

Read in our data and replace missing values:

In [6]:
# Regression target: the `score` column of the training frame.
y = train['score'].values
# Model input: the `parent_comment` text of each split, with missing values replaced by the
# "_na_" placeholder so that the tokenizer only ever sees strings.
list_sentences_train, list_sentences_test = (
    frame["parent_comment"].fillna("_na_").values for frame in (train, test)
)

In [7]:
# Fit the vocabulary on the training text only, keeping at most `max_features` words.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

# Map each comment to a list of integer word indices (train first, then test).
list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(texts)
    for texts in (list_sentences_train, list_sentences_test)
)

# Pad/truncate every index list to exactly `maxlen` entries.
X_t, X_te = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (list_tokenized_train, list_tokenized_test)
)

In [8]:
def get_coefs(word, *arr):
    """Turn the fields of one embedding line into a ``(word, vector)`` pair.

    word: the first field; returned unchanged.
    *arr: the remaining fields, converted together into a float32 NumPy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the GloVe text file into {word: float32 vector}. Each line is split on whitespace and
# handed to get_coefs (first field = word, remaining fields = vector components).
# The file is opened in a `with` block so the handle is closed deterministically, and the
# encoding is pinned to UTF-8 so parsing does not depend on the machine's locale default.
with open(EMBEDDING_FILE, encoding='utf8') as embedding_file:
    embeddings_index = dict(get_coefs(*line.strip().split()) for line in embedding_file)

In [9]:
# Stack every pretrained vector into one 2-D array to measure the overall mean and standard
# deviation; these parameterise the random initialisation of the embedding matrix below.
# np.stack needs a real sequence: passing the dict's values view directly is deprecated since
# NumPy 1.16 (it emitted a FutureWarning) and is rejected by later releases, hence list(...).
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
emb_mean, emb_std

  """Entry point for launching an IPython kernel.


(0.020940498, 0.6441043)

In [10]:
# Build the embedding weight matrix: one row per token index the tokenizer can emit.
word_index = tokenizer.word_index

# Keras Tokenizer indices start at 1 (index 0 is reserved), so the largest index in use is
# len(word_index). When the vocabulary is smaller than max_features the matrix therefore needs
# len(word_index) + 1 rows; sizing it with len(word_index) alone would make the last word's
# row assignment fall outside the matrix.
nb_words = min(max_features, len(word_index) + 1)

# Start from random vectors drawn with the same mean/std as the pretrained embeddings, so that
# words without a pretrained vector keep a plausibly scaled initial value.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

for word, i in word_index.items():
    # Indices at or beyond the row count are outside the capped vocabulary.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    # Only overwrite the random row when a pretrained vector exists for this word.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [11]:
# Show row 0 of the embedding matrix. Presumably this is the padding row (Tokenizer indices
# appear to start at 1 -- confirm), in which case it still holds its random initial values.
embedding_matrix[0, :]

array([-0.47107019, -1.10401832,  1.29062063,  0.89376371,  0.40853162,
        0.27875739, -0.69818692,  0.62945259,  0.87593289,  0.57920566,
        0.17053255, -0.34860669,  0.29394367,  0.96782405, -1.47327159,
        0.37755123, -0.99562403,  0.81564513,  0.90965464,  0.33451224,
       -0.24781101, -0.7812317 , -0.34947127,  0.02051574, -0.65041278,
       -0.17706355, -0.21324179,  0.34074742,  0.8865554 , -0.83005841,
        0.40199546,  0.16029716, -0.35504682,  0.47784263, -1.21908015,
        0.47664462,  0.47666782, -0.08967921, -0.34191007, -0.40728989,
        0.19196549, -0.55590391,  0.2833674 ,  0.43772412, -0.06909767,
        0.06685626,  0.48040836, -0.20854702,  0.67613445,  0.02588322])

In [12]:
def root_mean_squared_error(y_true, y_pred):
    """Root-mean-squared error between targets and predictions, as a backend tensor.

    y_true: tensor of ground-truth values.
    y_pred: tensor of predicted values, broadcast-compatible with ``y_true``.

    Returns a scalar tensor: the square root of the mean squared difference taken over
    every element of the batch.

    The mean is deliberately taken over all axes. Reducing over ``axis=-1`` only would, for a
    model with a single output unit, average over an axis of length one and collapse each
    sample's value to ``|y_pred - y_true|`` -- i.e. an absolute error rather than an RMSE.
    """
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

In [13]:
# Bidirectional-LSTM regressor on top of pretrained word embeddings.
inp = Input(shape=(maxlen,))

# Size the embedding table from the weight matrix itself, so the declared vocabulary size can
# never disagree with the shape of the pretrained weights being loaded.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNLSTM(50, return_sequences=True))(x)
# Collapse the per-timestep LSTM outputs into a single fixed-size vector per comment.
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)

# `score` is a signed, unbounded integer (the training preview shows values such as -8 and 6).
# A sigmoid output is confined to (0, 1) and could never reach those targets, so the
# regression head uses a linear activation instead.
x = Dense(1, activation="linear")(x)

model = Model(inputs=inp, outputs=x)
# Accuracy is meaningless for a continuous target; report the notebook's
# root_mean_squared_error helper alongside the MSE loss instead.
model.compile(loss='mean_squared_error', optimizer='adam', metrics=[root_mean_squared_error])

In [14]:
# Train for 3 epochs in mini-batches of 32, with 10% of the training data held out for
# validation. The trailing semicolon suppresses the returned History object's repr.
model.fit(
    x=X_t,
    y=y,
    batch_size=32,
    epochs=3,
    validation_split=0.1,
);

Train on 40500 samples, validate on 4500 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


And finally, get predictions for the test set and prepare a submission CSV:

In [15]:
# Score the padded test sequences with the trained model.
prediction = model.predict(x=[X_te])

In [16]:
# prediction[:50]

In [17]:
# Assemble the submission: one row per test UID with its predicted score.
submission = pd.DataFrame({'UID': test['UID']})
submission['score'] = prediction
# Write without the DataFrame index so the file holds only the UID and score columns.
submission.to_csv('submission.csv', index=False)