In [2]:
import pandas as pd
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM

from sklearn.model_selection import train_test_split

In [3]:
# Hyper-parameters shared by the preprocessing and model cells below.

# Vocabulary cap: passed to the Tokenizer and used as the Embedding layer's input_dim.
max_features = 200000
# Fixed sequence length: passed to pad_sequences and used as the Embedding input_length.
maxlen = 80
# Size of each embedding vector (the Embedding layer's output_dim).
embedding_dim = 200

In [None]:
# Load the training set. Later cells read column 1 as the input text and
# columns 2 onward as the targets (see the train/validation split).
data = pd.read_csv(
    filepath_or_buffer='data/train.csv',
)

In [None]:
# Hold out 5% of the rows for validation. Column 1 is the input text and
# every column from index 2 onward is a target; the fixed random_state keeps
# the split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    data.iloc[:, 1],
    data.iloc[:, 2:],
    test_size=0.05,
    random_state=7,
)

# The vocabulary is fitted on the training texts only, so the validation
# texts do not influence the word index.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)
print('{} unique words'.format(len(tokenizer.word_index)))

# Convert each text to a list of word indices, then pad/truncate to maxlen.
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=maxlen)
X_val = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=maxlen)

In [None]:
# Width of the output layer, taken from the label frame produced by the
# train/validation split rather than hard-coded, so the model always matches
# the number of target columns selected there.
n_labels = y_train.shape[1]

# Embedding -> two stacked LSTMs -> dropout -> one sigmoid unit per label.
model = Sequential()
model.add(Embedding(input_length=maxlen, input_dim=max_features, output_dim=embedding_dim))
# The first LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(n_labels))
# Independent sigmoid per output, paired with binary cross-entropy below.
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train for 10 epochs in mini-batches of 256, evaluating on the held-out
# validation split after each epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=256,
    epochs=10,
    validation_data=(X_val, y_val),
)