In [53]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# Seed NumPy's global random generator so NumPy-driven randomness repeats across runs.
# NOTE(review): only NumPy is seeded here; whether the Keras backend honours this
# seed is not visible from this notebook -- confirm before relying on exact reproducibility.
np.random.seed(7)
import csv
import operator
import sys
from keras.preprocessing.text import Tokenizer

In [92]:
# Vocabulary cap: passed to the Tokenizer as num_words and to the Embedding layer as input_dim.
vocab_size = 5000

In [78]:
# Word-level tokenizer (char_level=False): lower-cases the text, splits on single
# spaces, drops the listed punctuation/whitespace characters, and caps the
# vocabulary through num_words=vocab_size.
tokenizer = Tokenizer(
    num_words=vocab_size,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
    char_level=False,
)

In [79]:
# Load the unlabeled corpus: one entry per raw line of the file (line endings kept).
train_text = []
with open('data/training_nolabel.txt', 'rt') as trainfile:
    train_text.extend(trainfile)

In [80]:
# Fit the tokenizer on the unlabeled lines held in train_text.
tokenizer.fit_on_texts(train_text) # takes around 30 seconds

In [81]:
# Parse the labeled training file. Every row is read as space-separated fields:
# field 0 is the label, field 1 is skipped, and fields 2+ are the words of the text.
# NOTE(review): field 1 is presumably a fixed separator token -- confirm against the data file.
labeled_texts = []
labels = []
with open('data/training_label.txt', 'rt') as trainfile:
    # QUOTE_NONE: treat '"' as an ordinary character so a quote at the start of a
    # field can never swallow delimiters or following lines into a single field.
    reader = csv.reader(trainfile, delimiter=' ', quoting=csv.QUOTE_NONE)
    for row in reader:
        if not row:
            # Blank line: nothing to parse (row[0] would raise IndexError).
            continue
        # Store the label as a number instead of the raw string read from the file.
        labels.append(int(row[0]))
        labeled_texts.append(' '.join(row[2:]))

# Numeric label vector, aligned index-by-index with the rows of X_train.
y_train = np.array(labels)
# Encode each text as a list of integer word indices using the fitted tokenizer.
X_train = tokenizer.texts_to_sequences(labeled_texts)

In [86]:
# Sanity check: show the integer-encoded form of the first labeled example.
print(np.array(X_train[0]))

[ 39 789 451  80]


In [87]:
# Bring every encoded sequence to one fixed length: longer ones are truncated,
# shorter ones are padded.
max_review_length = 500
# 'pre' padding and 'pre' truncating are the Keras defaults, spelled out for clarity.
X_train = sequence.pad_sequences(
    X_train,
    maxlen=max_review_length,
    padding='pre',
    truncating='pre',
)

In [90]:
# print(X_train[0])

In [94]:
# Build and train the classifier: Embedding -> LSTM -> single sigmoid output unit.
embedding_vector_length = 32
model = Sequential()
# Input is an integer matrix of shape (batch, max_review_length); every index is
# mapped to a vector of length embedding_vector_length.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_vector_length, input_length=max_review_length))
# model.add(Dropout(0.2))

# Optional CNN before the LSTM layer (disabled experiment)
# model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
# model.add(MaxPooling1D(pool_size=2))

# dropout configures the input dropout, recurrent_dropout the recurrent dropout.
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# model.add(Dropout(0.2))

model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appends a stray "None" line to the output.
model.summary()
# Hand the labels over as an explicit float array rather than relying on implicit
# conversion of whatever container/dtype y_train currently has.
# The last 10% of the samples are held out for validation (validation_split=0.1).
history = model.fit(
    X_train,
    np.asarray(y_train, dtype='float32'),
    validation_split=0.1,
    epochs=3,
    batch_size=64,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None
Train on 180000 samples, validate on 20000 samples
Epoch 1/3
   960/180000 [..............................] - ETA: 6666s - loss: 0.6926 - acc: 0.5031 

KeyboardInterrupt: 