In [1]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input
from keras.layers import Embedding, LSTM
from keras.models import Sequential

import numpy as np
import csv


Using TensorFlow backend.


In [2]:
# Path of the tab-separated training file handed to loadDataSet
# (column 0: integer label, column 1: sentence text).
# NOTE(review): empty in this copy of the notebook - set it before running.
TRAIN_DATA_PATH = ''

# Number of tokens per sentence: used as pad_sequences' maxlen and as the
# Embedding layer's input_length.
SENTENCE_LENGTH = 100
# Width of each word vector (number of columns of the embedding matrix).
EMBEDDING_DIM = 50
# Fraction of the shuffled samples that split() holds out as the test set.
TEST_SPLIT = 0.2

# TensorFlow runs out of memory (OOM) when the entire GloVe embedding is used, so the
# vocabulary is capped to words that occur in the input data; embeddings for words that
# never appear in the input would not be used for training anyway.
# The cap is passed to the Tokenizer (nb_words) and bounds the embedding matrix's row count.
VOCAB_SIZE = 1000

In [3]:
def loadDataSet(path):
    """Load a tab-separated dataset of ``label<TAB>text`` rows.

    Parameters
    ----------
    path : str
        Path of the TSV file. Column 0 of every row must hold an integer
        label and column 1 the raw text; further columns are ignored.

    Returns
    -------
    (data, label) : tuple of two lists
        ``data`` holds the text of each row and ``label`` the matching
        integer label, both in file order and of equal length.

    Raises
    ------
    ValueError
        If a label cannot be parsed by ``int``.
    IndexError
        If a non-empty row has fewer than two columns.
    """
    label = []
    data = []

    # newline='' leaves line-ending handling to the csv module, as the csv
    # documentation recommends for files passed to csv.reader.
    with open(path, 'r', newline='') as tsv_file:
        for row in csv.reader(tsv_file, delimiter='\t'):
            # csv.reader yields [] for blank lines (for instance an empty
            # line at the end of the file); skip them rather than failing
            # on row[0].
            if not row:
                continue
            label.append(int(row[0]))
            data.append(row[1])
    return data, label

In [4]:
def split(ratio, data, labels, seed=None):
    """Shuffle ``data`` and ``labels`` together and split off a test portion.

    Parameters
    ----------
    ratio : float
        Fraction of the samples, between 0 and 1, placed in the test set.
        The test set receives ``int(ratio * n_samples)`` rows.
    data : numpy.ndarray
        Samples, indexed along axis 0.
    labels : numpy.ndarray
        One label per sample; same length as ``data`` along axis 0.
    seed : int or None, optional
        When given, the shuffle uses a private ``RandomState(seed)`` so the
        split is reproducible. With the default ``None`` the global NumPy
        random state is used.

    Returns
    -------
    x_train, y_train, x_test, y_test : numpy.ndarray
        Shuffled training samples/labels followed by test samples/labels.

    Raises
    ------
    ValueError
        If ``ratio`` is outside [0, 1] or ``data`` and ``labels`` differ in
        length.
    """
    n_samples = data.shape[0]
    if labels.shape[0] != n_samples:
        raise ValueError('data and labels must have the same number of rows '
                         '(got %d and %d)' % (n_samples, labels.shape[0]))
    if not 0 <= ratio <= 1:
        raise ValueError('ratio must be between 0 and 1, got %r' % (ratio,))

    indices = np.arange(n_samples)
    if seed is None:
        np.random.shuffle(indices)
    else:
        np.random.RandomState(seed).shuffle(indices)

    # Apply the same permutation to both arrays so rows stay aligned.
    data = data[indices]
    labels = labels[indices]

    nb_test = int(ratio * n_samples)
    # Slice at an explicit boundary. Negative slicing (data[:-nb_test] and
    # data[-nb_test:]) breaks when nb_test is 0 because -0 == 0: the training
    # set would come out empty and the test set would contain every sample.
    nb_train = n_samples - nb_test

    x_train = data[:nb_train]
    y_train = labels[:nb_train]

    x_test = data[nb_train:]
    y_test = labels[nb_train:]

    return x_train, y_train, x_test, y_test

In [5]:
# Read the sentences and their integer labels from the TSV file at TRAIN_DATA_PATH.
data_lines, data_labels = loadDataSet(TRAIN_DATA_PATH)

# Report how many rows were loaded.
print('Found %s training instance.' % len(data_lines))

Found 1574 training instance.


In [6]:
# The fitted tokenizer should be saved along with the model so that text is
# mapped to the same integer sequences (i.e. stays consistent) at prediction time.

tokenizer = Tokenizer(nb_words=VOCAB_SIZE)
tokenizer.fit_on_texts(data_lines)
data_sequences = tokenizer.texts_to_sequences(data_lines)
data_index = tokenizer.word_index
# Number of rows the embedding matrix needs. Word indices start at 1 (0 is the
# padding value), so a corpus with fewer than VOCAB_SIZE distinct words still
# produces the index len(data_index) and therefore needs len(data_index) + 1
# rows. With a larger corpus the tokenizer only emits indices below VOCAB_SIZE.
vocab_dim = min(VOCAB_SIZE, len(data_index) + 1)

In [7]:
# Bring every index sequence to SENTENCE_LENGTH entries so they form one 2-D array.
# NOTE(review): relies on pad_sequences' default padding/truncating side
# (presumably 'pre' for both) - confirm for the installed Keras version.
data_padded = pad_sequences(data_sequences, maxlen=SENTENCE_LENGTH)
# split() indexes the labels with an index array (labels[indices]), which a
# plain Python list does not support, so convert them to a NumPy array.
data_labels = np.asarray(data_labels)

In [8]:
# Seed NumPy's global RNG so the shuffle inside split() - and with it the
# train/test partition and every metric reported below - is the same on each
# Restart & Run All. Later np.random draws in the notebook become
# deterministic as well.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

x_train, y_train, x_test, y_test = split(TEST_SPLIT, data_padded, data_labels)

# Sanity checks: the partition sizes add up to the full dataset and samples
# stay paired with their labels.
assert len(x_train) + len(x_test) == len(data_padded)
assert len(x_train) == len(y_train) and len(x_test) == len(y_test)

In [9]:
# Small random initial word vectors: one row per vocabulary index, one column
# per embedding dimension. The array already has the requested shape, so no
# reshape is needed.
embedding_matrix = np.random.normal(
    loc=0.0,
    scale=0.01,
    size=(vocab_dim, EMBEDDING_DIM),
)

# NOTE(review): trainable=False freezes these randomly initialised vectors, so
# the embedding itself never learns - confirm this is intended now that no
# pretrained vectors are loaded into the matrix.
embedding_layer = Embedding(
    input_dim=vocab_dim,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=SENTENCE_LENGTH,
    trainable=False,
)

In [10]:
print('Training model.')

# Frozen embedding -> two stacked LSTMs -> single sigmoid unit for the binary
# label. The first LSTM returns the full sequence so the second one receives
# one vector per time step.
model = Sequential([
    embedding_layer,
    LSTM(128, dropout_W=0.2, dropout_U=0.2, return_sequences=True),
    LSTM(128, dropout_W=0.2, dropout_U=0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adagrad',
    metrics=['accuracy'],
)

# The held-out split doubles as validation data, so the per-epoch validation
# figures are measured on the test set.
model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    nb_epoch=20,
    batch_size=20,
)


Training model.
Train on 1260 samples, validate on 314 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1e949e25940>