In [None]:
import pandas as pd
import itertools

from __future__ import print_function
import numpy as np

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional

from utils import prjPaths

# Locate the project's data directory and read the train/test CSV files.
paths = prjPaths()
print('Loading data...')
rootData_dir = paths.ROOT_DATA_DIR
train_csv = '{}/imdb_train_filtered.csv'.format(rootData_dir)
test_csv = '{}/imdb_test_filtered.csv'.format(rootData_dir)
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

# Column 'x' is the model input and column 'y' the target for each split
# (later code iterates every 'x' entry element by element and fits on 'y').
x_train, y_train = train['x'], train['y']
x_test, y_test = test['x'], test['y']

# Sanity check: number of rows in each column.
print('shapes after creating dataset')
print('x_train shape: {}'.format(len(x_train)))
print('y_train shape: {}'.format(len(y_train)))
print('x_test shape: {}'.format(len(x_test)))
print('y_test shape: {}'.format(len(y_test)))

# Sanity check: show two sample rows (positions 10 and 11) of each column.
print('data glimpse after creating dataset')
print('x_train: {}'.format(x_train[10:12]))
print('y_train: {}'.format(y_train[10:12]))
print('x_test: {}'.format(x_test[10:12]))
print('y_test: {}'.format(y_test[10:12]))

def makeCharLvl(ds):
    """Split every entry of ``ds`` into a list of its individual elements.

    Args:
        ds: Iterable of iterables. Iterating a string yields its characters,
            so a collection of strings becomes character-level sequences.

    Returns:
        A new list holding one inner list per entry of ``ds``, in the same
        order as the input.
    """
    return [list(seq) for seq in ds]

# Break every entry of both splits into a list of single characters.
x_train = makeCharLvl(x_train)
x_test = makeCharLvl(x_test)

print('shapes after making char level:')
print('x_train len: {}'.format(len(x_train)))
print('x_test len: {}'.format(len(x_test)))

# Vocabulary: every distinct character seen in either split, sorted so the
# numbering is reproducible between runs. Chaining the two splits lazily
# avoids building a concatenated copy of both datasets.
unqVoc = sorted(set(itertools.chain.from_iterable(
    itertools.chain(x_train, x_test))))

# Character -> integer id. Ids start at 1 because pad_sequences fills with 0
# by default; numbering from 0 would make the first vocabulary character
# indistinguishable from padding.
unqVoc_LookUp = {char: idx for idx, char in enumerate(unqVoc, start=1)}


def indexDS(ds, lookup=None):
    """Replace every element of every sequence in ``ds`` by its integer id.

    Args:
        ds: Iterable of sequences whose elements are keys of the mapping.
        lookup: Optional mapping from element to integer id. When omitted,
            the module-level ``unqVoc_LookUp`` is used, so existing
            single-argument calls behave as before.

    Returns:
        A list with one inner list of ids per sequence, in input order.

    Raises:
        KeyError: If an element is not present in the mapping.
    """
    if lookup is None:
        # Resolved at call time so the table built later in the script is used.
        lookup = unqVoc_LookUp
    return [[lookup[char] for char in seq] for seq in ds]


# Swap each character for its integer id from the vocabulary lookup.
x_train, x_test = indexDS(x_train), indexDS(x_test)

# Sanity check: the number of sequences must be unchanged by indexing.
print('shapes after indexing:')
print('x_train len: {}'.format(len(x_train)))
print('x_test len: {}'.format(len(x_test)))

# Sanity check: show two indexed samples (positions 10 and 11) per split.
print('data glimpse after indexing')
print('x_train: {}'.format(x_train[10:12]))
print('x_test: {}'.format(x_test[10:12]))

# Length of the longest sequence across both splits; stays 0 when both are
# empty. It becomes the common length every sequence is padded to.
max_tbd = 0
for dataset in (x_train, x_test):
    for text in dataset:
        max_tbd = max(max_tbd, len(text))

maxlen = max_tbd
# Hyper-parameters.
# Size of the embedding table: one row per vocabulary entry plus one spare
# row, so the largest id is in range whether ids are numbered from 0 or 1.
# (Previously a fixed 20000, which is unrelated to the character vocabulary.)
max_features = len(unqVoc) + 1
batch_size = 64
embedding_dims = 128
epochs = 10

# Bring every sequence to the same length (maxlen) so each split becomes a
# 2-D array of shape (samples, maxlen).
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

# Convert the label columns to NumPy arrays for training.
y_train = np.array(y_train)
y_test = np.array(y_test)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)
print(y_train[10:12])
print(y_test[10:12])


# Binary classifier over id sequences:
#   ids -> embedding vectors -> bidirectional LSTM -> dropout -> sigmoid unit.
model = Sequential()
# Use the declared embedding_dims hyper-parameter rather than repeating its
# value as a literal, so changing it in one place takes effect here.
model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
# The test split doubles as validation data. Keras documents validation_data
# as a tuple (x_val, y_val); a list is not the documented form.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))