# Load texts

The dataset can be downloaded from http://ai.stanford.edu/~amaas/data/sentiment/.
The task is binary sentiment classification on the IMDb movie review dataset. The training set used here contains 25,000 movie reviews: 12,500 positive and 12,500 negative.

In [1]:
import os
import numpy as np
# Root folder of the extracted dataset, resolved against the working directory.
datapath = 'aclImdb'

In [2]:
# Text normalisation: everything except ASCII letters, digits and spaces
# (punctuation, parentheses, question marks, ...) is dropped.
import re
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")


def cleanSentences(string):
    """Return a lower-cased copy of `string` with only alphanumerics and spaces.

    HTML line breaks (``<br />``) are turned into a single space before the
    remaining special characters are removed, so the words on either side of
    a break stay separated.
    """
    lowered = string.lower()
    without_breaks = lowered.replace("<br />", " ")
    return strip_special_chars.sub("", without_breaks)

In [3]:
# Number of token ids kept per review: used as the padded sequence length
# and as the input length of the embedding layer and the model input.
maxSeqLength = 250

In [4]:
# Read every training review into `texts` (cleaned strings) and build the
# parallel `labels` list holding one one-hot target per review.
train_dir = os.path.join(datapath, 'train')
labels = []
texts = []

# One-hot targets: [0, 1] marks a negative review, [1, 0] a positive one.
for class_name, label in (('neg', [0, 1]), ('pos', [1, 0])):
    path = os.path.join(train_dir, class_name)

    for fname in os.listdir(path):
        fpath = os.path.join(path, fname)
        # `with` closes each file as soon as it has been read, instead of
        # leaving one open handle per review. The encoding is explicit so
        # decoding does not depend on the platform's default locale.
        with open(fpath, encoding='utf-8') as f:
            t = cleanSentences(f.read())
        texts.append(t)
        labels.append(label)


In [5]:
# Sample of the data: display the first cleaned review.
texts[0]

'i remember watching this movie several times as a very young kid and there were parts of it many in fact that i did not understand i think i have seen it once as an adult and i then understood those parts the only problem with viewing it as an adult was that it was not entertaining to me at all so what kind of movie is this is it a kids movie not hardly it contains language and subject matter not suitable for kids is it a hyperbole of what every parent feels like they are going through with their own children maybe but then why wouldnt it focus more on john ritters character instead of junior when a film has a 7yearold as its main character in order to do well with its audience it should be a movie for the seven and under crowd otherwise people older than that will have no way to relate even 8yearolds wouldnt want to see a movie about a kid who is whole year younger than them im pretty sure this film did not do well in the box office and the reason has to be because it was unable to f

In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Map every cleaned review to a sequence of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Pad or truncate each sequence to maxSeqLength ids so they stack into one 2-D array.
data = pad_sequences(sequences, maxlen=maxSeqLength)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
VALIDATION_SPLIT = 0.2  # fraction of the reviews held out for validation
SPLIT_SEED = 42         # fixed so the split is the same on every run

# A dedicated seeded generator keeps the shuffle reproducible without
# touching NumPy's global random state.
indices = np.random.RandomState(SPLIT_SEED).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Slice at a non-negative boundary: with `[:-n]` / `[-n:]`, n == 0 would give
# an empty training set and put every sample in the validation set.
split_at = data.shape[0] - nb_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

Using TensorFlow backend.


Found 111525 unique tokens.
Shape of data tensor: (25000, 250)
Shape of label tensor: (25000, 2)


# Loading pretrained word vectors from GloVe

We use a smaller and more manageable matrix from GloVe, which contains 400,000 word vectors, each with a dimensionality of 50.

In [7]:
# Parse the GloVe text file into a dict mapping token -> float32 vector.
filename = 'glove.6B.50d.txt'
embeddings_index = {}
# Each line holds a token followed by its vector components, separated by
# whitespace. The context manager closes the file even if a line fails to
# parse, and the explicit encoding avoids depending on the platform default.
with open(filename, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [8]:
# Width of each word vector; must match the vectors stored in embeddings_index.
EMBEDDING_DIM = 50

# One row per tokenizer id plus one extra row, because ids are looked up
# directly as row numbers. Rows start at zero and are overwritten only for
# words that have a pretrained vector.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # words not found in embedding index will be all-zeros.
        continue
    embedding_matrix[i] = embedding_vector

In [9]:
from keras.layers import Embedding

# Embedding lookup initialised from embedding_matrix and frozen
# (trainable=False), so the pretrained vectors are not updated during training.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=maxSeqLength,
    trainable=False
)

# RNN (with LSTM units)


In [10]:
from keras.layers import Input, Dropout, Dense, BatchNormalization, \
    Activation, concatenate, GRU, Embedding, Flatten, LSTM
from keras.models import Model
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping#, TensorBoard
from keras import backend as K
from keras import optimizers
from keras import initializers

In [11]:
# Network: token ids -> frozen embeddings -> LSTM(64) -> dropout -> 2-way softmax.
sequence_input = Input(shape=(maxSeqLength,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
lstm_features = LSTM(64)(embedded_sequences)
# Drop a quarter of the LSTM features during training.
x = Dropout(0.25)(lstm_features)
output = Dense(2, activation='softmax')(x)

In [12]:
# Wrap the graph between the id input and the softmax output in a Model.
model = Model(inputs=sequence_input, outputs=output)

In [None]:
# Adam optimiser with a learning rate of 0.001.
adam = optimizers.Adam(lr=0.001)
# The output layer is a 2-unit softmax trained on one-hot labels, so the
# matching loss is categorical cross-entropy; binary_crossentropy is meant
# for independent sigmoid outputs.
model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['acc'])

In [None]:
# Train for 20 epochs in mini-batches of 128, passing the validation split
# so Keras evaluates on it during training.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=20,
    validation_data=(x_val, y_val)
)

Train on 20000 samples, validate on 5000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20