In [1]:
from tensorflow.contrib.keras.python.keras.preprocessing import sequence
from tensorflow.contrib.keras.python.keras.preprocessing.text import Tokenizer
from tensorflow.contrib.keras.python.keras.models import Sequential, Model
from tensorflow.contrib.keras.python.keras.layers import Dense, Embedding, LSTM, Input, merge, BatchNormalization, Dropout
from tensorflow.contrib.keras.python.keras.datasets import imdb
from tensorflow.contrib.keras.python.keras.callbacks import TensorBoard

import numpy as np
import os

In [2]:
# variour variable
max_features = 20000
maxlen = 200

In [3]:
x_train = []
y_train = []

In [4]:
path='./aclImdb/train/pos/'
x_train.extend([open(path + f).read() for f in os.listdir(path) if f.endswith('.txt')])
y_train.extend([1 for _ in range(12500)])

In [5]:
path='./aclImdb/train/neg/'
x_train.extend([open(path + f).read() for f in os.listdir(path) if f.endswith('.txt')])
y_train.extend([0 for _ in range(12500)])

In [6]:
print('x:')
print(x_train[:1])

x:
['This movie makes me want to fall in love all over again!I am naming my next daughter "Adelaide". Just so that someone who sings like Ol Blue eyes can swoon her one day, and feel the butterflies I felt hearing it sung, and it wasn\'t even to me! I give it a 9/10']


In [7]:
print('y:')
print(y_train[:1])

y:
[1]


In [8]:
print(len(x_train))
print(len(y_train))

25000
25000


In [9]:
x_test = []
y_test = []

In [10]:
path='./aclImdb/test/pos/'
x_test.extend([open(path + f).read() for f in os.listdir(path) if f.endswith('.txt')])
y_test.extend(1 for _ in range(12500))

In [11]:
path='./aclImdb/test/neg/'
x_test.extend([open(path + f).read() for f in os.listdir(path) if f.endswith('.txt')])
y_test.extend(0 for _ in range(12500))

In [12]:
print('x:')
print(x_test[:1])

x:
['Boogie Nights is perhaps one of the greatest examples any would-be filmmaker should take a long hard look at. Sure, you could spend loads of quality time reviewing the clasics from Hitchcock to Scorsese; but lets follow suit for the modern generation and study half-heartedly.<br /><br />Where to begin, I suppose one could look at the film as simply a story, perhaps even docudrama which focuses on the late 1970\'s porn industry-and what an industry it was! The other half could focus on the incredible detail one brillant filmmaker can achieve simply by using polyester and *ahem* rubber. But honestly, Boogie Nights brings back the pure, no-bul!shi$, in your face kind of cinema I haven\'t experienced since the film greats of the 1970\'s...ironic...or stroke of genius. The story is full of richly detailed characters, all of which you either can relate too, love, or hate; but the impact is clear-you are feeling something for them. Among the characters the two performances which stand ou

In [13]:
print('y:')
print(y_test[:1])

y:
[1]


In [14]:
print(len(x_test))
print(len(y_test))

25000
25000


In [15]:
imdbTokenizer = Tokenizer(num_words=max_features)

In [16]:
imdbTokenizer.fit_on_texts(x_train)

In [17]:
# print top 20 words
for word, value in imdbTokenizer.word_index.items():
    if value < 20:
        print(value, word)

1 the
11 this
5 to
6 is
7 br
15 for
19 film
10 i
14 as
9 it
4 of
8 in
16 with
12 that
13 was
18 but
3 a
2 and
17 movie


In [18]:
# create int to word dictionary
intToWord = {}
for word, value in imdbTokenizer.word_index.items():
    intToWord[value] = word

In [19]:
# add a symbol for null placeholder
intToWord[0] = "!!!NA!!!"

print(intToWord[1])
print(intToWord[2])
print(intToWord[32])

the
and
an


In [20]:
# convert word strings to integer sequence lists
print(x_train[0])
print(imdbTokenizer.texts_to_sequences(x_train[:1]))
for value in imdbTokenizer.texts_to_sequences(x_train[:1])[0]:
    print(intToWord[value],)

This movie makes me want to fall in love all over again!I am naming my next daughter "Adelaide". Just so that someone who sings like Ol Blue eyes can swoon her one day, and feel the butterflies I felt hearing it sung, and it wasn't even to me! I give it a 9/10
[[11, 17, 163, 69, 178, 5, 806, 8, 116, 29, 117, 171, 10, 241, 10923, 58, 372, 575, 11696, 40, 35, 12, 291, 34, 3130, 37, 1332, 520, 67, 38, 28, 248, 2, 231, 1, 16550, 10, 417, 2229, 9, 5311, 2, 9, 283, 57, 5, 69, 10, 199, 9, 3, 786, 155]]
this
movie
makes
me
want
to
fall
in
love
all
over
again
i
am
naming
my
next
daughter
adelaide
just
so
that
someone
who
sings
like
blue
eyes
can
her
one
day
and
feel
the
butterflies
i
felt
hearing
it
sung
and
it
wasn't
even
to
me
i
give
it
a
9
10


In [21]:
x_train = imdbTokenizer.texts_to_sequences(x_train)
x_test = imdbTokenizer.texts_to_sequences(x_test)

In [22]:
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

25000 train sequences
25000 test sequences


In [23]:
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape: ', x_train.shape)
print('x_test shape: ', x_test.shape)

x_train shape:  (25000, 200)
x_test shape:  (25000, 200)


In [24]:
y_train = np.array(y_train)
y_test = np.array(y_test)

In [25]:
# set model hyper parameters
epochs = 15
embedding_neurons = 128
lstm_neurons = 64
batch_size = 32

In [26]:
input_1 = Input(shape=(maxlen, ), dtype='int32')

In [27]:
embedding = Embedding(max_features, embedding_neurons, input_length=maxlen)(input_1)

In [28]:
bnorm = BatchNormalization()(embedding)

In [29]:
forwards = LSTM(lstm_neurons, dropout=0.2, recurrent_dropout=0.2)(bnorm)

In [30]:
after_dp = Dropout(0.5)(forwards)

In [31]:
output = Dense(1, activation='sigmoid')(after_dp)

In [32]:
model = Model(inputs=sequence1, outputs=output)

In [33]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 128)          2560000   
_________________________________________________________________
batch_normalization_1 (Batch (None, 200, 128)          512       
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 2,609,985.0
Trainable params: 2,609,729.0
Non-trainable params: 256.0
__________________________________________________________

In [34]:
tensorboard = TensorBoard(log_dir='./lstm_logs')

In [37]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [38]:
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test), callbacks=[tensorboard], verbose=2)

Train on 25000 samples, validate on 25000 samples
Epoch 1/15
190s - loss: 0.4840 - acc: 0.7655 - val_loss: 0.4242 - val_acc: 0.8214
Epoch 2/15
189s - loss: 0.2563 - acc: 0.9016 - val_loss: 0.3245 - val_acc: 0.8734
Epoch 3/15
189s - loss: 0.1538 - acc: 0.9451 - val_loss: 0.3668 - val_acc: 0.8622
Epoch 4/15
187s - loss: 0.0908 - acc: 0.9688 - val_loss: 0.4262 - val_acc: 0.8606
Epoch 5/15
188s - loss: 0.0590 - acc: 0.9794 - val_loss: 0.5070 - val_acc: 0.8607
Epoch 6/15
187s - loss: 0.0439 - acc: 0.9854 - val_loss: 0.5800 - val_acc: 0.8590
Epoch 7/15
186s - loss: 0.0314 - acc: 0.9898 - val_loss: 0.6515 - val_acc: 0.8478
Epoch 8/15
189s - loss: 0.0316 - acc: 0.9898 - val_loss: 0.6443 - val_acc: 0.8587
Epoch 9/15
188s - loss: 0.0229 - acc: 0.9928 - val_loss: 0.7170 - val_acc: 0.8608
Epoch 10/15
184s - loss: 0.0169 - acc: 0.9944 - val_loss: 0.7275 - val_acc: 0.8567
Epoch 11/15
189s - loss: 0.0147 - acc: 0.9956 - val_loss: 0.8177 - val_acc: 0.8472
Epoch 12/15
187s - loss: 0.0134 - acc: 0.9962 

<tensorflow.contrib.keras.python.keras.callbacks.History at 0x7f19559810b8>

In [39]:
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=2)

In [40]:
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.859218871113
Test accuracy: 0.85496


In [41]:
import json
model_json = model.to_json()
with open('model_json.json', 'w') as file:
    json.dump(model_json, file)

In [42]:
model.save_weights('model_weight.h5')