In [43]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.datasets import imdb
import tensorflow as tf
import numpy as np

In [58]:
# Passed to the loader as `num_words` and later used as the Embedding input size.
max_features = 5000
# Passed to the loader as `index_from`; the same offset is applied again when
# the word -> id table is rebuilt, so decoded ids line up with the data.
INDEX_FROM = 3

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(
    num_words=max_features,
    index_from=INDEX_FROM,
)
print(f'{len(x_train)} train sequences')
print(f'{len(x_test)} test sequences')

Loading data...
25000 train sequences
25000 test sequences


In [59]:
# Word -> id table shifted by the same offset that was given to the loader,
# followed by the special tokens registered under ids 0-3.
word_to_id = {
    word: rank + INDEX_FROM for word, rank in imdb.get_word_index().items()
}
word_to_id.update({"<PAD>": 0, "<START>": 1, "<UNK>": 2, "<UNUSED>": 3})

# Inverse table, used here to decode the first training review back to text.
id_to_word = {token_id: word for word, token_id in word_to_id.items()}
print(' '.join(id_to_word[token_id] for token_id in x_train[0]))

<START> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <UNK> is an amazing actor and now the same being director <UNK> father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for <UNK> and would recommend it to everyone to watch and the fly <UNK> was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also <UNK> to the two little <UNK> that played the <UNK> of norman and paul they were just brilliant children are often left out of the <UNK> list i think because the stars that play them all grown up are such a big <UNK> for the whole film but these children are amazing and should be <UNK> for what they

In [60]:
# Mini-batch size used by both the training and the evaluation cells.
batch_size = 32
# Sequence length every review is padded to before it reaches the model.
maxlen = 100

In [61]:
print('Pad sequences (samples x time)')
# Bring both splits to a fixed width of `maxlen` ids per review.
x_train, x_test = (
    sequence.pad_sequences(split, maxlen=maxlen) for split in (x_train, x_test)
)
print(f'x_train shape: {x_train.shape}')
print(f'x_test shape: {x_test.shape}')

Pad sequences (samples x time)
x_train shape: (25000, 100)
x_test shape: (25000, 100)


In [62]:
print('Build model...')
# Embedding -> LSTM -> single sigmoid unit, assembled in one constructor call.
model = Sequential([
    Embedding(max_features, 128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

# try using different optimizers and different optimizer configs
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

Build model...


In [64]:
print('Train...')
# The test split doubles as validation data here, and the same split is scored
# again in the evaluation cell, so it is not an independent hold-out set.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=3,
    validation_data=(x_test, y_test),
)


Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x20a275e13c8>

In [65]:
# Unpack the two values returned by evaluate and report them as the test
# score and the test accuracy.
score, acc = model.evaluate(
    x=x_test,
    y=y_test,
    batch_size=batch_size,
    verbose=0,
)
for label, value in (('Test score:', score), ('Test accuracy:', acc)):
    print(label, value)

Test score: 0.36784964403152465
Test accuracy: 0.84948


In [66]:
import os

# Create the output directory first: on a fresh checkout `model/` does not
# exist yet and the save would fail with a missing-path error.
os.makedirs('model', exist_ok=True)
model.save_weights('model/model.h5')

In [67]:
import os

# Write the model's JSON description into the `model/` directory, creating
# the directory when it is missing so this cell works on a fresh checkout.
model_dir = "model"
os.makedirs(model_dir, exist_ok=True)

model_json = model.to_json()
# Join directory and file name as separate components; a single-argument
# os.path.join call is a no-op.
with open(os.path.join(model_dir, "model.json"), "w") as json_file:
    json_file.write(model_json)

In [76]:
import re
import json

# Punctuation that is deleted outright. Raw strings keep the backslashes
# literal, so the string parser no longer sees invalid escape sequences.
REPLACE_NO_SPACE = re.compile(r"[.;:!'?,\"()\[\]]")
# Doubled HTML line breaks, hyphens and slashes are each replaced by a space.
REPLACE_WITH_SPACE = re.compile(r"<br\s*/><br\s*/>|[-/]")


def review_to_words(review):
    """Lower-case a review and normalise its punctuation.

    Characters matched by REPLACE_NO_SPACE are removed; matches of
    REPLACE_WITH_SPACE are replaced by a single space.

    Args:
        review: Raw review text.

    Returns:
        The cleaned text as one string (not yet split into tokens).
    """
    words = REPLACE_NO_SPACE.sub("", review.lower())
    words = REPLACE_WITH_SPACE.sub(" ", words)
    return words


def preprocess_input(text, vocab_dict, maxlen=100, num_words=None):
    """Encode a review as a one-row batch of exactly `maxlen` token ids.

    Args:
        text: Raw review text.
        vocab_dict: Mapping from word to integer id. Its "<UNK>" entry (or 2
            when absent) is used for words that are not in the mapping.
        maxlen: Length of the returned id sequence.
        num_words: Optional vocabulary cap. When given, ids greater than or
            equal to it are replaced by the unknown-word id, so that no id
            exceeds the range the model was built for.

    Returns:
        A list containing a single list of `maxlen` ints. Short inputs are
        left-padded with 0; long inputs keep their first `maxlen` tokens.
        The shape is the same in both cases.
    """
    unk_id = vocab_dict.get("<UNK>", 2)

    int_tokens = []
    for token in review_to_words(text).split():
        token_id = vocab_dict.get(token, unk_id)
        if num_words is not None and token_id >= num_words:
            token_id = unk_id
        int_tokens.append(token_id)

    # NOTE(review): the training data is cut by pad_sequences with its default
    # truncation side -- confirm that keeping the leading tokens matches it.
    int_tokens = int_tokens[:maxlen]
    padding = [0] * (maxlen - len(int_tokens))
    return [padding + int_tokens]

In [81]:
# Persist the word -> id table as JSON.
with open('vocab_dict.json', 'w') as vocab_file:
    vocab_file.write(json.dumps(word_to_id))

In [51]:
# Sample review that is encoded in the next cell.
text = (
    "This movie was the worst movie ever "
    "i have ever seen!"
)

In [78]:
# Encode the sample with the same sequence length the training data was padded
# to, instead of relying on the function's separately hard-coded default.
x_sample = preprocess_input(text, word_to_id, maxlen=maxlen)

In [79]:
# Model output for the encoded sample; the final layer is a single sigmoid unit.
model.predict(x=x_sample)

array([[0.00538306]], dtype=float32)