In [1]:
import keras.datasets.imdb as imdb

In [2]:
# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

print(len(X_train), "Training sequences")
print(len(X_test), "Validation sequences")


25000 Training sequences
25000 Validation sequences


In [3]:
index = imdb.get_word_index()
reverse_index = dict([(value, key) for (key, value) in index.items()]) 

In [4]:
def sequence2text(array):
    decoded = " ".join( [reverse_index.get(i-3 , "") for i in array] )
    return decoded[1:] #remove first space from join()

In [5]:
sequence2text(X_train[0])

"this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert  is an amazing actor and now the same being director  father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for  and would recommend it to everyone to watch and the fly  was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also  to the two little  that played the  of norman and paul they were just brilliant children are often left out of the  list i think because the stars that play them all grown up are such a big  for the whole film but these children are amazing and should be  for what they have done don't you think the whole story was so lovely 

In [6]:
import numpy as np
import pickle
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [7]:
# truncate and pad input sequences
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

In [8]:
sequence2text(X_train[0])

"                                                                                                                                                                                                                                                                                          this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert  is an amazing actor and now the same being director  father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for  and would recommend it to everyone to watch and the fly  was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also  to the two little  that played the  of norma

In [9]:
embeddings_index = {}
f = open('glove.6B.100d.txt','r',encoding="utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400001 word vectors.


In [10]:
len(index)

88584

In [11]:
embedding_matrix = np.zeros((len(index) + 1, 100))
for word, i in index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [12]:
embedding_matrix.shape

(88585, 100)

In [13]:
embedding_layer = Embedding(input_dim= embedding_matrix.shape[0],
                            output_dim = embedding_matrix.shape[1],
                            input_length= max_review_length,
                            weights=[embedding_matrix],
                            trainable=False)

In [14]:
# create the model
model = Sequential()

model.add(embedding_layer)
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 100)          8858500   
_________________________________________________________________
lstm (LSTM)                  (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 8,939,001
Trainable params: 80,501
Non-trainable params: 8,858,500
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x295fd15aa00>

In [15]:
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 73.18%
