In [74]:
from keras.datasets import imdb
import numpy as np
from matplotlib import pyplot
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras import backend as K
# List the GPU devices visible to the Keras TensorFlow backend.
# NOTE(review): `_get_available_gpus` is a private helper (leading underscore),
# so it may not exist in other Keras versions — confirm before upgrading.
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0',
 '/job:localhost/replica:0/task:0/device:GPU:1']

#### Load the training and testing data for the model

In [75]:

# Fetch the IMDB reviews with word ids limited by num_words=5000, then unpack
# each split into its features (X) and labels (y).
train_split, test_split = imdb.load_data(num_words=5000)
X_train, y_train = train_split
X_test, y_test = test_split

In [76]:
# Peek at the raw training reviews: each entry is a list of integer word ids.
X_train

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 2, 19, 178, 32]),
       list([1, 194, 1153, 194, 2, 78, 228, 5, 6, 1463, 4369,

#### Let's look at a review - in order to read a review we have to reverse the preprocessing Keras applied to the IMDB dataset (map the word ids back to words)

In [3]:
NUM_WORDS = 5000  # vocabulary size: only the top 5000 words are used
INDEX_FROM = 3    # word index offset: ids 0-2 are reserved for the special tokens below

# get_word_index() maps word -> rank; shift every rank by INDEX_FROM so the
# ids line up with the ones found in the loaded reviews.
word_to_id = {word: rank + INDEX_FROM for word, rank in imdb.get_word_index().items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2

# Invert the mapping so a review (a sequence of ids) can be decoded to text.
id_to_word = {word_id: word for word, word_id in word_to_id.items()}

# Fall back to <UNK> for any id without an entry (e.g. the reserved id 3)
# instead of raising KeyError half-way through the review.
print(' '.join(id_to_word.get(word_id, "<UNK>") for word_id in X_train[0]))

<START> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <UNK> is an amazing actor and now the same being director <UNK> father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for <UNK> and would recommend it to everyone to watch and the fly <UNK> was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also <UNK> to the two little <UNK> that played the <UNK> of norman and paul they were just brilliant children are often left out of the <UNK> list i think because the stars that play them all grown up are such a big <UNK> for the whole film but these children are amazing and should be <UNK> for what they

#### Profile the data

In [4]:
# Report the shape of every split, then the vocabulary actually present in
# the test reviews.
for split_label, split_array in (
    ("X training", X_train),
    ("X test", X_test),
    ("y training", y_train),
    ("y test", y_test),
):
    print("data shape for {} data: {}".format(split_label, split_array.shape))

# Concatenate all test reviews into one flat array and count distinct ids.
distinct_test_ids = np.unique(np.hstack(X_test))
print("number of unique words in X test data: {}".format(distinct_test_ids.size))

data shape for X training data: (25000,)
data shape for X test data: (25000,)
data shape for y training data: (25000,)
data shape for y test data: (25000,)
number of unique words in X test data: 4997


#### Pad every review to the same length to allow for training (the word embedding itself is the first layer of the model below)

In [5]:
# Bring every review in both splits to a common length of `max_words` ids so
# they can be fed to the network as one rectangular array.
max_words = 500
X_train, X_test = (
    sequence.pad_sequences(split, maxlen=max_words)
    for split in (X_train, X_test)
)

#### create the model

In [6]:
# Sentiment classifier: word ids -> 32-d embeddings -> LSTM -> dense head
# ending in a single sigmoid unit (binary positive/negative output).
model = Sequential()
# 5000 matches the num_words used when the data was loaded.
model.add(Embedding(5000, 32, input_length=max_words))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None


In [7]:
# Train for a fixed number of epochs while monitoring the test split, then
# measure the final loss/accuracy on that same split.
fit_options = {
    "validation_data": (X_test, y_test),
    "epochs": 3,
    "batch_size": 128,
}
model.fit(X_train, y_train, **fit_options)

scores = model.evaluate(X_test, y_test)
print(scores)

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
[0.3133649311351776, 0.86788]


In [8]:
# Show the evaluation result again; index 1 is the accuracy reported below.
print(scores)

[0.3133649311351776, 0.86788]


In [14]:
# scores[1] is the accuracy as a fraction; scale it to a percentage.
accuracy_percent = scores[1] * 100
print("The percentage accuracy of the sentiment model is {}%".format(accuracy_percent))

The percentage accuracy of the sentiment model is 86.788%



#### Test with new text

In [131]:
# Hand-written, strongly negative review used to sanity-check the model.
new_sentence_1 = "this movie is bad, and horrible, and I would never ever see it again, the actors were terrible, and the plot is horrible"


### First I will create a function in order to test new sentences, or reviews.

### There are a few steps needed to get new text into a format that the model can predict on. These steps are:

### Obtain the word index of the vocabulary used (in this case the IMDB movie reviews) and use it to convert each word of the new sentence into its numerical word id

### We then have to pad the resulting list of word ids, to make sure it has the same length (maximum number of words) as the other reviews

### Then shape the padded word ids into a matrix with one row per review so it can be passed into the neural network

In [132]:
def test_new_review(review):
    imdb_word_index = imdb.get_word_index()
    review_word_index = [[imdb_word_index[w] for w in review if w in imdb_word_index]]
    padded_review = sequence.pad_sequences(review_word_index, maxlen=max_words)
    matrix = np.array([padded_review.flatten()])
    score = model.predict(matrix)
    print("Threshold for score is < .50 then negative, if above then positive, this review scored: {}".format(score))
    if score < .50:
        print("Therfore a negative review")
    else:
        print("Therefore a positive review")
    

In [130]:
# Score the first hand-written review with the trained model.
test_new_review(new_sentence_1)

Threshold for score is < .50 then negative, if above then positive, this review scored: [[0.36157355]]
Therfore a negative review


In [137]:
# Hand-written positive review.
new_sentence_2 = "fantastic movie, I want to see it again"

In [138]:
# Score the second hand-written review with the trained model.
test_new_review(new_sentence_2)

Threshold for score is < .50 then negative, if above then positive, this review scored: [[0.76853365]]
Therefore a positive review


In [139]:
# Hand-written review with no explicit sentiment words, only an implied negative opinion.
new_sentence_3 = "I fell asleep in the cinema"

In [140]:
# Score the third hand-written review with the trained model.
test_new_review(new_sentence_3)

Threshold for score is < .50 then negative, if above then positive, this review scored: [[0.20456791]]
Therfore a negative review
