# Sentiment classification of IMDB movie reviews

## Using embeddings and an LSTM

<img src="plan.png" height="500">

In [1]:
import keras.datasets.imdb as imdb
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [2]:
# Load the IMDB reviews, restricting the vocabulary to the `top_words` most
# frequent words (Keras maps the remaining words to its out-of-vocabulary id
# rather than removing them from the sequences).
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

# The test split is used as validation data later on, hence the second label.
print("{} Training sequences".format(len(X_train)))
print("{} Validation sequences".format(len(X_test)))

25000 Training sequences
25000 Validation sequences


In [3]:
# `index` maps word -> integer id; `reverse_index` is the inverse lookup
# (integer id -> word) used to turn encoded reviews back into text.
index = imdb.get_word_index()
reverse_index = {word_id: word for word, word_id in index.items()}

In [4]:
def sequence2text(array, vocab=None):
    """Decode a sequence of encoded word ids into a space-separated string.

    Every id is shifted down by 3 before being looked up, so ids that have no
    entry in the lookup table (e.g. 0, 1 and 2) decode to an empty string and
    appear as extra spaces in the result.

    Parameters
    ----------
    array : iterable of int
        Encoded review (word ids).
    vocab : mapping of int -> str, optional
        Lookup table from (id - 3) to word. Defaults to the notebook-level
        ``reverse_index``.

    Returns
    -------
    str
        The decoded text. A single leading space, left behind when the first
        id decodes to an empty string, is removed.
    """
    if vocab is None:
        vocab = reverse_index
    decoded = " ".join(vocab.get(i - 3, "") for i in array)
    # Only drop the first character when it really is the separator left by an
    # empty first token. Sequences that start with a real word (e.g. reviews
    # truncated from the front by padding) must keep their first letter.
    return decoded[1:] if decoded.startswith(" ") else decoded

In [5]:
# Sanity check: decode the first training review back into readable text.
sequence2text(X_train[0])

"this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert  is an amazing actor and now the same being director  father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for  and would recommend it to everyone to watch and the fly  was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also  to the two little  that played the  of norman and paul they were just brilliant children are often left out of the  list i think because the stars that play them all grown up are such a big  for the whole film but these children are amazing and should be  for what they have done don't you think the whole story was so lovely 

In [7]:
# Bring every review to exactly `max_review_length` ids: shorter reviews are
# padded and longer ones are truncated.
max_review_length = 500
X_train, X_test = (
    sequence.pad_sequences(reviews, maxlen=max_review_length)
    for reviews in (X_train, X_test)
)

In [8]:
# Decode the same review after padding: the padding ids have no entry in
# `reverse_index`, so they show up as a run of leading spaces.
sequence2text(X_train[0])

"                                                                                                                                                                                                                                                                                          this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert  is an amazing actor and now the same being director  father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for  and would recommend it to everyone to watch and the fly  was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also  to the two little  that played the  of norma

In [12]:
# Embedding lookup: one 100-dimensional vector per vocabulary id, applied to
# input sequences of `max_review_length` ids.
embedding_layer = Embedding(top_words, 100, input_length=max_review_length)

In [13]:
# create the model
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=1, batch_size=100)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 100)          500000    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 580,501
Trainable params: 580,501
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
# Training and validation loss/accuracy recorded by fit(), one entry per epoch.
print(history.history)

{'loss': [0.46567636728286743], 'accuracy': [0.7703999876976013], 'val_loss': [0.349908709526062], 'val_accuracy': [0.8555600047111511]}


In [19]:
# Model outputs for the padded test reviews; downstream cells read the score
# of review i as y_pred[i][0] and round it to obtain the predicted class.
y_pred = model.predict(X_test)

In [49]:
# Print up to three randomly chosen misclassified test reviews.
# The misclassified indices are computed up front and sampled without
# replacement, so the cell always terminates (even when fewer than three
# mistakes exist) and never shows the same review twice.
pred_labels = np.round(y_pred[:, 0])
wrong_idx = np.flatnonzero(pred_labels != y_test)
for i in np.random.permutation(wrong_idx)[:3]:
    pred = pred_labels[i]
    true = y_test[i]
    # For a misclassified review |true - score| is the probability the model
    # assigned to the (wrong) class it predicted.
    print("Predicted :",pred, " True class : ",true, " Certainty : ", np.abs(true - y_pred[i][0]) )
    print(sequence2text(X_test[i]))
    print()

Predicted : 0.0  True class :  1  Certainty :  0.6934072971343994
                                                                                                                                                                                                                                                                                                                                        many people have said that this movie was not a good movie at a horror perspective i agree it was not very scary it did have some gruesome ways of torture yes  going where they shouldn't but not scary but it was a good movie at a comedic stand point i thought it was hilarious such bad acting from   with his stupid   and every second word he said came out as what is  then the angry mob at his house with one of them holding a sign saying we're not gonna take it which is a song from his band twisted sister but i think the part that made me laugh the most was the one  wife that was dead and   was holdin

In [51]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test split, using the rounded
# model outputs as predicted labels (0 -> Negative, 1 -> Positive).
target_names = ['Negative', 'Positive']
print(
    classification_report(
        y_test,
        np.round(y_pred),
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

    Negative       0.82      0.91      0.86     12500
    Positive       0.90      0.80      0.85     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000



In [None]:
# Optional, currently disabled: read pre-trained 100-d GloVe vectors from
# 'glove.6B.100d.txt' into a dict mapping word -> float32 vector.
# NOTE(review): if this is re-enabled, prefer `with open(...) as f:` so the
# file is closed even when parsing fails.
# embeddings_index = {}
# f = open('glove.6B.100d.txt','r',encoding="utf-8")
# for line in f:
#     values = line.split()
#     word = values[0]
#     coefs = np.asarray(values[1:], dtype='float32')
#     embeddings_index[word] = coefs
# f.close()

# print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Optional, currently disabled: build an embedding matrix with one row per
# entry of the dataset word index, filled from the GloVe vectors.
# NOTE(review): rows are addressed by the raw word-index id `i`, whereas
# sequence2text looks words up with `i - 3`, i.e. the encoded reviews appear
# to use ids shifted by 3. Confirm the offset before enabling, otherwise
# every word would receive another word's vector.
# embedding_matrix = np.zeros((len(index) + 1, 100))
# for word, i in index.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         # words not found in embedding index will be all-zeros.
#         embedding_matrix[i] = embedding_vector

In [None]:
# Optional, currently disabled: a frozen embedding layer initialised from
# `embedding_matrix`, intended as a drop-in replacement for the trainable
# `embedding_layer` defined earlier in the notebook.
# embedding_layer = Embedding(input_dim= embedding_matrix.shape[0],
#                             output_dim = embedding_matrix.shape[1],
#                             input_length= max_review_length,
#                             weights=[embedding_matrix],
#                             trainable=False)