##Movie Sentiment Analysis using RNN

In [22]:
# Import libraries
from keras.datasets import imdb
from keras.utils import pad_sequences
from keras import Sequential
from keras.layers import Dense,SimpleRNN,Embedding,Flatten

In [23]:
# IMDB dataset having 50K movie reviews for natural language processing or Text analytics.
# Here we have set of 25,000 highly polar movie reviews for training and 25,000 for testing.
# So, predict the number of positive and negative reviews using either classification or deep learning algorithms.
# only consider the top 10,000 most common words,

In [24]:
# Load the dataset
(X_train,y_train),(X_test,y_test) = imdb.load_data()

In [None]:
X_train[0]

In [26]:
# To get shape
print('No of samples in training set',X_train.shape)
print('No of samples in test set',X_test.shape)


No of samples in training set (25000,)
No of samples in test set (25000,)


In [27]:
y_train[0]

1

In [28]:
# To get maximum length of review
print(len(max((X_train+ X_test), key=len)))

2697


In [29]:
# To get minimum length of review
print(len(min((X_train+ X_test), key=len)))

70


In [30]:
# Reshape the y value
# import numpy as np
# y_train = np.asarray(y_train).reshape((-1,1))
# y_test = np.asarray(y_test).reshape((-1,1))

In [31]:
# Apply the padding and decide max length
X_train = pad_sequences(X_train,padding='post',maxlen=100)
X_test = pad_sequences(X_test,padding='post',maxlen=100)

In [32]:
X_train.shape

(25000, 100)

In [33]:
# # using RNN with Embedding Technique
# Word Embedding: Turns positive integers (indexes) into dense vectors of fixed size.
# A word embedding is a class of approaches for representing words and documents using a dense vector representation.
# It is an improvement over more the traditional bag-of-word,OHE model encoding schemes where large sparse vectors were used
# to represent each word or to score each word within a vector to represent an entire vocabulary.
# There are three parameters to the embedding layer
# input_dim : Size of the vocabulary
# output_dim : Length of the vector for each word
# input_length : Maximum length of a sequence

model1 = Sequential()
model1.add(Embedding(10000, 32,input_length=100))  # Unique words(vocabulary size)= 10000, embedding_size=32, maximum length = max_words.
model1.add(SimpleRNN(32)) #return_sequences=False
model1.add(Dense(1, activation='sigmoid'))



In [34]:
# For RNN with embedding
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model1.fit(X_train, y_train,epochs=5,batch_size= 256,validation_data=(X_test,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb56dfa7850>

In [35]:
# To get test accuracy
scores = model1.evaluate(X_test,y_test)
print('test accuracy',scores[1])

test accuracy 0.8311200141906738


In [36]:
# Prediction
y_pred = model1.predict(X_test)
y_pred



array([[0.02652425],
       [0.97763693],
       [0.8914486 ],
       ...,
       [0.02268264],
       [0.02904704],
       [0.1819602 ]], dtype=float32)

In [37]:
# Covert probability to numbers
t1 = []
for i in range(len(y_pred)):
    if y_pred[i] >= 0.5:
        t1.append(1)
    else:
        t1.append(0)

In [38]:
print(t1)

[0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 

In [39]:
# Actual Values
y_test

array([0, 1, 1, ..., 0, 0, 0])