In [None]:
import os
import tensorflow as tf

In [None]:
os.listdir()

In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import numpy as np
from keras.utils import to_categorical
from keras import models
from keras import layers

In [None]:
from keras.datasets import imdb

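- Limiting the vocabulary to the 10,000 most frequent words.
- Limiting each review to 50 words to speed up model training. This will affect the sentiment evaluation, since a 50-word window might not capture all the important words (with the default `pad_sequences` settings used below, it is the last 50 words of a long review that are kept).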

In [None]:
# Vocabulary cap handed to load_data as num_words.
top_words = 10000
# Fixed number of word ids kept per review once the sequences are padded.
max_words = 50

# A fixed seed keeps the shuffled train/test arrays reproducible between runs.
train_split, test_split = imdb.load_data(path="imdb.npz", num_words=top_words, seed=100)
x_train, y_train = train_split
x_test, y_test = test_split

In [None]:
'''
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
import re

tokenizer = Tokenizer(num_words=top_words)
tokenizer.fit_on_sequences(x_train)
'''

- Concatenating train and test to observe full distribution

In [None]:
# Stack both splits so the statistics below describe the complete corpus.
data = np.concatenate([x_train, x_test], axis=0)
targets = np.concatenate([y_train, y_test], axis=0)

label_values = np.unique(targets)
# Flatten all reviews into one long id vector before de-duplicating.
vocabulary = np.unique(np.hstack(data))
print("Categories:", label_values)
print("Number of unique words:", len(vocabulary))

- The target column has two values.
- The total number of unique words is 9998.

In [None]:
data.shape

In [None]:
len(data[0])

##### Decoding a review

In [None]:
print("--Movie Review--")
print(data[0])
print("--Sentiment--")
print(targets[0])
print("--Movie Review Length--")
print(len(data[0]))

In [None]:
print(" -— Decoded Movie Review -— ")
index = imdb.get_word_index()
# Invert the word -> rank mapping into rank -> word for decoding.
reverse_index = {rank: word for word, rank in index.items()}
# Each id is shifted back by 3 before the lookup; ids without a word render as "#".
decoded_words = []
for token_id in data[0]:
    decoded_words.append(reverse_index.get(token_id - 3, "#"))
decoded = " ".join(decoded_words)
print(decoded)

- For a neural network we have to make the review length constant across all reviews.
- We can pad short reviews and truncate long ones.

In [None]:
reverse_index.get(1)

In [None]:
reverse_index.get(2)

In [None]:
length = [len(i) for i in data]

In [None]:
# Summary statistics of the per-review token counts, printed as "label value" pairs.
summary_stats = (
    ("Average review length:", np.mean),
    ("Median review length:", np.median),
    ("Standard Devation in reviews:", np.std),
    ("Max length for reviews:", np.max),
)
for label, statistic in summary_stats:
    print(label, statistic(length))

- The average review length is around 235 words and the maximum is 2494 words.

In [None]:
plt.figure(figsize=(15,10))
plt.boxplot(length);

- The average review length is around 235 words, with a standard deviation of 173.
- In the box plot the upper whisker ends at around 500 words.
- A review length of around 400–500 words would cover the majority of reviews.
- The problem statement mentions taking only the first 20 words; note that this notebook sets `max_words = 50`.

In [None]:
import seaborn as sns
plt.figure(figsize=(10,5))
# Pass the label vector by keyword: newer seaborn releases accept only `data`
# positionally, so a bare positional array is no longer read as the x variable.
sns.countplot(x=targets);

In [None]:
# Keyword `x=` so the label vector is counted per class on every seaborn version.
sns.countplot(x=y_train);

In [None]:
# Keyword `x=` so the label vector is counted per class on every seaborn version.
sns.countplot(x=y_test);

- Both sentiment classes are evenly distributed.

In [None]:
from keras.preprocessing import sequence

- Using `pad_sequences`: any review shorter than `max_words` is padded with zeros, and any longer review is truncated to `max_words`.

In [None]:

# Force every review to exactly max_words ids. Both modes are spelled out although
# they are the Keras defaults: short reviews are left-padded with 0 and long
# reviews keep only their final max_words ids.
pad_options = dict(maxlen=max_words, padding="pre", truncating="pre")
x_train = sequence.pad_sequences(x_train, **pad_options)
x_test = sequence.pad_sequences(x_test, **pad_options)

In [None]:
x_train.shape

In [None]:
x_test.shape

#### Model Creation

 - Embedding
 - Layers - Dropout/batch normalization
 - Dense layers

In [None]:
# Layers come from the public `keras.layers` namespace; the private
# `keras.layers.embeddings` module path is not available in newer Keras releases.
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten
from keras.preprocessing import sequence

- Creating a simple dense model with sigmoid activation and binary crossentropy

In [None]:
# Baseline: embedding -> flatten -> one hidden dense layer -> sigmoid probability.
model = Sequential()
model.add(Embedding(top_words, 32, input_length=max_words))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [None]:
# Fit the model
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=2, batch_size=128, verbose=2)
# Final evaluation of the model
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

In [None]:
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10, batch_size=100, verbose=2)

In [None]:
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

In [None]:
from keras.layers import BatchNormalization

- Expanding the model with few more dense layers

In [None]:
# Deeper variant: two hidden dense layers with batch normalisation in between.
model = Sequential()
model.add(Embedding(top_words, 32, input_length=max_words))
model.add(Flatten())
model.add(Dense(500, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

# Fit the model
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=2, batch_size=128, verbose=2)
# Final evaluation of the model; scores is [loss, accuracy] for the metrics compiled above
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

In [None]:
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10, batch_size=100, verbose=2)

In [None]:
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

- No significant change was observed after adding more dense layers.

In [None]:

# CNN for the IMDB problem
# Layers come from the public `keras.layers` namespace; the private
# `keras.layers.convolutional` / `keras.layers.embeddings` module paths are not
# available in newer Keras releases.
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Conv1D, Dense, Embedding, Flatten, MaxPooling1D
from keras.preprocessing import sequence

- Creating a CNN model 

In [None]:
# Build the CNN in one go: embedding -> 1-D convolution -> max-pooling -> dense head.
model = Sequential([
    Embedding(top_words, 32, input_length=max_words),
    Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(250, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Fit the model
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=2, batch_size=128, verbose=2)
# Final evaluation of the model
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

In [None]:
y_train[0]

- The accuracy improves from 76% to 82% even with a very basic CNN model.

In [None]:
#Y = pd.get_dummies(data['sentiment']).values

### LSTM Model

- Trying softmax as final activation with sparse-categorical-crossentropy

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

embed_dim = 128
lstm_out = 196

# LSTM with a two-unit softmax head; sparse categorical cross-entropy lets it
# train directly on the integer 0/1 labels.
model = Sequential()
model.add(Embedding(top_words, embed_dim, input_length=max_words))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [None]:
# Fit the model
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=2, batch_size=128, verbose=2)
# Final evaluation of the model
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

- Model performance is similar to that of the previous CNN model.

- Updating model with sigmoid activation & loss as binary_crossentropy

In [None]:
model = Sequential()
model.add(Embedding(top_words, embed_dim, input_length=max_words))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.5, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Fit the model
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=2, batch_size=128, verbose=2)
# Final evaluation of the model
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

In [None]:
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10, batch_size=128, verbose=2)

In [None]:
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

In [None]:

# Same single-LSTM architecture with the recurrent dropout lowered to 0.01.
model = Sequential()
model.add(Embedding(top_words, embed_dim, input_length=max_words))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.5, recurrent_dropout=0.01))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [None]:
# Fit the model
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=2, batch_size=128, verbose=2)
# Final evaluation of the model
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

In [None]:
# Fit the model
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10, batch_size=128, verbose=2)
# Final evaluation of the model
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

In [None]:
'''
tokenizer = Tokenizer(num_words=top_words, split=' ')
def predictSentiments(twt):
    #vectorizing the tweet by the pre-fitted tokenizer instance

    twt = tokenizer.texts_to_sequences(twt)
    #padding the tweet to have exactly the same shape as `embedding_2` input
    #sequence.pad_sequences(x_train,maxlen=max_words,value = 0.0)
    twt = sequence.pad_sequences(twt, maxlen=max_words, value=0.0)
    print(twt)
    sentiment = model.predict(twt,batch_size=1,verbose = 2)[0]
    if(np.argmax(sentiment) == 0):
        return print("negative")
    elif (np.argmax(sentiment) == 1):
        return print("positive")
'''

#### Getting sentiment of custom review

In [None]:
import re



- Understanding the keys of the word index

In [None]:
index.keys()

In [None]:
len(index.keys())

In [None]:
index['this']

In [None]:
test_rev="this movie isnt worth a dime"
sample="this movie isnt worth a dime"

In [None]:
# Raw-string pattern ('\s' is an invalid escape in a plain literal). Splitting the
# untouched `sample` string (same text as the original test_rev) keeps this cell
# re-runnable: splitting test_rev in place fails once it has become a list.
test_rev = re.split(r'\s+', sample)

In [None]:
test_rev[0]

In [None]:
# Plain loop: a list comprehension used only for printing also echoes a list of None.
for token in test_rev:
    print(token)

In [None]:
# Plain loop: a list comprehension used only for printing also echoes a list of None.
for token in test_rev:
    print(index[token])

In [None]:
print(" -— Encode Test Review -— ")

index = imdb.get_word_index()
encoded=( [index[i] for i in test_rev] )
#encoded=np.array(encoded)
encoded

In [None]:
len(encoded)

In [None]:
x_train[0]

In [None]:
'''
def vectorize(sequences, dimension = max_words):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        print(i, sequence)
        results[i] = sequence
        print(results)
    return results
 
encoded = vectorize(encoded)
'''

In [None]:

import json,urllib.request

WORD_INDEX_URL = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json"
# The context manager closes the HTTP response instead of leaving it open.
with urllib.request.urlopen(WORD_INDEX_URL) as response:
    output = json.loads(response.read())
# Show the size and a small sample rather than dumping the whole mapping into the output.
print(len(output), "entries; sample:", list(output.items())[:10])

###### Function for preprocessing the text to make it a input for the model

In [None]:
index = imdb.get_word_index()
def preprocess_text(sentence):
    """Encode a raw review string with the id scheme `imdb.load_data` uses for training data.

    The text is lower-cased, every non-letter character is treated as a separator,
    and each remaining word is looked up in the global `index` mapping. Words that
    are missing from `index`, or whose id falls outside the `top_words` vocabulary,
    are encoded as the out-of-vocabulary id instead of raising KeyError.

    Parameters
    ----------
    sentence : str
        Free-text review.

    Returns
    -------
    array
        The result of `sequence.pad_sequences` for the single encoded review,
        i.e. one row of `max_words` ids.
    """
    # Id scheme of imdb.load_data with its default arguments:
    # 0 = padding, 1 = start of review, 2 = out-of-vocabulary, word rank r -> r + 3.
    start_id, oov_id, index_from = 1, 2, 3

    # Remove punctuation and numbers, then lower-case because the word index is
    # keyed by lower-case words (e.g. index['this']). split() collapses repeated
    # spaces and drops the empty strings that leading/trailing separators produce.
    words = re.sub(r'[^a-zA-Z]', ' ', sentence).lower().split()

    encoded = [start_id]
    for word in words:
        rank = index.get(word)
        if rank is None or rank + index_from >= top_words:
            encoded.append(oov_id)
        else:
            encoded.append(rank + index_from)

    encoded = sequence.pad_sequences([encoded], maxlen=max_words)
    print(encoded)
    return encoded

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import tokenizer_from_json

#tokenizer = tokenizer_from_json(json.dumps(output))


In [None]:
preprocess_text(sample)

###### Function for predicting the statement

In [None]:

def predictSentiments(twt):
    """Print the predicted sentiment ("negative" or "positive") of a raw review string.

    The text is encoded with `preprocess_text` and scored by the global `model`.
    Both output heads used in this notebook are handled: a single sigmoid unit
    (the value is the positive-class probability) and a two-unit softmax (the
    last value is the probability of label 1).

    Parameters
    ----------
    twt : str
        Free-text review.

    Returns
    -------
    None
        The encoded input, the raw prediction and the label are printed.
    """
    instance = preprocess_text(twt)
    print(instance)
    sentiment = model.predict(instance)
    print("Prediction:", sentiment)
    # predict() returns a 2-D array for the one-row batch. Comparing it directly
    # with 0.5 only works for a single value, so reduce it to one scalar first.
    positive_probability = float(np.ravel(sentiment)[-1])
    if positive_probability < 0.5:
        print("negative")
    else:
        print("positive")

In [None]:
predictSentiments("this movie isnt worth a dime")


In [None]:
predictSentiments("dont know what to say")

In [None]:
predictSentiments("this is a great piece")

##### Visualizing Embeddings

In [None]:
# with a Sequential model
import keras
# Backend function from the input of the model's first layer to the output of its
# second layer, built from whichever `model` is currently bound.
# NOTE(review): layers[1] is the layer after the embedding (SpatialDropout1D in the
# LSTM models defined above) — presumably identical to the embedding output at
# inference time because dropout is inactive; confirm.
# NOTE(review): `model` may have been built with tensorflow.keras while this uses the
# standalone `keras` backend — verify the two are compatible in the target environment.
get_embed_out = keras.backend.function(
    [model.layers[0].input],
    [model.layers[1].output])

In [None]:
layer_output = get_embed_out(x_test[0])
print(type(layer_output), len(layer_output), layer_output[0].shape)

In [None]:
words = layer_output[0]
plt.scatter(words[:,0], words[:,1])

In [None]:
words = get_embed_out([x_test[0]])[0]

plt.scatter(words[:,0], words[:,1])
for i, txt in enumerate(x_test[0]):
    plt.annotate(txt, (words[i,0], words[i,1]))

In [None]:
reverse_index = dict([(value, key) for (key, value) in index.items()]) 
decoded = " ".join( [reverse_index.get(i - 3, "#") for i in x_test[0]] )
print(decoded)

In [None]:
x_test[0]

- Trying to understand how model has grouped the words

In [None]:
print(reverse_index[435],reverse_index[1194],reverse_index[49])

In [None]:
print(reverse_index[681],reverse_index[7995])

In [None]:
print(reverse_index[241],reverse_index[112],reverse_index[72])

In [None]:
print(reverse_index[216],reverse_index[419])

In [None]:
import gc
gc.collect()

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

# Single LSTM followed by an extra hidden dense layer before the sigmoid output.
model = Sequential()
model.add(Embedding(top_words, embed_dim, input_length=max_words))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.5, recurrent_dropout=0.01))
model.add(Dense(lstm_out,activation='relu'))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [None]:
# Fit the model
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=2, batch_size=128, verbose=2)
# Final evaluation of the model
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

#### ADDING MORE LSTM LAYERS

In [None]:
# Three stacked LSTM layers with spatial dropout between them.
model = Sequential()
model.add(Embedding(top_words, embed_dim, input_length=max_words))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out,return_sequences=True, dropout=0.5, recurrent_dropout=0.01))#return sequence for stacking lstm
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out,return_sequences=True,dropout=0.3, recurrent_dropout=0.01))
model.add(SpatialDropout1D(0.4))
# The last LSTM returns only its final output, which feeds the dense head.
model.add(LSTM(lstm_out,dropout=0.3, recurrent_dropout=0.01))
model.add(Dense(lstm_out*2,activation='relu'))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [None]:
# Fit the model
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=2, batch_size=128, verbose=2)
# Final evaluation of the model
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

In [None]:
# Fit the model
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10, batch_size=128, verbose=2)
# Final evaluation of the model
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

In [None]:
predictSentiments("this movie isnt worth a dime")

In [None]:
predictSentiments("this is a great piece")

- We can observe that simply adding LSTM layers won't improve the model; we have to work carefully with the hyperparameters, dropout layers and input training data.