In [1]:
import os
import pandas as pd
import numpy as np

#tensorflow modules
!pip install tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN, LSTM, Dropout, BatchNormalization, Flatten
from tensorflow.keras.models import Sequential 

#text preprocessing modules
from gensim.parsing.preprocessing import remove_stopwords
from gensim.models import Word2Vec
import re
import nltk
nltk.download('punkt')

import warnings
# Ignore all warnings
warnings.filterwarnings("ignore")



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nidhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df=pd.read_csv('IMDB Dataset.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.shape

(50000, 2)

In [5]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [6]:
#text cleaning
 #remove html tags
cleaned=[]
for r in df['review']:
    r=re.sub(r'<.*?>',"",r)
    r=re.sub(r'\S+@\S+\s','',r)
    r=re.sub(r'\d+','',r)
    r=re.sub(r'[#$!\*\)\(\\%:;,\'_-]','',r)
    cleaned.append(r)

In [7]:
cleaned[0:3]  

['One of the other reviewers has mentioned that after watching just  Oz episode youll be hooked. They are right as this is exactly what happened with me.The first thing that struck me about Oz was its brutality and unflinching scenes of violence which set in right from the word GO. Trust me this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs sex or violence. Its is hardcore in the classic use of the word.It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda. Em City is home to many..Aryans Muslims gangstas Latinos Christians Italians Irish and more....so scuffles death stares dodgy dealings and shady agreements are never far away.I would say the main appeal of the show is due to the fact that it goes where other shows wouldnt dare. Forg

In [8]:
len(cleaned)

50000

In [9]:
#removing stop words using gensim library function
text = [remove_stopwords(sentence) for sentence in cleaned]

In [11]:
text[0:3]

['One reviewers mentioned watching Oz episode youll hooked. They right exactly happened me.The thing struck Oz brutality unflinching scenes violence set right word GO. Trust faint hearted timid. This pulls punches regards drugs sex violence. Its hardcore classic use word.It called OZ nickname given Oswald Maximum Security State Penitentary. It focuses mainly Emerald City experimental section prison cells glass fronts face inwards privacy high agenda. Em City home many..Aryans Muslims gangstas Latinos Christians Italians Irish more....so scuffles death stares dodgy dealings shady agreements far away.I main appeal fact goes shows wouldnt dare. Forget pretty pictures painted mainstream audiences forget charm forget romance...OZ doesnt mess around. The episode I saw struck nasty surreal I I ready I watched I developed taste Oz got accustomed high levels graphic violence. Not violence injustice crooked guards wholl sold nickel inmates wholl kill order away mannered middle class inmates turn

In [12]:
len(text[0])

1126

In [13]:
#tokenisation
tokenizer = Tokenizer(num_words=8000, lower=True, oov_token='<OOV>')

In [14]:
#before applying tokenizer, split data
train_text = text[:len(text)-5000]
test_text=text[-5000:]
print(len(text), len(train_text), len(test_text))

50000 45000 5000


In [15]:
tokenizer.fit_on_texts(train_text)

In [16]:
tokenizer.word_index

{'<OOV>': 1,
 'i': 2,
 'the': 3,
 'movie': 4,
 'film': 5,
 'it': 6,
 'like': 7,
 'this': 8,
 'good': 9,
 'time': 10,
 'story': 11,
 'bad': 12,
 'great': 13,
 'people': 14,
 'dont': 15,
 'movies': 16,
 'and': 17,
 'films': 18,
 'way': 19,
 'but': 20,
 'characters': 21,
 'think': 22,
 'watch': 23,
 'its': 24,
 'in': 25,
 'a': 26,
 'seen': 27,
 'character': 28,
 'love': 29,
 'plot': 30,
 'acting': 31,
 'best': 32,
 'know': 33,
 'little': 34,
 'life': 35,
 'there': 36,
 'better': 37,
 'if': 38,
 'end': 39,
 'scene': 40,
 'man': 41,
 'scenes': 42,
 's': 43,
 'he': 44,
 'im': 45,
 'real': 46,
 'watching': 47,
 'thing': 48,
 'actors': 49,
 'doesnt': 50,
 'didnt': 51,
 'years': 52,
 'funny': 53,
 'actually': 54,
 'makes': 55,
 'work': 56,
 'look': 57,
 'director': 58,
 'going': 59,
 'one': 60,
 'lot': 61,
 'you': 62,
 'new': 63,
 'old': 64,
 'thats': 65,
 'things': 66,
 'want': 67,
 'cast': 68,
 'pretty': 69,
 'all': 70,
 'world': 71,
 'young': 72,
 'horror': 73,
 'got': 74,
 'fact': 75,
 'wha

In [17]:
word_counts = tokenizer.word_counts

In [18]:
print(word_counts['great'])

16197


In [19]:
print(word_counts['movie'])

77335


In [20]:
len(tokenizer.word_index)

141695

GloVe Embeddings

In [21]:
#get embbedings for these words
#glove vectors
#dictionary with word and corresponding word vector as key, value pair
embeddings_index = dict()
glove=open('glove.6B.100d.txt','r',encoding='utf-8')
for line in glove:
  values=line.split()
  word=values[0]
  coefs=np.asarray(values[1:],dtype='float32')
  embeddings_index[word]=coefs

glove.close()

In [22]:
len(embeddings_index)

400000

In [23]:
coefs.shape

(100,)

In [24]:
len(coefs)

100

In [25]:
vocab=len(tokenizer.word_index)+1
embedding_matrix = np.zeros((vocab,100))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [26]:
embedding_matrix.shape

(141696, 100)

RNN with GloVe embeddings

In [27]:
train_indices=tokenizer.texts_to_sequences(train_text)

In [28]:
train_indices[0:2]

[[60,
  1826,
  913,
  47,
  2842,
  256,
  321,
  3021,
  88,
  84,
  460,
  436,
  202,
  3,
  48,
  3174,
  2842,
  5290,
  1,
  42,
  434,
  150,
  84,
  523,
  888,
  1564,
  1,
  5571,
  1,
  8,
  2319,
  5684,
  5229,
  1300,
  248,
  434,
  24,
  3278,
  213,
  211,
  523,
  6,
  336,
  2842,
  1,
  215,
  1,
  6603,
  2454,
  928,
  1,
  6,
  2532,
  1260,
  1,
  390,
  4621,
  2325,
  1028,
  6490,
  2809,
  1,
  259,
  1,
  1,
  188,
  4796,
  3405,
  390,
  205,
  981,
  1,
  7808,
  1,
  1,
  4836,
  7744,
  2282,
  500,
  83,
  1,
  191,
  1,
  7022,
  1,
  1,
  1,
  106,
  113,
  2,
  143,
  1144,
  75,
  133,
  128,
  417,
  2979,
  666,
  69,
  1118,
  4262,
  2371,
  923,
  666,
  1265,
  666,
  717,
  2842,
  50,
  796,
  1563,
  3,
  256,
  2,
  93,
  3174,
  1426,
  2083,
  2,
  2,
  1416,
  2,
  146,
  2,
  1349,
  1123,
  2842,
  74,
  1,
  188,
  1912,
  2038,
  434,
  82,
  434,
  7328,
  6794,
  4717,
  1,
  2818,
  1,
  6433,
  1,
  360,
  472,
  113,
  1,
  

In [29]:
#fixing sentence size to 80. Padding if lesser, truncating if more
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_length = 80
train_indices=pad_sequences(train_indices, maxlen=max_length, padding='post')

In [30]:
train_indices[0:2]

array([[ 106,  113,    2,  143, 1144,   75,  133,  128,  417, 2979,  666,
          69, 1118, 4262, 2371,  923,  666, 1265,  666,  717, 2842,   50,
         796, 1563,    3,  256,    2,   93, 3174, 1426, 2083,    2,    2,
        1416,    2,  146,    2, 1349, 1123, 2842,   74,    1,  188, 1912,
        2038,  434,   82,  434, 7328, 6794, 4717,    1, 2818,    1, 6433,
           1,  360,  472,  113,    1,  619,  659, 6433,  518, 1028,    1,
         414,  766, 1847, 1028,  401,   47, 2842, 3545, 3103,  646,   65,
        1043, 3928, 2433],
       [ 221,    3, 1187, 2765,    1,    1, 1467,  262,    1,    1,  145,
        1714,  297,  268,    3,   49,  408, 2140,  361, 4314, 2372,   74,
           1, 2088, 3175,   62,  223,    1,  638,    1, 1683, 1569, 7088,
        6340,  140,   47,    1,  296, 2230,  268,   26, 4263,  221,   13,
        3089,   86,   35,    3, 1714,  135,  205,   34,   66,  897, 2766,
         211, 2028,  840, 2945, 1099, 1012, 4837,    6,  158, 1706, 4241,
         43

In [31]:
len(train_indices)

45000

In [32]:
#creating model architecture
RNNmodel = Sequential()
RNNmodel.add(Embedding(input_dim=vocab,output_dim=100,input_length=max_length, weights=[embedding_matrix],trainable=False))
RNNmodel.add(SimpleRNN(32))

RNNmodel.add(Dense(1,activation='sigmoid'))
RNNmodel.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(RNNmodel.summary())


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 80, 100)           14169600  
                                                                 
 simple_rnn (SimpleRNN)      (None, 32)                4256      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 14,173,889
Trainable params: 4,289
Non-trainable params: 14,169,600
_________________________________________________________________
None


In [33]:
df['sentiment'].replace(['positive','negative'],[1,0],inplace=True)
df['sentiment'].value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

In [34]:
Y=list(df['sentiment'])
train_y=Y[:len(Y)-5000]
test_y=Y[-5000:]
print(len(train_y),len(test_y))

45000 5000


In [35]:
train_indices=np.asarray(train_indices)
train_y=np.asarray(train_y)

In [36]:
len(train_indices)

45000

In [37]:
len(train_y)

45000

In [38]:
RNNmodel.fit(train_indices,train_y,batch_size=32,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x22d074dd730>

In [39]:
test_indices=tokenizer.texts_to_sequences(test_text)
max_length=80
test_indices=pad_sequences(test_indices, maxlen=max_length, padding='post')
test_indices=np.asarray(test_indices)
test_y=np.asarray(test_y)

In [40]:
scores = RNNmodel.evaluate(test_indices, test_y, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 69.36%


In [41]:
scores

[0.6168180704116821, 0.6935999989509583]

LSTM with GloVe embeddings

In [57]:
#creating model architecture
LSTMmodel = Sequential()
LSTMmodel.add(Embedding(input_dim=vocab,output_dim=100,input_length=max_length, weights=[embedding_matrix],trainable=False))
LSTMmodel.add(LSTM(32))

LSTMmodel.add(Dense(1,activation='sigmoid'))
LSTMmodel.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(LSTMmodel.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 80, 100)           14169600  
                                                                 
 lstm_1 (LSTM)               (None, 32)                17024     
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 14,186,657
Trainable params: 17,057
Non-trainable params: 14,169,600
_________________________________________________________________
None


In [58]:
LSTMmodel.fit(train_indices,train_y,batch_size=32,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x22f8d1c9be0>

In [59]:
scores = LSTMmodel.evaluate(test_indices, test_y, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 85.74%


In [60]:
scores

[0.337950199842453, 0.8574000000953674]