SENTIMENT ANALYSIS WITH LSTM ON IMBD REVIEW DATASET

The dataset contains 50k records and is balanced with 25k positive feedback and 25k negative feedback

In [103]:
# importing python library packages for the analysis

import re
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import keras
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import math
import nltk

In [104]:
# reading dataset with pandas

df = pd.read_csv('imbd_dataset.csv')
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [105]:
# analysing the dataset

df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [106]:
df.shape

(50000, 2)

In [107]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [108]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [109]:
df.isna().any()

review       False
sentiment    False
dtype: bool

In [116]:
# processing and manipulating data to remove unwanted HTML tags, URLs etc using remove_tags function and Regex functions

def remove_tags(string):
    result = re.sub('<.*?>', '', string)                                                                               # remove HTML tags
    result = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', result, flags=re.MULTILINE)                    # remove URLs
    result = re.sub(r'[^A-Za-z0-9\s.]', r'', result)                                                                   # remove non-alphanumeric characters
    result = result.lower()
    return result
df['review'] = df['review'].apply(lambda x: remove_tags(x))

# removing stopwords

stop_words = set(stopwords.words('english'))
df['review'] = df['review'].apply(lambda x: ''.join([word for word in x.split() if word not in (stop_words)]))
df

Unnamed: 0,review,sentiment
0,onereviewersmentionedwatching1ozepisodeyoullho...,positive
1,wonderfullittleproduction.filmingtechniqueunas...,positive
2,thoughtwonderfulwayspendtimehotsummerweekendsi...,positive
3,basicallytheresfamilylittleboyjakethinkstheres...,negative
4,pettermatteislovetimemoneyvisuallystunningfilm...,positive
...,...,...
49995,thoughtmovierightgoodjob.wasntcreativeoriginal...,positive
49996,badplotbaddialoguebadactingidioticdirectingann...,negative
49997,catholictaughtparochialelementaryschoolsnunsta...,negative
49998,imgoingdisagreepreviouscommentsidemaltinone.se...,negative


In [118]:
# text lemmatization

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    st = ""
    for w in w_tokenizer.tokenize(text):
        st = st + lemmatizer.lemmatize(w) + ""
    return st
df['review'] = df.review.apply(lemmatize_text)

In [119]:
# checking if dataset is balanced before modeling

s = 0
for i in df['review']:
    word_list = i.split()
    s = s + len(word_list)
print("Avg length of each review: ",s/df.shape[0])

pos = 0
for i in range(df.shape[0]):
    if df.iloc[i]['sentiment'] == 'positive':
        pos = pos + 1
neg = df.shape[0] - pos
print("Positive rviews in percentage: ",str(pos/df.shape[0]*100),"%")
print("Negative rviews in percentage: ",str(neg/df.shape[0]*100),"%")

Avg length of each review:  1.0
Positive rviews in percentage:  50.0 %
Negative rviews in percentage:  50.0 %


In [121]:
# converting labels 'positive' and 'negative' into 1 and 0 respectively using LabelEncoder()

reviews = df['review'].values
labels = df['sentiment'].values
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels) 

# spliting dataset to train and test parts

train_reviews, test_reviews, train_labels, test_labels = train_test_split(reviews, encoded_labels, stratify = encoded_labels) 

# data padding and tokenizing before feeding the data to LSTM model

vocab_size = 3000                                                           # hyperparameters of the model
oov_tok = ''
embedding_dim = 100
max_length = 200
padding_type = 'post'
trucn_type = 'post'

tokenizer = Tokenizer(num_words = vocab_size, oov_token = oov_tok)          # tokenize reviews
tokenizer.fit_on_texts(train_reviews)
word_index = tokenizer.word_index

train_reviews = tokenizer.texts_to_sequences(train_reviews)                 # convert train dataset to sequence and pad sequences
train_padded = pad_sequences(train_reviews, padding ='post', maxlen=max_length)

test_reviews = tokenizer.texts_to_sequences(test_reviews)                   # convert test dataset to sequence and pad sequences
test_padded = pad_sequences(test_reviews, padding='post', maxlen=max_length)

In [122]:
# model building

model = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dense(24, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

# model compilation

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 200, 100)          300000    
                                                                 
 bidirectional_2 (Bidirectio  (None, 128)              84480     
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 24)                3096      
                                                                 
 dense_5 (Dense)             (None, 1)                 25        
                                                                 
Total params: 387,601
Trainable params: 387,601
Non-trainable params: 0
_________________________________________________________________


In [115]:
# model training and evaluation

num_epochs = 5
history = model.fit(train_padded, train_labels,
                    epochs=num_epochs, verbose=1,
                    validation_split=0.1)

prediction = model.predict(test_padded)
pred_labels = []                               # labels based on probability 1 if p >= 0.5 else 0
for i in prediction:
    if i >= 0.5:
        pred_labels.append(1)
    else:
        pred_labels.append(0)
print(pred_labels)
print("Accuracy of prediction on test set: ", accuracy_score(test_labels,pred_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0

Hence, the analysis predicts on test set = 58% 