# **Sentiment Analysis on IMDB Reviews using LSTM**


```



In [1]:
!pip install nltk
!pip install keras
!pip install tensorflow



In [2]:
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
from nltk.corpus import stopwords   # to get collection of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import re

In [3]:
data = pd.read_csv('IMDB Dataset.csv', engine='python', on_bad_lines='skip')
print(data)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
19786  this movie is just great. if you have a chance...  positive
19787  Ever since seeing this film as a child, over 3...  positive
19788  Sadly a great opportunity to utilise a superb ...  negative
19789  The Beatles had just done 'Magical Mystery Tou...  positive
19790  An ultra-nervous old man, "Mr. Goodrich," terr...  positive

[19791 rows x 2 columns]


In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
english_stops = set(stopwords.words('english'))

In [7]:
def load_dataset():
    df = pd.read_csv('IMDB Dataset.csv', engine='python', on_bad_lines='skip')
    x_data = df['review']       # Reviews/Input
    y_data = df['sentiment']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case

    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Reviews
0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


  y_data = y_data.replace('negative', 0)


In [8]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
45550    [it, since, i, saw, movie, i, still, remember,...
18869    [went, watch, movie, expecting, nothing, reall...
852      [i, one, think, good, thing, i, never, rosalin...
49497    [steamboat, willie, amazingly, important, film...
43909    [a, yrs, ago, i, remember, reading, essay, fem...
                               ...                        
34654    [i, stumbled, across, film, channel, surfing, ...
2131     [even, longtime, shirley, fans, may, surprised...
43992    [i, barely, made, one, episode, crouch, end, t...
33519    [a, brilliant, chess, player, attends, tournam...
32561    [i, never, really, interested, cannibal, movie...
Name: review, Length: 40000, dtype: object 

5976     [oh, gosh, i, love, movie, soooooooooooooooooo...
6994     [this, second, experience, monkey, island, ser...
41354    [sloppily, directed, witless, comedy, supposed...
33969    [retro, puppet, master, complete, utter, crap,...
41385    [this, film, half, bad, it, may, little, long,...
 

In [9]:
def get_max_length():
  review_length = []
  for review in x_train:
    review_length.append(len(review))

  return int(np.ceil(np.mean(review_length)))


In [10]:
# ENCODE REVIEW
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[   7  138    1 ...    0    0    0]
 [ 320   34    3 ...    0    0    0]
 [   1    5   30 ...    0    0    0]
 ...
 [   1 1064   24 ...    0    0    0]
 [  39  418 4679 ...    0    0    0]
 [   1   40   15 ...    0    0    0]] 

Encoded X Test
 [[  342  8014     1 ...     0     0     0]
 [    8   242   459 ...     0     0     0]
 [16887   421 10160 ...     0     0     0]
 ...
 [  366    12  6432 ...     0     0     0]
 [ 1763 17195   206 ...     0     0     0]
 [  967   967   967 ...   316  4536  5573]] 

Maximum review length:  130


In [11]:
#architecture
embed_dim = 32
lstm_out = 64
model = Sequential()
model.add(Embedding(total_words, embed_dim, input_length = max_length))
model.add(LSTM(lstm_out))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())




None


In [12]:
checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor = 'accuracy',
    save_best_only = True,
    verbose = 1
)


In [13]:
model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks = [checkpoint])

Epoch 1/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 189ms/step - accuracy: 0.5795 - loss: 0.6546
Epoch 1: accuracy improved from -inf to 0.66325, saving model to models/LSTM.h5




[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 189ms/step - accuracy: 0.5798 - loss: 0.6544
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 191ms/step - accuracy: 0.7914 - loss: 0.5082
Epoch 2: accuracy improved from 0.66325 to 0.80020, saving model to models/LSTM.h5




[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 191ms/step - accuracy: 0.7914 - loss: 0.5082
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 186ms/step - accuracy: 0.7000 - loss: 0.5640
Epoch 3: accuracy did not improve from 0.80020
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 186ms/step - accuracy: 0.7000 - loss: 0.5640
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 187ms/step - accuracy: 0.6578 - loss: 0.6010
Epoch 4: accuracy did not improve from 0.80020
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 187ms/step - accuracy: 0.6577 - loss: 0.6012
Epoch 5/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 185ms/step - accuracy: 0.6711 - loss: 0.6111
Epoch 5: accuracy did not improve from 0.80020
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 185ms/step - accuracy: 0.6711 - loss: 0.6111


<keras.src.callbacks.history.History at 0x7c6198878830>

In [16]:
y_pred = model.predict(x_test, batch_size = 128)
y_pred = (y_pred > 0.5).astype(int) # Convert probabilities to binary predictions

true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 67ms/step
Correct Prediction: 7114
Wrong Prediction: 2886
Accuracy: 71.14


In [17]:
loaded_model = load_model('models/LSTM.h5')




In [22]:
review = str(input('Movie Review: '))


Movie Review: Overall movie was awesome but actors weren't upto the mark


In [23]:
# Pre-process input
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
print('Cleaned: ', review)

words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]

print('Filtered: ', filtered)

Cleaned:  Overall movie was awesome but actors werent upto the mark
Filtered:  ['overall movie awesome actors werent upto mark']


In [24]:
tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')
print(tokenize_words)

[[  350     3  1052    67 53075 28861   842     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0]]


In [27]:
result = loaded_model.predict(tokenize_words)
print(result)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[[0.5435723]]


In [26]:
if result >= 0.7:
  print('positive')
else:
  print('negative')


negative
