In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
from nltk.corpus import stopwords   # to get collection of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import re

<hr>
<i>Preview dataset</i>

In [3]:
data = pd.read_csv('/content/drive/MyDrive/LSTM/IMDB Dataset.csv')

print(data)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]



<i>Declaring the english stop words</i>

In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
english_stops = set(stopwords.words('english'))

<hr>

### Load and Clean Dataset


### Encode Sentiments
In the same function, I also encode the sentiments into integers (0 and 1). Where 0 is for negative sentiments and 1 is for positive sentiments.

In [6]:
def load_dataset():
    df = pd.read_csv('/content/drive/MyDrive/LSTM/IMDB Dataset.csv')
    x_data = df['review']       # Reviews/Input
    y_data = df['sentiment']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case

    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Reviews
0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


<hr>

### Split Dataset


In [7]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
3099     [crossfire, remembered, much, fact, three, sta...
37853    [this, story, long, awkward, love, the, daily,...
16752    [i, vague, memories, movie, funny, having, see...
2840     [star, rating, saturday, night, friday, night,...
11770    [this, film, exactly, title, describes, attemp...
                               ...                        
39388    [i, understand, everyone, hates, movie, aside,...
4804     [escanaba, da, moonlight, first, showcasing, j...
32250    [loved, story, guy, tries, get, girl, back, do...
17967    [the, pre, release, version, baby, face, would...
17817    [the, scottish, artist, andy, goldsworthy, fas...
Name: review, Length: 40000, dtype: object 

36857    [you, expect, movie, like, good, it, budget, u...
45187    [as, others, mentioned, movie, similar, the, f...
39528    [this, film, late, night, i, saw, it, interest...
3108     [i, always, wanted, david, duchovney, go, movi...
12744    [this, horrible, movie, all, three, stories, b...
 

<hr>
<i>Function for getting the maximum review length, by calculating the mean of all the reviews length (using <b>numpy.mean</b>)</i>

In [8]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

<hr>

### Tokenize and Pad/Truncate Reviews



In [9]:
# ENCODE REVIEW
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[8743 1923   17 ... 1592  183  275]
 [   8   13  100 ...   11  954   73]
 [   1 3475 1742 ...    0    0    0]
 ...
 [ 346   13  116 ...    0    0    0]
 [   2 1725  656 ... 4681 5319 4616]
 [   2 4450 1621 ...   16 3183 3789]] 

Encoded X Test
 [[ 103  433    3 ...    0    0    0]
 [ 108  304  959 ...   52 5189 4541]
 [   8    4  430 ...    0    0    0]
 ...
 [   2   23  482 ...    0    0    0]
 [ 333   47    4 ...    0    0    0]
 [   1   47  152 ...    0    0    0]] 

Maximum review length:  130


<hr>

### Build Architecture/Model


In [10]:
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 130, 32)           2953664   
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 2978561 (11.36 MB)
Trainable params: 2978561 (11.36 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


<hr>

### Training


In [11]:
checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [12]:
model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])

Epoch 1/5
Epoch 1: accuracy improved from -inf to 0.74145, saving model to models/LSTM.h5
Epoch 2/5


  saving_api.save_model(


Epoch 2: accuracy improved from 0.74145 to 0.92325, saving model to models/LSTM.h5
Epoch 3/5
Epoch 3: accuracy improved from 0.92325 to 0.96038, saving model to models/LSTM.h5
Epoch 4/5
Epoch 4: accuracy improved from 0.96038 to 0.97943, saving model to models/LSTM.h5
Epoch 5/5
Epoch 5: accuracy improved from 0.97943 to 0.98563, saving model to models/LSTM.h5


<keras.src.callbacks.History at 0x7ad645bdf070>

<hr>

### Testing


In [13]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten

In [14]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 130, 32)           2953664   
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 2978561 (11.36 MB)
Trainable params: 2978561 (11.36 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [15]:
y_pred = model.predict(x_test, batch_size = 128).argmax(axis=1)



---

### Load Saved Model

Load saved model and use it to predict a movie review statement's sentiment (positive or negative).

In [16]:
loaded_model = load_model('models/LSTM.h5')

Receives a review as an input to be predicted

In [17]:
review = str(input('Movie Review: '))

Movie Review:  "Bahubali" fails to live up to its hype, offering little beyond flashy visuals and exaggerated action sequences. The film's plot is predictable and riddled with clichés, relying heavily on tired tropes of revenge and royalty. Character development is lacking, with one-dimensional protagonists and antagonists that fail to evoke empathy or interest. The dialogue is often cringe-worthy, filled with melodramatic speeches and wooden delivery. Despite its epic scale, the film struggles to maintain momentum, bogged down by unnecessary subplots and prolonged fight scenes. The female characters are particularly disappointing, reduced to mere props in a predominantly male-centric narrative. Overall, "Bahubali" is a superficial spectacle that prioritizes style over substance, ultimately leaving audiences unsatisfied and underwhelmed.


The input must be pre processed before it is passed to the model to be predicted

In [22]:
# Pre-process input
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
print('Cleaned: ', review)

words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]

print('Filtered: ', filtered)

Cleaned:   Bahubali fails to live up to its hype offering little beyond flashy visuals and exaggerated action sequences The films plot is predictable and riddled with clichs relying heavily on tired tropes of revenge and royalty Character development is lacking with onedimensional protagonists and antagonists that fail to evoke empathy or interest The dialogue is often cringeworthy filled with melodramatic speeches and wooden delivery Despite its epic scale the film struggles to maintain momentum bogged down by unnecessary subplots and prolonged fight scenes The female characters are particularly disappointing reduced to mere props in a predominantly malecentric narrative Overall Bahubali is a superficial spectacle that prioritizes style over substance ultimately leaving audiences unsatisfied and underwhelmed
Filtered:  [' bahubali fails live hype offering little beyond flashy visuals exaggerated action sequences the films plot predictable riddled clichs relying heavily tired tropes re

In [23]:
tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')
print(tokenize_words)

[[  905   323  3338  3854    48   576  6277  2026  3991   110   737     2
     35    40   619 10722  8350  2517  1342 19216  1070 12681    34   898
   1763  3132 15285  1838  8095  5524   521     2   319   310 18223   950
   3351  5546  1506  2689   372  1442  2233     4  2920  4325  7396 13435
   1705  4390  9216   456    60     2   548    29   478  1250  3992  2715
   4297 15624  1297   350  3791  5809   309  2285  1088  1111  1120 12248
  23102     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0]]


This is the result of the prediction which shows the **confidence score** of the review statement.

In [24]:
result = loaded_model.predict(tokenize_words)
print(result)

[[0.00170819]]


If the confidence score is close to 0, then the statement is **negative**. On the other hand, if the confidence score is close to 1, then the statement is **positive**.here consider threshold of **0.7** to determine which confidence score is positive and negative, so if it is equal or greater than 0.7, it is **positive** and if it is less than 0.7, it is **negative**

In [25]:
if result >= 0.7:
    print('positive')
else:
    print('negative')

negative
