In [24]:
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
import nltk
from nltk.corpus import stopwords   # to get collection of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM,GRU, Dense,Dropout # layers of the architecture
import re

In [25]:
data = pd.read_csv('IMDB Dataset.csv')
print(data)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


In [26]:
nltk.download('stopwords')
english_stops = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
def load_dataset():
    df = pd.read_csv('IMDB Dataset.csv')
    x_data = df['review']       # Reviews/Input
    y_data = df['sentiment']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case

    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Reviews
0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


In [28]:
y_data.value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,25000
0,25000


In [29]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.25)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
23432    [be, warned, this, movie, mess, it, catastroph...
41131    [and, one, opinion, that, reason, margaret, le...
30151    [what, this, must, without, doubt, biggest, wa...
7049     [okay, first, good, thing, if, saw, trailer, k...
44613    [in, opening, scene, eye, patch, wearing, desp...
                               ...                        
29041    [i, remember, parents, understanding, saturday...
24188    [even, awful, much, film, everything, form, st...
10520    [a, slow, tedious, one, dimensional, movie, go...
21777    [i, saw, movie, years, ago, rather, fond, memo...
14711    [this, movie, could, great, many, unnecessary,...
Name: review, Length: 37500, dtype: object 

35920    [this, amazing, movie, actors, actresses, good...
37408    [my, dad, fan, columbo, i, always, disliked, s...
4829     [a, tight, knit, musical, family, cranky, bene...
6415     [so, bad, good, script, obvious, acting, poor,...
16540    [it, occurred, final, scene, movie, froze, rev...
 

In [30]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

In [31]:
# ENCODE REVIEW
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[2217 2621    8 ...   63  130    1]
 [  31    5  557 ...    0    0    0]
 [ 106    8  112 ...  234 2571  100]
 ...
 [  40  467 2227 ...    0    0    0]
 [   1  118    3 ...    0    0    0]
 [   8    3   27 ...    0    0    0]] 

Encoded X Test
 [[    8   398     3 ...     0     0     0]
 [  217  1145   232 ...   214   604    12]
 [   40  2585 12324 ...     0     0     0]
 ...
 [ 4436  3175   212 ...     0     0     0]
 [  767     1   118 ...     0     0     0]
 [  108   766   764 ...     0     0     0]] 

Maximum review length:  130


In [32]:
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(LSTM_OUT))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())



None


In [33]:
model.fit(x_train, y_train, batch_size = 128, epochs = 10)

Epoch 1/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - accuracy: 0.5253 - loss: 0.6870
Epoch 2/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.6488 - loss: 0.6502
Epoch 3/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.5020 - loss: 0.7011
Epoch 4/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.5047 - loss: 0.6961
Epoch 5/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.5184 - loss: 0.6902
Epoch 6/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 15ms/step - accuracy: 0.6164 - loss: 0.6434
Epoch 7/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.8733 - loss: 0.3374
Epoch 8/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.9385 - loss: 0.1837
Epoch 9/10
[1m293/293[0m [32m

<keras.src.callbacks.history.History at 0x7b8b02c4f760>

In [34]:
predict_x=model.predict(x_test)
y_pred=np.argmax(predict_x,axis=1)

true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step
Correct Prediction: 6256
Wrong Prediction: 6244
Accuracy: 50.048


In [35]:
predict_x_train=model.predict(x_train)
y_pred_train=np.argmax(predict_x_train,axis=1)

true = 0
for i, y in enumerate(y_train):
    if y == y_pred_train[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred_train) - true))
print('Accuracy: {}'.format(true/len(y_pred_train)*100))

[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step
Correct Prediction: 18744
Wrong Prediction: 18756
Accuracy: 49.984
