In [46]:
import pandas as pandas
import numpy as numpy


import warnings 
warnings.filterwarnings("ignore")

1) Importing dataset

In [47]:
data = pandas.read_csv("IMDB Dataset.csv")

2. Reviewing the data

In [48]:
data.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [49]:
# Get dataset shape
rows, cols = data.shape

# Get column names as a list
col_names = data.columns.tolist()

# Note down the type of outputs 
note = f"The dataset contains {rows} input{'s' if rows != 1 else ''} and {cols} column{'s' if cols != 1 else ''}: {', '.join(col_names)}."

# Print the note
print(note)

The dataset contains 50000 inputs and 2 columns: review, sentiment.


In [50]:
type(data)

pandas.core.frame.DataFrame

In [51]:
data.replace({"sentiment": {"positive":1, "negative":0}}, inplace=True)

data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


3.  Reviewing new data entries

In [52]:
data["sentiment"].value_counts()


sentiment
1    25000
0    25000
Name: count, dtype: int64

In [53]:
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0
49999,No one expects the Star Trek movies to be high...,0


In [54]:
from sklearn.model_selection import train_test_split 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



In [55]:
train_data,test_data = train_test_split(data, test_size= 0.2, random_state=42)

In [56]:
train_data.shape

(40000, 2)

In [57]:
test_data.shape

(10000, 2)

In [58]:
tokenizer = Tokenizer(num_words= 5000 )

test = tokenizer.fit_on_texts("I really love SIT I love you so much!!!!!")

print(test)

tokenizer.fit_on_texts(train_data["review"])

None


In [59]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train = pad_sequences(tokenizer.texts_to_sequences(train_data["review"]), maxlen= 200)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data["review"]), maxlen= 200)



In [60]:
Y_train = train_data["sentiment"]
Y_test = test_data["sentiment"]

Y_train


39087    0
30893    0
45278    1
16398    0
13653    0
        ..
11284    1
44732    1
38158    0
860      1
15795    1
Name: sentiment, Length: 40000, dtype: int64

BUILDING OUR LSTM MODEL

In [62]:
model = Sequential()

model.add(Embedding(input_dim=5000, output_dim=128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.build(input_shape=(None, 200))
model.summary()

Training our model

In [66]:
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

model.fit(X_train,Y_train,epochs=5,batch_size=64,validation_split=0.2)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 258ms/step - accuracy: 0.7654 - loss: 0.5043 - val_accuracy: 0.7041 - val_loss: 0.6113
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 348ms/step - accuracy: 0.8394 - loss: 0.3819 - val_accuracy: 0.8558 - val_loss: 0.3412
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 367ms/step - accuracy: 0.8640 - loss: 0.3336 - val_accuracy: 0.8541 - val_loss: 0.3480
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 406ms/step - accuracy: 0.8917 - loss: 0.2740 - val_accuracy: 0.8715 - val_loss: 0.3170
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 405ms/step - accuracy: 0.9016 - loss: 0.2522 - val_accuracy: 0.8740 - val_loss: 0.3157


<keras.src.callbacks.history.History at 0x144d7d0a770>

Optimizing my model

In [69]:
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

# Define EarlyStopping callback
es = EarlyStopping(
    monitor='val_loss',   # what to watch
    patience=3,           # stop after 3 epochs with no improvement
    restore_best_weights=True
)

# Compile model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Train model with callback
model.fit(
    X_train, Y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    callbacks=[es]
)


Epoch 1/20
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 147ms/step - accuracy: 0.9143 - loss: 0.2187 - val_accuracy: 0.8771 - val_loss: 0.3086
Epoch 2/20
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 146ms/step - accuracy: 0.9230 - loss: 0.1989 - val_accuracy: 0.8687 - val_loss: 0.3506
Epoch 3/20
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 147ms/step - accuracy: 0.9355 - loss: 0.1722 - val_accuracy: 0.8717 - val_loss: 0.3440
Epoch 4/20
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 152ms/step - accuracy: 0.9404 - loss: 0.1585 - val_accuracy: 0.8746 - val_loss: 0.3506


<keras.src.callbacks.history.History at 0x144d7d0b160>

In [71]:
model.save("model.h5")



In [75]:
import joblib
joblib.dump(tokenizer, "tokenizer.pkl")

['tokenizer.pkl']

In [72]:
loss,accuracy = model.evaluate(X_test,Y_test)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 21ms/step - accuracy: 0.8828 - loss: 0.2982


In [74]:
print(loss)
print(accuracy)

0.2981775104999542
0.8827999830245972


Building the predictive system


In [120]:
def predictive_system(review):
   sequences = tokenizer.texts_to_sequences([review])
   padded_sequence = pad_sequences(sequences, maxlen=200)
   prediction = model.predict(padded_sequence)
   sentiment = "positive" if prediction[0][0]>0.5 else "negative"
   return sentiment

In [127]:
predictive_system("i fucking love my school SIT ")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step


'positive'