<a href="https://colab.research.google.com/github/nourkebbi/imbd-sentiment-analysis/blob/main/IMBD_sentiment_analysis_data_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installing all libraries

In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd

# For text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# For model building
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

Importing dataset

In [4]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews?dataset_version_number=1...


100%|██████████| 25.7M/25.7M [00:00<00:00, 156MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/versions/1


Read the csv file

In [5]:
df = pd.read_csv(f"{path}/IMDB Dataset.csv")

Break review column into tokens and clean the review column

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
df.review[3]

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

In [8]:
import re
import pandas as pd
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [9]:
import string,time
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
exclude = string.punctuation
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)
def remove_punc(text):
    for char in exclude:
        text = text.replace(char,'')
    return text
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [10]:
test = 'I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs.'


def preprocess_text(text):
    text = text.lower()
    text = remove_url(text)
    text = remove_punc(text)
    text = stem_words(text)
    text = nltk.word_tokenize(text)
    text = ' '.join(text)
    return text




In [11]:
df['review'] = df['review'].apply(preprocess_text)

Testing

In [12]:
df.review[3]

'basic there a famili where a littl boy jake think there a zombi in hi closet hi parent are fight all the timebr br thi movi is slower than a soap opera and suddenli jake decid to becom rambo and kill the zombiebr br ok first of all when your go to make a film you must decid if it a thriller or a drama as a drama the movi is watchabl parent are divorc argu like in real life and then we have jake with hi closet which total ruin all the film i expect to see a boogeyman similar movi and instead i watch a drama with some meaningless thriller spotsbr br 3 out of 10 just for the well play parent descent dialog as for the shot with jake just ignor them'

Tokenizing the data

In [13]:
tokenizer = Tokenizer(num_words=10000) #tokenizing number of words top 10000 into pieces
tokenizer.fit_on_texts(df['review'].values) #removing duplicates of tokens in review text
X = tokenizer.texts_to_sequences(df['review'].values) #assigning an Integer to each token

Pad sequences

In [14]:
X = pad_sequences(X, maxlen=100) #making all sequences are of the same length - here its 100

Converting positive/negative to int

In [15]:
y = df['sentiment'].replace({'positive': 1, 'negative': 0})

  y = df['sentiment'].replace({'positive': 1, 'negative': 0})


Create and compile model

Define model architecture

In [19]:
model = Sequential([  #model used for sentimental analysis and text classification
    Embedding(10000, 64), #classifies each word to 64 vectors
    LSTM(64), #long-short-term-memory understands the context behind the words
    Dense(1, activation='sigmoid') #classifies each word between 0 and 1 (negative or positive)
])

Compiling the model

In [20]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#compiles the models - loss is how much wrong the sentiment is - measures the difference between the output and the correct sentiment
#optimizer adam is the things that improves the quality of the output
#metrics: how much the data is correct with respect to the data from IMBD pos/neg

Train the model

Split data

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# x is the input, y is the output values
#20% of the data is used for validation and 80% for training
#random state - random num generator to ensure the split happens in the same way everytime

Train the model

In [25]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))
# x_train is input to be trained
# y_train is output to be learnt from
# epochs is 10 - number of times the model is trained
# batch size is the number of samples trained each time
# validation data is a function that monitors the model performance


Epoch 1/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 55ms/step - accuracy: 0.9913 - loss: 0.0305 - val_accuracy: 0.8434 - val_loss: 0.6281
Epoch 2/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 54ms/step - accuracy: 0.9934 - loss: 0.0239 - val_accuracy: 0.8513 - val_loss: 0.6927
Epoch 3/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 54ms/step - accuracy: 0.9942 - loss: 0.0220 - val_accuracy: 0.8524 - val_loss: 0.7550
Epoch 4/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 57ms/step - accuracy: 0.9962 - loss: 0.0149 - val_accuracy: 0.8534 - val_loss: 0.8322
Epoch 5/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 55ms/step - accuracy: 0.9946 - loss: 0.0187 - val_accuracy: 0.8527 - val_loss: 0.8055
Epoch 6/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 54ms/step - accuracy: 0.9956 - loss: 0.0144 - val_accuracy: 0.8486 - val_loss: 0.7789
Epoc

<keras.src.callbacks.history.History at 0x78cfc42ac040>

Evaluate data on Validation set

In [26]:
loss, accuracy = model.evaluate(X_val, y_val)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)
#using the trained data to evaluate the loss and the accuracy

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.8378 - loss: 0.8957
Test Loss: 0.8892940282821655
Test Accuracy: 0.8381999731063843


Making predictions

In [33]:
new_review = "This movie is omg!"

Tokenize and pad new text

In [34]:
new_seq = tokenizer.texts_to_sequences([new_review])
new_padded_seq = pad_sequences(new_seq, maxlen=100)

Make prediction

In [35]:
prediction = model.predict(new_padded_seq)
print(prediction)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[[1.1786375e-05]]
