In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import joblib
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the dataset
df = pd.read_csv('IMDB Dataset.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [5]:
# Text preprocessing function
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Convert text to lowercase
    text = text.lower()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

In [6]:
# Apply text cleaning to the reviews
df['cleaned_review'] = df['review'].apply(clean_text)


In [7]:
df.head()

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching oz episode yo...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...


In [8]:
# Encode the sentiment labels (positive -> 1, negative -> 0)
le = LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment'])

In [9]:
df.head()

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,1,one reviewers mentioned watching oz episode yo...
1,A wonderful little production. <br /><br />The...,1,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,0,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter matteis love time money visually stunni...


In [10]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_review'], df['sentiment'], test_size=0.2, random_state=42)

In [11]:
X_train.shape, X_test.shape

((40000,), (10000,))

In [12]:
# Define different embedding techniques

# 1. Bag of Words (CountVectorizer)
count_vectorizer = CountVectorizer()
X_train_bow = count_vectorizer.fit_transform(X_train)
X_test_bow = count_vectorizer.transform(X_test)
joblib.dump(count_vectorizer, 'count_vectorizer.joblib')

['count_vectorizer.joblib']

In [13]:
X_train_bow[0].toarray()

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [14]:
# 2. TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')

['tfidf_vectorizer.joblib']

In [15]:
X_train_tfidf[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [16]:
# Define and train different models

# 1. Naive Bayes with Bag of Words
nb_bow = MultinomialNB()
nb_bow.fit(X_train_bow, y_train)
y_pred_nb_bow = nb_bow.predict(X_test_bow)
print("Naive Bayes with Bag of Words Accuracy:", accuracy_score(y_test, y_pred_nb_bow))
print(classification_report(y_test, y_pred_nb_bow))
joblib.dump(nb_bow, 'nb_bow_model.joblib')

Naive Bayes with Bag of Words Accuracy: 0.8604
              precision    recall  f1-score   support

           0       0.85      0.88      0.86      4961
           1       0.87      0.84      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



['nb_bow_model.joblib']

In [17]:
# 2. Naive Bayes with TF-IDF
nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_tfidf, y_train)
y_pred_nb_tfidf = nb_tfidf.predict(X_test_tfidf)
print("Naive Bayes with TF-IDF Accuracy:", accuracy_score(y_test, y_pred_nb_tfidf))
print(classification_report(y_test, y_pred_nb_tfidf))
joblib.dump(nb_tfidf, 'nb_tfidf_model.joblib')

Naive Bayes with TF-IDF Accuracy: 0.8685
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      4961
           1       0.88      0.86      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



['nb_tfidf_model.joblib']

In [18]:
# 3. Logistic Regression with Bag of Words
lr_bow = LogisticRegression(max_iter=1000)
lr_bow.fit(X_train_bow, y_train)
y_pred_lr_bow = lr_bow.predict(X_test_bow)
print("Logistic Regression with Bag of Words Accuracy:", accuracy_score(y_test, y_pred_lr_bow))
print(classification_report(y_test, y_pred_lr_bow))
joblib.dump(lr_bow, 'lr_bow_model.joblib')

Logistic Regression with Bag of Words Accuracy: 0.8863
              precision    recall  f1-score   support

           0       0.89      0.88      0.88      4961
           1       0.88      0.89      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



['lr_bow_model.joblib']

In [19]:
# 4. Logistic Regression with TF-IDF
lr_tfidf = LogisticRegression(max_iter=1000)
lr_tfidf.fit(X_train_tfidf, y_train)
y_pred_lr_tfidf = lr_tfidf.predict(X_test_tfidf)
print("Logistic Regression with TF-IDF Accuracy:", accuracy_score(y_test, y_pred_lr_tfidf))
print(classification_report(y_test, y_pred_lr_tfidf))
joblib.dump(lr_tfidf, 'lr_tfidf_model.joblib')

Logistic Regression with TF-IDF Accuracy: 0.8958
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



['lr_tfidf_model.joblib']

In [21]:
# Comparing results
print("\nModel Performance Comparison:")
models = ['Naive Bayes (BoW)', 'Naive Bayes (TF-IDF)', 
          'Logistic Regression (BoW)', 'Logistic Regression (TF-IDF)']
accuracies = [accuracy_score(y_test, y_pred_nb_bow),
              accuracy_score(y_test, y_pred_nb_tfidf),
              accuracy_score(y_test, y_pred_lr_bow),
              accuracy_score(y_test, y_pred_lr_tfidf)
                ]


Model Performance Comparison:


In [22]:
for model, accuracy in zip(models, accuracies):
    print(f"{model}: {accuracy:.4f}")

Naive Bayes (BoW): 0.8604
Naive Bayes (TF-IDF): 0.8685
Logistic Regression (BoW): 0.8863
Logistic Regression (TF-IDF): 0.8958


In [27]:
import pickle as pkl
pkl.dump(lr_tfidf, open("sentiment_model.pkl", "wb"))