In [23]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
import pandas as pd


file_path = '/content/drive/MyDrive/IMDB Dataset.csv' # IMPORTANT: Update this path if the file is in a different location in your Drive
df = pd.read_csv(file_path)

print(f"Successfully loaded data from: {file_path}")
display(df.head())

Successfully loaded data from: /content/drive/MyDrive/IMDB Dataset.csv


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [25]:
sentiment_mapping = {'positive': 1, 'negative': 0}
df['sentiment'] = df['sentiment'].map(sentiment_mapping)

print("Label encoding applied to 'sentiment' column:")
display(df.head())

Label encoding applied to 'sentiment' column:


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [26]:
# Install scikit-learn if not already installed (uncomment and run if needed)
# !pip install scikit-learn

In [27]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np


# Extract text reviews and labels from the DataFrame
texts = df['review']
labels = df['sentiment'].values

print(f"Number of reviews: {len(texts)}")
print(f"Number of labels: {len(labels)}")

Number of reviews: 50000
Number of labels: 50000


In [28]:
# 2. Bag of Words (CountVectorizer)
bow_vectorizer = CountVectorizer(lowercase=True, stop_words="english")
X_bow = bow_vectorizer.fit_transform(texts)

print("BoW shape:", X_bow.shape)
print("Sample vocabulary (first 20):", list(bow_vectorizer.vocabulary_.keys())[:20])

BoW shape: (50000, 101583)
Sample vocabulary (first 20): ['reviewers', 'mentioned', 'watching', 'just', 'oz', 'episode', 'll', 'hooked', 'right', 'exactly', 'happened', 'br', 'thing', 'struck', 'brutality', 'unflinching', 'scenes', 'violence', 'set', 'word']


In [29]:
# 3. TF-IDF representation
tfidf_vectorizer = TfidfVectorizer(lowercase=True, stop_words="english")
X_tfidf = tfidf_vectorizer.fit_transform(texts)

print("TF-IDF shape:", X_tfidf.shape)
feature_names = tfidf_vectorizer.get_feature_names_out()
print("Example terms (first 15):", feature_names[:15])

TF-IDF shape: (50000, 101583)
Example terms (first 15): ['00' '000' '00000000000' '0000000000001' '00000001' '00001' '00015'
 '000dm' '000s' '001' '003830' '006' '0069' '007' '0079']


In [30]:
# 4. Train/test split
X_bow_train, X_bow_test, y_train, y_test = train_test_split(
    X_bow, labels, test_size=0.3, random_state=42, stratify=labels
)

X_tfidf_train, X_tfidf_test, _, _ = train_test_split(
    X_tfidf, labels, test_size=0.3, random_state=42, stratify=labels
)

# 4a. Logistic Regression with BoW
bow_clf = LogisticRegression(max_iter=1000)
bow_clf.fit(X_bow_train, y_train)

y_pred_bow = bow_clf.predict(X_bow_test)

print("\n=== BoW + Logistic Regression ===")
print(classification_report(y_test, y_pred_bow, target_names=["negative","positive"]))
print("Confusion matrix (BoW):\n", confusion_matrix(y_test, y_pred_bow))

# 4b. Logistic Regression with TF-IDF
tfidf_clf = LogisticRegression(max_iter=1000)
tfidf_clf.fit(X_tfidf_train, y_train)

y_pred_tfidf = tfidf_clf.predict(X_tfidf_test)

print("\n=== TF-IDF + Logistic Regression ===")
print(classification_report(y_test, y_pred_tfidf, target_names=["negative","positive"]))
print("Confusion matrix (TF-IDF):\n", confusion_matrix(y_test, y_pred_tfidf))


=== BoW + Logistic Regression ===
              precision    recall  f1-score   support

    negative       0.89      0.89      0.89      7500
    positive       0.89      0.89      0.89      7500

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000

Confusion matrix (BoW):
 [[6640  860]
 [ 825 6675]]

=== TF-IDF + Logistic Regression ===
              precision    recall  f1-score   support

    negative       0.90      0.88      0.89      7500
    positive       0.89      0.91      0.90      7500

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000

Confusion matrix (TF-IDF):
 [[6637  863]
 [ 706 6794]]


In [32]:
# 5. Using the trained TF-IDF model
new_reviews = [
    "Battery life is horrible and the phone is slow",
    "Fantastic device, I am very satisfied with the performance"
]

# Vectorize with the same fitted TF-IDF vectorizer
new_X = tfidf_vectorizer.transform(new_reviews)
new_preds = tfidf_clf.predict(new_X)
new_probs = tfidf_clf.predict_proba(new_X)

for review, label, prob in zip(new_reviews, new_preds, new_probs):
    sentiment = "positive" if label == 1 else "negative"
    print(f"\nReview: {review}")
    print(f"Predicted sentiment: {sentiment} (prob={prob[label]:.2f})")


Review: Battery life is horrible and the phone is slow
Predicted sentiment: negative (prob=0.87)

Review: Fantastic device, I am very satisfied with the performance
Predicted sentiment: positive (prob=0.92)
