In [1]:
import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
from nltk.corpus import movie_reviews
import random
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re

In [2]:
# Download NLTK data
nltk.download('movie_reviews')
nltk.download('punkt_tab')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\peter\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\peter\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# Define tokenizer function with stemming and cleaning
def tokenize_and_stem(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove usernames
    text = re.sub(r'@\w+', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Stem
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in tokens]

In [4]:
# Prepare the dataset
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

# Extract text and labels
texts = [' '.join(words) for words, label in documents]
labels = [label for words, label in documents]

print(f"Total documents: {len(texts)}")
print(f"Positive reviews: {labels.count('pos')}")
print(f"Negative reviews: {labels.count('neg')}")

Total documents: 2000
Positive reviews: 1000
Negative reviews: 1000


In [5]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

Training set size: 1600
Test set size: 400


In [6]:
# Feature extraction with TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', tokenizer=tokenize_and_stem)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print(f"TF-IDF feature shape: {X_train_tfidf.shape}")



TF-IDF feature shape: (1600, 5000)


In [7]:
# Model construction
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)

print("Model trained successfully.")

Model trained successfully.


In [8]:
# Prediction and evaluation
y_pred = nb_classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.7875

Classification Report:
              precision    recall  f1-score   support

         neg       0.74      0.86      0.80       195
         pos       0.84      0.72      0.78       205

    accuracy                           0.79       400
   macro avg       0.79      0.79      0.79       400
weighted avg       0.80      0.79      0.79       400


Confusion Matrix:
[[168  27]
 [ 58 147]]


In [9]:
# Prediction on new queries
new_reviews = [
    "This movie was absolutely fantastic! The acting was superb and the plot kept me engaged throughout.",
    "I hated this film. It was boring, the characters were poorly developed, and the ending was disappointing.",
    "An average movie with some good moments but overall nothing special.",
    "One of the best films I've seen this year. Highly recommend it!"
]

new_reviews_tfidf = vectorizer.transform(new_reviews)
new_predictions = nb_classifier.predict(new_reviews_tfidf)

for review, pred in zip(new_reviews, new_predictions):
    print(f"Review: {review[:50]}...")
    print(f"Predicted sentiment: {pred}")
    print()

Review: This movie was absolutely fantastic! The acting wa...
Predicted sentiment: pos

Review: I hated this film. It was boring, the characters w...
Predicted sentiment: neg

Review: An average movie with some good moments but overal...
Predicted sentiment: pos

Review: One of the best films I've seen this year. Highly ...
Predicted sentiment: pos



In [10]:
# Results and analysis
print("### Results and Analysis ###")

# Example predictions (already done above)

# Analysis of misclassified samples
misclassified_indices = [i for i, (true, pred) in enumerate(zip(y_test, y_pred)) if true != pred]
print(f"\nNumber of misclassified samples: {len(misclassified_indices)}")

print("\nExample misclassified samples:")
for i in misclassified_indices[:5]:  # Show first 5
    print(f"True label: {y_test[i]}, Predicted: {y_pred[i]}")
    print(f"Review: {X_test[i][:100]}...")
    print()

# Insights
print("### Insights ###")
print("1. The Multinomial Naive Bayes classifier performed reasonably well on the movie review dataset.")
print("2. TF-IDF features capture the importance of words in the context of the entire corpus.")
print("3. The model tends to misclassify reviews that contain mixed sentiments or sarcasm.")
print("4. Stop words removal and limiting features to 5000 helped in reducing noise.")
print("5. Further improvements could include n-gram features, better preprocessing, or using more advanced models.")

### Results and Analysis ###

Number of misclassified samples: 85

Example misclassified samples:
True label: pos, Predicted: neg
Review: the makers of jurassic park & the director of speed conjure up a storm ! ( reviewed at eng wah ' s n...

True label: pos, Predicted: neg
Review: if chris farley had strapped some fake mutton - chop sideburns to each side of his head , spoken wit...

True label: pos, Predicted: neg
Review: a lot of times a three - star film will be my favorite . they ' re usually the kind of movie i can w...

True label: pos, Predicted: neg
Review: the trailers and the beginning of the move sum up this plot very easily . three filmmakers venture i...

True label: pos, Predicted: neg
Review: i have a soft spot in my heart for pure , amoral sleaze . i liked showgirls ( there , i said it ) . ...

### Insights ###
1. The Multinomial Naive Bayes classifier performed reasonably well on the movie review dataset.
2. TF-IDF features capture the importance of words in the conte