In [37]:
import nltk
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np

In [38]:
# NLTK data
nltk.download('movie_reviews')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\peter\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\peter\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\peter\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [39]:
def preprocess(text):
    """
    Cleans text by removing handles, URLs, stopwords, and applying stemming.
    """
    # Remove handles (e.g., @username) and URLs
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'http\S+', '', text)
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Convert to lowercase
    tokens = [t.lower() for t in tokens]
    
    # Remove stop words and keep only alphabetic words
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words and t.isalpha()]
    
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(t) for t in tokens]
    
    # Join back into string
    return ' '.join(tokens)

In [40]:
# Load the IMDB movie reviews dataset
reviews = []
labels = []

for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        reviews.append(movie_reviews.raw(fileid))
        labels.append(0 if category == 'neg' else 1)  # 0 for negative, 1 for positive

# Preprocess all reviews
# (This might take a few seconds depending on dataset size)
processed_reviews = [preprocess(r) for r in reviews]

# Split into training and testing sets
processed_train, processed_test, y_train, y_test = train_test_split(
    processed_reviews, labels, test_size=0.2, random_state=42
)

# Feature extraction with TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(processed_train)
X_test = vectorizer.transform(processed_test)

print("Data loaded and vectorized.")

Data loaded and vectorized.


In [41]:
# Train the Multinomial Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)
print("Model training complete.")

Model training complete.


In [42]:
# Predict on test set
y_pred = model.predict(X_test)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Create and save confusion matrix
disp = ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_pred,
    display_labels=["Negative", "Positive"],
    cmap="Blues"
)

disp.figure_.suptitle("Multinomial Naive Bayes - Confusion Matrix (for IMDB)")
plt.tight_layout()
plt.savefig("confusion_matrix.png")

print("\nConfusion matrix saved as 'confusion_matrix.png'")

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.85      0.81       199
           1       0.84      0.76      0.80       201

    accuracy                           0.81       400
   macro avg       0.81      0.81      0.80       400
weighted avg       0.81      0.81      0.80       400


Confusion matrix saved as 'confusion_matrix.png'


In [43]:
# Define the input reviews
sample_reviews = [
    "This is a ridiculously bright movie. The plot was terrible and I was sad until the ending!",
    "This movie was absolutely fantastic! The acting was superb and the plot kept me engaged throughout.",
    "I hated this film. It was boring, the characters were poorly developed, and the ending was disappointing.",
    "An average movie with some good moments but overall nothing special.",
    "One of the best films I've seen this year. Highly recommend it!"
]

print("\nTesting New Reviews")

# Process each review individually
processed_samples = [preprocess(review) for review in sample_reviews]

# Vectorize the processed reviews
X_new = vectorizer.transform(processed_samples)

# Predict classes and probabilities
predictions = model.predict(X_new)
probabilities = model.predict_proba(X_new)

# Loop through and print results for each review
for i, review in enumerate(sample_reviews):
    print(f"\nReview {i+1}: \"{review}\"")
    
    sentiment = "Positive" if predictions[i] == 1 else "Negative"
    confidence = probabilities[i][1] if predictions[i] == 1 else probabilities[i][0]
    
    print(f"Prediction: {sentiment}")


Testing New Reviews

Review 1: "This is a ridiculously bright movie. The plot was terrible and I was sad until the ending!"
Prediction: Negative

Review 2: "This movie was absolutely fantastic! The acting was superb and the plot kept me engaged throughout."
Prediction: Positive

Review 3: "I hated this film. It was boring, the characters were poorly developed, and the ending was disappointing."
Prediction: Negative

Review 4: "An average movie with some good moments but overall nothing special."
Prediction: Positive

Review 5: "One of the best films I've seen this year. Highly recommend it!"
Prediction: Positive
