In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Read the IMDB reviews CSV and keep a reproducible random half of the rows
# (fixed seed, so the same rows are drawn on every run).
df = pd.read_csv('IMDB Dataset.csv').sample(frac=0.5, random_state=42)

# Peek at the first sampled row to see what a review and its label look like.
first_row = df.iloc[0]
print("Sample Review:", first_row['review'])
print("Sentiment:", first_row['sentiment'])

Sample Review: I really liked this Summerslam due to the look of the arena, the curtains and just the look overall was interesting to me for some reason. Anyways, this could have been one of the best Summerslam's ever if the WWF didn't have Lex Luger in the main event against Yokozuna, now for it's time it was ok to have a huge fat man vs a strong man but I'm glad times have changed. It was a terrible main event just like every match Luger is in is terrible. Other matches on the card were Razor Ramon vs Ted Dibiase, Steiner Brothers vs Heavenly Bodies, Shawn Michaels vs Curt Hening, this was the event where Shawn named his big monster of a body guard Diesel, IRS vs 1-2-3 Kid, Bret Hart first takes on Doink then takes on Jerry Lawler and stuff with the Harts and Lawler was always very interesting, then Ludvig Borga destroyed Marty Jannetty, Undertaker took on Giant Gonzalez in another terrible match, The Smoking Gunns and Tatanka took on Bam Bam Bigelow and the Headshrinkers, and Yokozu

In [12]:
# Convert sentiment to binary (0 for negative, 1 for positive).
# The column is overwritten in place, so the mapping is applied only while it
# still holds the raw string labels: pushing the already-encoded 0/1 values
# through the mapping again would turn every label into NaN if this cell is
# re-run without reloading the data.
label_to_id = {'negative': 0, 'positive': 1}
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = df['sentiment'].map(label_to_id)

# Series.map yields NaN for any value missing from the mapping; stop here with
# a clear message instead of passing NaN targets on to the classifiers.
if df['sentiment'].isna().any():
    raise ValueError(
        "sentiment column contains values other than 'negative'/'positive'"
    )

# Split into features (X) and target (y)
X = df['review']
y = df['sentiment']

# Split into training and test sets (80% train / 20% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF.
# The vectorizer is fitted on the training reviews only; the test reviews are
# merely transformed with it, so the vocabulary and IDF weights are learned
# without looking at the test split.
tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [13]:
# Fit both classifiers on the same TF-IDF training matrix and labels.

# Multinomial Naive Bayes: fit() returns the estimator itself, so the fitted
# model can be bound in a single statement.
nb_classifier = MultinomialNB().fit(X_train_tfidf, y_train)

# Support vector machine with a linear kernel
svm_classifier = SVC(random_state=42, kernel='linear')
svm_classifier.fit(X_train_tfidf, y_train)

In [16]:
# Predict the held-out test reviews with each fitted classifier
nb_predictions = nb_classifier.predict(X_test_tfidf)
svm_predictions = svm_classifier.predict(X_test_tfidf)

# Print the same report for both models: per-class precision/recall/F1,
# followed by the overall accuracy on the test split.
class_names = ['negative', 'positive']
for header, predictions in (
    ("Naive Bayes Performance:", nb_predictions),
    ("\nSVM Performance:", svm_predictions),
):
    print(header)
    print(classification_report(y_test, predictions, target_names=class_names))
    print("Accuracy:", accuracy_score(y_test, predictions))

Naive Bayes Performance:
              precision    recall  f1-score   support

    negative       0.85      0.86      0.85      2475
    positive       0.86      0.85      0.86      2525

    accuracy                           0.86      5000
   macro avg       0.86      0.86      0.86      5000
weighted avg       0.86      0.86      0.86      5000

Accuracy: 0.8552

SVM Performance:
              precision    recall  f1-score   support

    negative       0.89      0.87      0.88      2475
    positive       0.87      0.90      0.88      2525

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000

Accuracy: 0.882


In [15]:
# Hand-written reviews used as a quick sanity check of both trained models
example_reviews = [
    "This movie was absolutely fantastic! I loved every minute of it.",
    "The plot was boring and the acting was terrible. A complete waste of time.",
    "An average film, not great but not bad either.",
    "I was really moved by the story and the performances.",
    "Worst movie I've seen in a long time. Highly disappointing."
]

# Reuse the already-fitted vectorizer so the examples are encoded with the
# same vocabulary the classifiers were trained on.
example_reviews_tfidf = tfidf_vectorizer.transform(example_reviews)

# Predict with each model
nb_example_predictions = nb_classifier.predict(example_reviews_tfidf)
svm_example_predictions = svm_classifier.predict(example_reviews_tfidf)

# Translate the numeric class ids back into readable labels
sentiment_map = {0: 'negative', 1: 'positive'}
nb_example_sentiments = [sentiment_map[label] for label in nb_example_predictions]
svm_example_sentiments = [sentiment_map[label] for label in svm_example_predictions]

# Show each review next to the verdict of both models
print("\nExample Review Predictions:")
for review, nb_label, svm_label in zip(
    example_reviews, nb_example_sentiments, svm_example_sentiments
):
    print(f"Review: {review}")
    print(f"Naive Bayes Prediction: {nb_label}")
    print(f"SVM Prediction: {svm_label}")
    print("-" * 30)


Example Review Predictions:
Review: This movie was absolutely fantastic! I loved every minute of it.
Naive Bayes Prediction: positive
SVM Prediction: positive
------------------------------
Review: The plot was boring and the acting was terrible. A complete waste of time.
Naive Bayes Prediction: negative
SVM Prediction: negative
------------------------------
Review: An average film, not great but not bad either.
Naive Bayes Prediction: negative
SVM Prediction: negative
------------------------------
Review: I was really moved by the story and the performances.
Naive Bayes Prediction: positive
SVM Prediction: positive
------------------------------
Review: Worst movie I've seen in a long time. Highly disappointing.
Naive Bayes Prediction: negative
SVM Prediction: negative
------------------------------
