<a href="https://colab.research.google.com/github/lukmanaj/medium-blog-codes/blob/main/imdb_reviews_sent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentIntensityAnalyzer

# Load the movie reviews dataset
nltk.download('movie_reviews')

# Prepare the dataset
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Feature extraction
all_words = nltk.FreqDist(word.lower() for word in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

# Prepare feature sets
featuresets = [(document_features(d), c) for (d, c) in documents]
train_set = featuresets[:1500]
test_set = featuresets[1500:]

# Train the sentiment analysis model (Naive Bayes Classifier)
classifier = NaiveBayesClassifier.train(train_set)

# Evaluate the model
accuracy = nltk.classify.accuracy(classifier, test_set)
print("Accuracy:", accuracy)

# Perform sentiment analysis on new data
new_review = "The movie was fantastic! I loved every minute of it."
sentiment = classifier.classify(document_features(new_review.split()))
print("Sentiment:", sentiment)


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


Accuracy: 0.78
Sentiment: neg


In [2]:
import nltk
from nltk.corpus import movie_reviews
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Load the movie reviews dataset
nltk.download('movie_reviews')

# Prepare the dataset
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Feature extraction
all_words = nltk.FreqDist(word.lower() for word in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

# Prepare feature sets
featuresets = [(document_features(d), c) for (d, c) in documents]
train_set = featuresets[:1500]
test_set = featuresets[1500:]

# Train the sentiment analysis model (Support Vector Machines)
svm_classifier = SklearnClassifier(SVC())
svm_classifier.train(train_set)

# Train the sentiment analysis model (Random Forest)
rf_classifier = SklearnClassifier(RandomForestClassifier())
rf_classifier.train(train_set)

# Evaluate the models
svm_accuracy = nltk.classify.accuracy(svm_classifier, test_set)
rf_accuracy = nltk.classify.accuracy(rf_classifier, test_set)

print("Support Vector Machines Accuracy:", svm_accuracy)
print("Random Forest Accuracy:", rf_accuracy)

# Perform sentiment analysis on new data
new_review = "The movie was fantastic! I loved every minute of it."

svm_sentiment = svm_classifier.classify(document_features(new_review.split()))
rf_sentiment = rf_classifier.classify(document_features(new_review.split()))

print("Support Vector Machines Sentiment:", svm_sentiment)
print("Random Forest Sentiment:", rf_sentiment)


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Support Vector Machines Accuracy: 0.616
Random Forest Accuracy: 0.23
Support Vector Machines Sentiment: neg
Random Forest Sentiment: neg


In [7]:
import nltk
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import movie_reviews
nltk.download("vader_lexicon")
from sklearn.metrics import classification_report
# Initialize the SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Load the IMDb Movie Reviews dataset
reviews = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        review = movie_reviews.raw(fileid)
        sentiment = 'positive' if category == 'pos' else 'negative'
        reviews.append((review, sentiment))

# Perform sentiment analysis on the movie reviews
sentiments = []
for review, sentiment in reviews:
    sentiment_scores = sid.polarity_scores(review)

    # Extract the compound score, which represents the overall sentiment
    compound_score = sentiment_scores['compound']

    if compound_score >= 0.05:
        predicted_sentiment = "positive"
    elif compound_score <= -0.05:
        predicted_sentiment = "negative"
    else:
        predicted_sentiment = "neutral"

    sentiments.append((review, sentiment, predicted_sentiment))

# Convert sentiments to a DataFrame for easier analysis
df = pd.DataFrame(sentiments, columns=['Review', 'Actual Sentiment', 'Predicted Sentiment'])

# Print a sample of the results
df.head()


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,Review,Actual Sentiment,Predicted Sentiment
0,"plot : two teen couples go to a church party ,...",negative,positive
1,the happy bastard's quick movie review \ndamn ...,negative,positive
2,it is movies like these that make a jaded movi...,negative,positive
3,""" quest for camelot "" is warner bros . ' firs...",negative,negative
4,synopsis : a mentally unstable man undergoing ...,negative,positive


In [8]:
# Filter out the neutral class from the data
filtered_df = df[df['Predicted Sentiment'] != 'neutral']

# Convert the actual and predicted sentiments to lists
actual_sentiments = filtered_df['Actual Sentiment'].tolist()
predicted_sentiments = filtered_df['Predicted Sentiment'].tolist()

# Generate the classification report
report = classification_report(actual_sentiments, predicted_sentiments)

# Print the classification report
print(report)

              precision    recall  f1-score   support

    negative       0.72      0.44      0.55      1000
    positive       0.60      0.83      0.69       999

    accuracy                           0.64      1999
   macro avg       0.66      0.64      0.62      1999
weighted avg       0.66      0.64      0.62      1999

