In [None]:
import requests
import tarfile
import io
import pandas as pd
import os

# Location of the gzip-compressed movie-review archive and where to unpack it.
url = "http://www.cs.cornell.edu/people/pabo/movie-review-data/mix20_rand700_tokens_0211.tar.gz"
extract_dir = "/tmp/movie_reviews_new"

# Bound the wait and fail loudly on an HTTP error, so that an error page is
# never handed to tarfile as if it were the archive.
response = requests.get(url, timeout=60)
response.raise_for_status()

# Extract the tar.gz file straight from memory.
with tarfile.open(fileobj=io.BytesIO(response.content), mode="r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        # The "data" filter refuses members that would land outside
        # extract_dir (absolute paths, "..", unsafe links) and also silences
        # the warning Python emits for unfiltered extraction.
        tar.extractall(extract_dir, filter="data")
    else:
        # Older interpreters without extraction filters: previous behaviour.
        tar.extractall(extract_dir)

# Define paths to positive and negative review directories
pos_dir = f"{extract_dir}/tokens/pos"
neg_dir = f"{extract_dir}/tokens/neg"

# Function to read reviews from a directory
def read_reviews(directory, sentiment):
    """Load every regular file in ``directory`` as one labelled review.

    Args:
        directory: Path of the folder that holds one review per file.
        sentiment: Label stored under the ``'sentiment'`` key of every record.

    Returns:
        A list of ``{'review': <file text>, 'sentiment': sentiment}`` dicts,
        ordered by file name.
    """
    reviews = []
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order (and everything derived from it) reproducible across machines.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Skip sub-directories and other non-file entries instead of crashing.
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='latin-1') as f:
            reviews.append({'review': f.read(), 'sentiment': sentiment})
    return reviews

# Load each sentiment class from its own directory
pos_reviews = read_reviews(pos_dir, 'positive')
neg_reviews = read_reviews(neg_dir, 'negative')

# Positive records first, followed by the negative ones
all_reviews = [*pos_reviews, *neg_reviews]

# One row per review, with 'review' and 'sentiment' columns
df = pd.DataFrame(data=all_reviews)

# Preview the first five rows
display(df.head(5))

  tar.extractall("/tmp/movie_reviews_new")


Unnamed: 0,review,sentiment
0,"release date : january 22 , 1999 starring : em...",positive
1,"hype ? sheesh , like no other . this side of t...",positive
2,lili taylor nabbed one of her first lead roles...,positive
3,""" high fidelity "" ( 2000 ) directed by stephe...",positive
4,elmore leonard has quickly become one of holly...,positive


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used by the preprocessing step, in a fixed order
for nltk_resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(nltk_resource)

# Function to preprocess text
def preprocess_text(text, stop_words=None):
    """Turn a raw review into a list of lower-cased, stop-word-free tokens.

    Args:
        text: The review text.
        stop_words: Optional collection of words to drop. When omitted, the
            NLTK English stop-word list is used (loaded once and reused).

    Returns:
        The tokens of ``text`` after punctuation removal, lower-casing,
        tokenisation and stop-word filtering, in their original order.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    # Remove punctuation and special characters (anything that is neither a
    # word character nor whitespace)
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize words
    tokens = word_tokenize(text)
    # Remove stopwords (optional, but good for frequency analysis)
    return [word for word in tokens if word not in stop_words]


# Cache for the default stop-word set; filled on first use.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words, reading the corpus only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Previously this list was rebuilt for every single review.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS

# Tokenise every review and keep the result next to the raw text
df['processed_review'] = df['review'].apply(preprocess_text)

# Token lists split by gold sentiment label
is_positive = df['sentiment'] == 'positive'
is_negative = df['sentiment'] == 'negative'
pos_reviews_processed = df.loc[is_positive, 'processed_review']
neg_reviews_processed = df.loc[is_negative, 'processed_review']

display(df.head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,review,sentiment,processed_review
0,"release date : january 22 , 1999 starring : em...",positive,"[release, date, january, 22, 1999, starring, e..."
1,"hype ? sheesh , like no other . this side of t...",positive,"[hype, sheesh, like, side, titanic, good, hunt..."
2,lili taylor nabbed one of her first lead roles...,positive,"[lili, taylor, nabbed, one, first, lead, roles..."
3,""" high fidelity "" ( 2000 ) directed by stephe...",positive,"[high, fidelity, 2000, directed, stephen, frea..."
4,elmore leonard has quickly become one of holly...,positive,"[elmore, leonard, quickly, become, one, hollyw..."


In [None]:
from collections import Counter

# Tally token occurrences over all positive reviews
pos_word_counts = Counter()
for review_tokens in pos_reviews_processed:
    pos_word_counts.update(review_tokens)

# Same tally for the negative reviews
neg_word_counts = Counter()
for review_tokens in neg_reviews_processed:
    neg_word_counts.update(review_tokens)

# Ten most frequent tokens per class
display("Positive Review Word Counts:")
display(pos_word_counts.most_common(10))

display("Negative Review Word Counts:")
display(neg_word_counts.most_common(10))

'Positive Review Word Counts:'

[('film', 3274),
 ('one', 1970),
 ('movie', 1797),
 ('like', 1234),
 ('time', 897),
 ('also', 872),
 ('even', 835),
 ('good', 829),
 ('films', 825),
 ('story', 819)]

'Negative Review Word Counts:'

[('film', 2797),
 ('movie', 2103),
 ('one', 1738),
 ('like', 1242),
 ('even', 937),
 ('time', 806),
 ('good', 760),
 ('get', 685),
 ('would', 674),
 ('films', 647)]

In [None]:
# Minimum number of occurrences within a class for a word to be kept.
MIN_WORD_FREQUENCY = 4
# Entries shown per class. The filtered dictionaries hold thousands of words,
# so only a preview is displayed instead of flooding the cell output.
PREVIEW_SIZE = 20

# Filter positive word counts by the frequency threshold
pos_filtered_counts = {
    word: count for word, count in pos_word_counts.items() if count >= MIN_WORD_FREQUENCY
}

# Filter negative word counts by the frequency threshold
neg_filtered_counts = {
    word: count for word, count in neg_word_counts.items() if count >= MIN_WORD_FREQUENCY
}

display(f"Positive Reviews (Frequency >= {MIN_WORD_FREQUENCY}):")
display(dict(list(pos_filtered_counts.items())[:PREVIEW_SIZE]))

display(f"Negative Reviews (Frequency >= {MIN_WORD_FREQUENCY}):")
display(dict(list(neg_filtered_counts.items())[:PREVIEW_SIZE]))

'Positive Reviews (Frequency >= 4):'

{'release': 100,
 'date': 63,
 'january': 17,
 '22': 8,
 '1999': 59,
 'starring': 204,
 'emily': 15,
 'watson': 18,
 'rachel': 17,
 'griffiths': 19,
 'david': 234,
 'james': 197,
 'charles': 52,
 'dance': 27,
 'directed': 262,
 'anand': 4,
 'tucker': 29,
 'distributed': 13,
 'october': 28,
 'films': 825,
 'mpaa': 41,
 'rating': 85,
 'r': 152,
 'language': 105,
 'sexuality': 28,
 'url': 14,
 'http': 272,
 'www': 210,
 'execpc': 11,
 'htm': 15,
 'since': 307,
 '1996s': 7,
 'shine': 18,
 'starred': 13,
 'geoffrey': 19,
 'rush': 38,
 'movie': 1797,
 'laid': 13,
 'bare': 10,
 'lives': 140,
 'musicians': 15,
 'fact': 270,
 'one': 1970,
 'watch': 210,
 'together': 202,
 'would': 646,
 'almost': 295,
 'certainly': 141,
 'stand': 45,
 'convincing': 54,
 'argument': 12,
 'life': 607,
 'music': 193,
 'naturally': 27,
 'leads': 90,
 'social': 52,
 'yet': 251,
 'hilary': 17,
 'jackie': 162,
 'sophomore': 4,
 'effort': 51,
 'director': 409,
 'something': 369,
 'say': 293,
 'strongly': 15,
 'explores

'Negative Reviews (Frequency >= 4):'

{'little': 531,
 'expensive': 13,
 'flashy': 12,
 'painfully': 42,
 'drawnout': 5,
 'season': 18,
 'finale': 40,
 'x': 10,
 'files': 5,
 'fight': 80,
 'future': 43,
 'chance': 69,
 'draw': 17,
 'new': 378,
 'fans': 62,
 'tv': 126,
 'show': 211,
 'best': 332,
 'done': 131,
 'movie': 2103,
 'opens': 54,
 'agents': 15,
 'duchovny': 5,
 'anderson': 30,
 'finding': 24,
 'assignment': 10,
 'closing': 12,
 'xfiles': 8,
 'course': 250,
 'bizarre': 33,
 'coincidence': 12,
 'first': 564,
 'leads': 56,
 'uncover': 4,
 'conspiracy': 26,
 'involving': 82,
 'hiding': 13,
 'bodies': 20,
 'wouldbe': 20,
 'aliens': 43,
 'plot': 569,
 'turns': 143,
 'alternately': 6,
 'confusing': 34,
 'ridiculous': 71,
 'surprising': 19,
 'leaves': 54,
 'much': 644,
 'imagination': 23,
 'feature': 78,
 'film': 2797,
 'version': 87,
 'two': 609,
 'hours': 84,
 'really': 525,
 'pretty': 233,
 'boring': 155,
 'found': 140,
 'falling': 30,
 'asleep': 23,
 'especially': 105,
 'dull': 95,
 'half': 171,
 'later': 131,
 'one':

In [None]:
# Number of distinct words per class that survived the frequency threshold
pos_kept_count = len(pos_filtered_counts)
neg_kept_count = len(neg_filtered_counts)
print(f"Count of positive words with frequency >= 4: {pos_kept_count}")
print(f"Count of negative words with frequency >= 4: {neg_kept_count}")

Count of positive words with frequency >= 4: 9848
Count of negative words with frequency >= 4: 8998


In [None]:
# Number of reviews carrying each gold label
num_pos_reviews = len(pos_reviews_processed)
num_neg_reviews = len(neg_reviews_processed)
total_reviews = num_pos_reviews + num_neg_reviews

# Class priors: the share of all reviews that belongs to each class
prior_pos = num_pos_reviews / total_reviews
prior_neg = num_neg_reviews / total_reviews

display(f"Prior probability of positive reviews: {prior_pos}")
display(f"Prior probability of negative reviews: {prior_neg}")

'Prior probability of positive reviews: 0.5007215007215007'

'Prior probability of negative reviews: 0.49927849927849927'

In [None]:
# Vocabulary: every word that reaches the frequency threshold in at least one
# class. It is kept as an ordered list (positive-class words first) so the
# dictionaries built below have a reproducible order across runs.
vocabulary = list(dict.fromkeys([*pos_filtered_counts, *neg_filtered_counts]))
all_filtered_words = set(vocabulary)
vocabulary_size = len(vocabulary)

# Per-class token totals restricted to the vocabulary. Using the same word set
# in the numerators and the denominator makes each smoothed distribution sum
# to 1; counting out-of-vocabulary tokens here would not.
total_words_pos = sum(pos_word_counts.get(word, 0) for word in vocabulary)
total_words_neg = sum(neg_word_counts.get(word, 0) for word in vocabulary)

# Add-one smoothing over the WHOLE vocabulary for both classes: a word that
# only passed the threshold in the other class keeps its real count here
# instead of being treated as if it had never been seen.
pos_conditional_probs = {
    word: (pos_word_counts.get(word, 0) + 1) / (total_words_pos + vocabulary_size)
    for word in vocabulary
}
neg_conditional_probs = {
    word: (neg_word_counts.get(word, 0) + 1) / (total_words_neg + vocabulary_size)
    for word in vocabulary
}

# Display the first 10 key-value pairs of the conditional probabilities
display("Positive Review Conditional Probabilities (first 10):")
display(list(pos_conditional_probs.items())[:10])

display("Negative Review Conditional Probabilities (first 10):")
display(list(neg_conditional_probs.items())[:10])

'Positive Review Conditional Probabilities (first 10):'

[('release', 0.0003618709804553842),
 ('date', 0.0002293043836548969),
 ('january', 6.449185790293976e-05),
 ('22', 3.224592895146988e-05),
 ('1999', 0.00021497285967646585),
 ('starring', 0.0007344906038945916),
 ('emily', 5.732609591372423e-05),
 ('watson', 6.807473889754752e-05),
 ('rachel', 6.449185790293976e-05),
 ('griffiths', 7.165761989215529e-05)]

'Negative Review Conditional Probabilities (first 10):'

[('little', 0.002122168281562267),
 ('expensive', 5.5846533725322815e-05),
 ('flashy', 5.185749560208547e-05),
 ('painfully', 0.00017152863929920578),
 ('drawnout', 2.3934228739424064e-05),
 ('season', 7.579172434150954e-05),
 ('finale', 0.00016355056305273109),
 ('x', 4.387941935561078e-05),
 ('files', 2.3934228739424064e-05),
 ('fight', 0.0003231120879822248)]

In [None]:
import math

def naive_bayes_classify(review_tokens):
    """Label a tokenised review as 'positive' or 'negative' with Naive Bayes.

    Scores are accumulated in log space, starting from the class priors and
    adding the log conditional probability of every known token.

    Args:
        review_tokens: Iterable of preprocessed tokens for one review.

    Returns:
        'positive' when the positive log score is greater than or equal to
        the negative one (ties go to 'positive'), otherwise 'negative'.
    """
    # Initialize log probabilities with the log of prior probabilities
    log_prob_pos = math.log(prior_pos)
    log_prob_neg = math.log(prior_neg)

    # Smoothed log probability of a vocabulary word with no entry for a class;
    # computed once instead of once per token.
    unseen_log_pos = math.log(1 / (total_words_pos + vocabulary_size))
    unseen_log_neg = math.log(1 / (total_words_neg + vocabulary_size))

    for token in review_tokens:
        pos_prob = pos_conditional_probs.get(token)
        neg_prob = neg_conditional_probs.get(token)

        # A token unknown to both classes carries no evidence. Scoring it
        # anyway would add two different constants (the class denominators
        # differ) and push long reviews toward the class with fewer words.
        if pos_prob is None and neg_prob is None:
            continue

        log_prob_pos += unseen_log_pos if pos_prob is None else math.log(pos_prob)
        log_prob_neg += unseen_log_neg if neg_prob is None else math.log(neg_prob)

    # Compare log probabilities and return the predicted sentiment
    return 'positive' if log_prob_pos >= log_prob_neg else 'negative'


In [None]:
# Classify every tokenised review with the hand-written Naive Bayes model
nb_predictions = df['processed_review'].map(naive_bayes_classify)
df['predicted_sentiment'] = nb_predictions
display(df.head())

Unnamed: 0,review,sentiment,processed_review,predicted_sentiment
0,"release date : january 22 , 1999 starring : em...",positive,"[release, date, january, 22, 1999, starring, e...",positive
1,"hype ? sheesh , like no other . this side of t...",positive,"[hype, sheesh, like, side, titanic, good, hunt...",positive
2,lili taylor nabbed one of her first lead roles...,positive,"[lili, taylor, nabbed, one, first, lead, roles...",negative
3,""" high fidelity "" ( 2000 ) directed by stephe...",positive,"[high, fidelity, 2000, directed, stephen, frea...",positive
4,elmore leonard has quickly become one of holly...,positive,"[elmore, leonard, quickly, become, one, hollyw...",positive


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Gold labels versus Naive Bayes predictions.
# NOTE: the predictions were made on the same reviews the word counts were
# estimated from, so these are training-set scores.
y_true = df['sentiment']
y_pred_nb = df['predicted_sentiment']

# 'positive' is the class of interest for the binary metrics
positive_class = {'pos_label': 'positive'}

accuracy = accuracy_score(y_true, y_pred_nb)
precision = precision_score(y_true, y_pred_nb, **positive_class)
recall = recall_score(y_true, y_pred_nb, **positive_class)
f1 = f1_score(y_true, y_pred_nb, **positive_class)

# Report all four scores with four decimals
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Accuracy: 0.9726
Precision: 0.9838
Recall: 0.9611
F1-score: 0.9723


In [None]:
# Installs scikit-learn and NLTK into the kernel's environment.
# NOTE(review): both packages are already imported by earlier cells, so on a
# fresh top-to-bottom run this install comes too late to help; consider moving
# it to the first cell and pinning versions.
%pip install scikit-learn nltk



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The vectorizer works on strings, so glue each token list back together
processed_reviews_text = list(map(" ".join, df['processed_review']))

# TF-IDF vectorizer with default settings
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary and weights, and build the document-term matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_reviews_text)

# Report (number of reviews, number of features)
matrix_shape = tfidf_matrix.shape
print(f"Shape of TF-IDF matrix: {matrix_shape}")

Shape of TF-IDF matrix: (1386, 40876)


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression (the maximum-entropy classifier) with default settings
maxent_model = LogisticRegression()

# Fit on the TF-IDF features against the gold sentiment labels
maxent_model.fit(X=tfidf_matrix, y=df['sentiment'])

In [None]:
# Predict a label for every review from its TF-IDF row
maxent_predictions = maxent_model.predict(tfidf_matrix)
df['predicted_sentiment_maxent'] = maxent_predictions

# Preview the frame including the new prediction column
display(df.head())

Unnamed: 0,review,sentiment,processed_review,predicted_sentiment,predicted_sentiment_maxent
0,"release date : january 22 , 1999 starring : em...",positive,"[release, date, january, 22, 1999, starring, e...",positive,positive
1,"hype ? sheesh , like no other . this side of t...",positive,"[hype, sheesh, like, side, titanic, good, hunt...",positive,positive
2,lili taylor nabbed one of her first lead roles...,positive,"[lili, taylor, nabbed, one, first, lead, roles...",negative,positive
3,""" high fidelity "" ( 2000 ) directed by stephe...",positive,"[high, fidelity, 2000, directed, stephen, frea...",positive,positive
4,elmore leonard has quickly become one of holly...,positive,"[elmore, leonard, quickly, become, one, hollyw...",positive,positive


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Gold labels versus MaxEnt predictions.
# NOTE: the model is scored on the same TF-IDF rows it was fitted on, so
# these are training-set scores.
y_true = df['sentiment']
y_pred_maxent = df['predicted_sentiment_maxent']

# 'positive' is the class of interest for the binary metrics
positive_class = {'pos_label': 'positive'}

accuracy_maxent = accuracy_score(y_true, y_pred_maxent)
precision_maxent = precision_score(y_true, y_pred_maxent, **positive_class)
recall_maxent = recall_score(y_true, y_pred_maxent, **positive_class)
f1_maxent = f1_score(y_true, y_pred_maxent, **positive_class)

# Report all four scores with four decimals
print(f"MaxEnt Classifier Performance:")
print(f"Accuracy: {accuracy_maxent:.4f}")
print(f"Precision: {precision_maxent:.4f}")
print(f"Recall: {recall_maxent:.4f}")
print(f"F1-score: {f1_maxent:.4f}")

MaxEnt Classifier Performance:
Accuracy: 0.9863
Precision: 0.9842
Recall: 0.9885
F1-score: 0.9863


In [None]:
# Side-by-side summary of the two classifiers' scores
print("Naive Bayes Classifier Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print("\nMaximum Entropy Classifier Performance:")
print(f"Accuracy: {accuracy_maxent:.4f}")
print(f"Precision: {precision_maxent:.4f}")
print(f"Recall: {recall_maxent:.4f}")
print(f"F1-score: {f1_maxent:.4f}")

print("\nComparison:")
# (metric name, MaxEnt score, Naive Bayes score)
metric_pairs = [
    ("Accuracy", accuracy_maxent, accuracy),
    ("Precision", precision_maxent, precision),
    ("Recall", recall_maxent, recall),
    ("F1-score", f1_maxent, f1),
]
for metric_name, maxent_value, nb_value in metric_pairs:
    # Derive the verdict from the numbers; a hard-coded winner would silently
    # become wrong as soon as a re-run produced different scores.
    if maxent_value > nb_value:
        verdict = "MaxEnt is better"
    elif maxent_value < nb_value:
        verdict = "Naive Bayes is better"
    else:
        verdict = "both are equal"
    print(f"{metric_name}: MaxEnt ({maxent_value:.4f}) vs Naive Bayes ({nb_value:.4f}) - {verdict}")

Naive Bayes Classifier Performance:
Accuracy: 0.9726
Precision: 0.9838
Recall: 0.9611
F1-score: 0.9723

Maximum Entropy Classifier Performance:
Accuracy: 0.9863
Precision: 0.9842
Recall: 0.9885
F1-score: 0.9863

Comparison:
Accuracy: MaxEnt (0.9863) vs Naive Bayes (0.9726) - MaxEnt is better
Precision: MaxEnt (0.9842) vs Naive Bayes (0.9838) - MaxEnt is better
Recall: MaxEnt (0.9885) vs Naive Bayes (0.9611) - MaxEnt is better
F1-score: MaxEnt (0.9863) vs Naive Bayes (0.9723) - MaxEnt is better


In [None]:
# Installs scikit-learn into the kernel's environment.
# NOTE(review): scikit-learn is already imported and used by earlier cells, so
# this cell looks redundant here; consider a single pinned install cell at the
# top of the notebook instead.
%pip install scikit-learn



In [None]:
from sklearn.svm import SVC

# Support-vector classifier with default settings
svm_model = SVC()

# Fit on the TF-IDF features against the gold sentiment labels
svm_model.fit(X=tfidf_matrix, y=df['sentiment'])

In [None]:
# Predict a label for every review from its TF-IDF row
svm_predictions = svm_model.predict(tfidf_matrix)
df['predicted_sentiment_svm'] = svm_predictions

# Preview the frame including the new prediction column
display(df.head())

Unnamed: 0,review,sentiment,processed_review,predicted_sentiment,predicted_sentiment_maxent,predicted_sentiment_svm
0,"release date : january 22 , 1999 starring : em...",positive,"[release, date, january, 22, 1999, starring, e...",positive,positive,positive
1,"hype ? sheesh , like no other . this side of t...",positive,"[hype, sheesh, like, side, titanic, good, hunt...",positive,positive,positive
2,lili taylor nabbed one of her first lead roles...,positive,"[lili, taylor, nabbed, one, first, lead, roles...",negative,positive,positive
3,""" high fidelity "" ( 2000 ) directed by stephe...",positive,"[high, fidelity, 2000, directed, stephen, frea...",positive,positive,positive
4,elmore leonard has quickly become one of holly...,positive,"[elmore, leonard, quickly, become, one, hollyw...",positive,positive,positive
