In [5]:
#import the required libraries
import pandas as pd
import numpy as np
import string
import re

### Step 1: Load and preprocess the dataset

First, let's load the dataset and preprocess it by removing stop words, punctuation, and other unwanted characters:

In [6]:
#fetch and load the dataset
dataFrame = pd.read_csv('rt_reviews.csv', encoding='ISO-8859-1')
#view the dataset
dataFrame.head()

Unnamed: 0,Freshness,Review
0,fresh,"Manakamana doesn't answer any questions, yet ..."
1,fresh,Wilfully offensive and powered by a chest-thu...
2,rotten,It would be difficult to imagine material mor...
3,rotten,Despite the gusto its star brings to the role...
4,rotten,If there was a good idea at the core of this ...


In [7]:
# Remove unwanted characters
dataFrame["Review"] = dataFrame["Review"].apply(lambda x: re.sub(r"[^a-zA-Z0-9\s]", "", x))

In [8]:
# Remove stop words
stop_words = ["a", "an", "the", "and", "or", "but", "is", "am", "are", "was", "were", "be", "being", "been", "have", "has", "had", "do", "does", "did", "will", "would", "shall", "should", "can", "could", "may", "might", "must"]
dataFrame["Review"] = dataFrame["Review"].apply(lambda x: " ".join([word for word in x.split() if word.lower() not in stop_words]))

In [9]:
# Remove punctuation
dataFrame["Review"] = dataFrame["Review"].apply(lambda x: x.translate(str.maketrans("", "", string.punctuation)))

### Step 2: Divide the dataset into train, development, and test sets
Next, let's divide the preprocessed dataset into train, development, and test sets:

In [10]:
# Shuffle the dataset
dataFrame = dataFrame.sample(frac=1, random_state=42).reset_index(drop=True)

In [11]:
# Divide the dataset into train, development, and test sets
train_size = int(0.6 * len(dataFrame))
dev_size = int(0.2 * len(dataFrame))
test_size = len(dataFrame) - train_size - dev_size

In [12]:
train_df = dataFrame[:train_size]
dev_df = dataFrame[train_size:train_size+dev_size]
test_df = dataFrame[train_size+dev_size:]

# Print the sizes of the datasets
print("Train size:", len(train_df))
print("Development size:", len(dev_df))
print("Test size:", len(test_df))

Train size: 288000
Development size: 96000
Test size: 96000


### Step 3: Build a vocabulary and reverse index
Now, let's build a vocabulary and a reverse index using the train dataset:

In [13]:
# Build the vocabulary
word_counts = {}
for review in train_df["Review"]:
    for word in review.split():
        word_counts[word] = word_counts.get(word, 0) + 1

vocab = [word for word, count in word_counts.items() if count >= 5]

# Build the reverse index
word2index = {word: i for i, word in enumerate(vocab)}

# Print the size of the vocabulary
print("Vocabulary size:", len(vocab))

Vocabulary size: 41167


### Step 4: Calculate probabilities

Now, let's calculate the probabilities using the train dataset:

In [14]:
# Calculate the prior probabilities
num_fresh = sum(train_df["Freshness"] == "fresh")
num_rotten = sum(train_df["Freshness"] == "rotten")
total = len(train_df)

prior_fresh = num_fresh / total
prior_rotten = num_rotten / total

In [15]:
# Calculate the conditional probabilities
word_counts_fresh = np.zeros(len(vocab))
word_counts_rotten = np.zeros(len(vocab))

for review, freshness in zip(train_df["Review"], train_df["Freshness"]):
    for word in review.split():
        if word in word2index:
            i = word2index[word]
            if freshness == "fresh":
                word_counts_fresh[i] += 1
            else:
                word_counts_rotten[i] += 1

In [16]:
# Add smoothing to the word counts
alpha = 1
word_probs_fresh = (word_counts_fresh + alpha) / (num_fresh + alpha * len(vocab))
word_probs_rotten = (word_counts_rotten + alpha) / (num_rotten + alpha * len(vocab))

In [17]:
# Calculate the probabilities without smoothing
word_probs_fresh_no_smooth = word_counts_fresh / num_fresh
word_probs_rotten_no_smooth = word_counts_rotten / num_rotten

In [18]:
# Print the probabilities of the first 10 words in the vocabulary
for i in range(10):
    word = vocab[i]
    print("\n"+word+':')
    print("\tP(word|fresh):", word_probs_fresh[i])
    print("\tP(word|rotten):", word_probs_rotten[i])


Guardians:
	P(word|fresh): 0.000475359625762331
	P(word|rotten): 0.00016197742034760354

of:
	P(word|fresh): 0.5311117473247516
	P(word|rotten): 0.46476721145072375

Galaxy:
	P(word|fresh): 0.00034031427753439605
	P(word|rotten): 9.718645220856213e-05

firstclass:
	P(word|fresh): 0.0001890634875191089
	P(word|rotten): 4.3193978759360944e-05

GradeA:
	P(word|fresh): 3.2410883574704386e-05
	P(word|rotten): 1.6197742034760356e-05

space:
	P(word|fresh): 0.002198538269150781
	P(word|rotten): 0.00183034484992792

adventure:
	P(word|fresh): 0.00591498625238355
	P(word|rotten): 0.00274821689856434

comedy:
	P(word|fresh): 0.023443872452369504
	P(word|rotten): 0.02603517069720481

For:
	P(word|fresh): 0.008253971683691384
	P(word|rotten): 0.00917332123901928

while:
	P(word|fresh): 0.01426078877286993
	P(word|rotten): 0.011581385554853654


### Step 5: Calculate accuracy on development set

Next, let's calculate the accuracy on the development set:

In [19]:
# Define a function to naive bayes classify a review
def classify(review, smoothing=True):
    if smoothing:
        fresh_prob = prior_fresh + len(word_counts_fresh)
        rotten_prob = prior_rotten + len(word_counts_rotten)
    else:
        fresh_prob = prior_fresh
        rotten_prob = prior_rotten
    
    for word in review.split():
        if word in word2index:
            i = word2index[word]
            if smoothing:
                fresh_prob *= (word_counts_fresh[i] + 1) / (num_fresh + len(word_counts_fresh))
                rotten_prob *= (word_counts_rotten[i] + 1) / (num_rotten + len(word_counts_rotten))
            else:
                fresh_prob *= word_probs_fresh[i]
                rotten_prob *= word_probs_rotten[i]
    
    if fresh_prob > rotten_prob:
        return "fresh"
    else:
        return "rotten"

In [20]:
# Classify the reviews in the development set
dev_df.loc[:,"Prediction"] = dev_df["Review"].apply(classify)

# Calculate the accuracy on the development set
dev_accuracy = sum(dev_df["Prediction"] == dev_df["Freshness"]) / len(dev_df)
print("Development accuracy:", dev_accuracy)

Development accuracy: 0.7963854166666666


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dev_df.loc[:,"Prediction"] = dev_df["Review"].apply(classify)


### Step 6a: Compare the effect of smoothing

Now, let's compare the effect of smoothing by calculating the accuracies with and without smoothing:

In [21]:
# Classify the reviews in the development set without smoothing
dev_df.loc[:, "Prediction_no_smooth"] = dev_df["Review"].apply(lambda x: classify(x, False))

# Calculate the accuracy on the development set without smoothing
dev_accuracy_no_smooth = sum(dev_df["Prediction_no_smooth"] == dev_df["Freshness"]) / len(dev_df)
print("Development accuracy without smoothing:", dev_accuracy_no_smooth)

Development accuracy without smoothing: 0.796375


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dev_df.loc[:, "Prediction_no_smooth"] = dev_df["Review"].apply(lambda x: classify(x, False))


### Step 6b: Derive top 10 words that predict each class

Next, let's derive the top 10 words that predict each class:

In [22]:
# Derive the top 10 words that predict freshness
fresh_probs = word_probs_fresh / word_probs_rotten
fresh_top_10 = [vocab[i] for i in np.argsort(fresh_probs)[-10:]]
print("Top 10 words that predict fresh:", fresh_top_10)

# Derive the top 10 words that predict rotten
rotten_probs = word_probs_rotten / word_probs_fresh
rotten_top_10 = [vocab[i] for i in np.argsort(rotten_probs)[-10:]]
print("Top 10 words that predict rotten:", rotten_top_10)

Top 10 words that predict fresh: ['unadorned', 'SpiderVerse', 'Holofcener', 'unmissable', 'restores', 'Brilliantly', 'Impressively', 'razorsharp', 'Fallout', 'Gripping']
Top 10 words that predict rotten: ['thirdrate', 'Tedious', 'Battlefield', 'cheaplooking', 'unexciting', 'laziest', 'flavorless', 'unfunny', 'feeble', 'charmless']


### Step 6c: P[class | word]

Finally, let's calculate P[class | word] for a given word:

In [23]:
# Define a function to calculate P[class | word]
def p_class_given_word(word):
    if word in word2index:
        i = word2index[word]
        p_word_fresh = word_probs_fresh[i]
        p_word_rotten = word_probs_rotten[i]
        p_fresh_word = p_rotten_word = word_probs_rotten[i]
        return {"fresh": p_word_fresh / (p_word_fresh + p_word_rotten), 
                "rotten": p_word_rotten / (p_word_fresh + p_word_rotten)}
    else:
        return {"fresh": 0, "rotten": 0}

# Calculate P[class | word] for the word "good"
p_class_given_word("good")

{'fresh': 0.5192850636876635, 'rotten': 0.4807149363123365}

### Step 7: Calculate final accuracy on test set

Finally, let's calculate the final accuracy on the test set using the optimal hyperparameters we found on the development set:

In [24]:
# Classify the reviews in the test set
test_df.loc[:, "Prediction"] = test_df["Review"].apply(classify)

# Calculate the accuracy on the test set
test_accuracy = sum(test_df["Prediction"] == test_df["Freshness"]) / len(test_df)
print("Test accuracy:", test_accuracy)

Test accuracy: 0.79928125


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.loc[:, "Prediction"] = test_df["Review"].apply(classify)
