In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Path of the reviews CSV; kept in one place so it is easy to repoint.
DATA_PATH = "/content/drive/MyDrive/ColabNotebooks/rt_reviews.csv"

df = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')

# Shuffle once with a fixed seed so the split is reproducible across runs.
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

# 80% train / 10% dev / remainder test.
train_size = int(0.8 * len(df))
dev_size = int(0.1 * len(df))
test_size = len(df) - train_size - dev_size

# Splitting the datasets. Each split is copied so that adding columns to it later
# does not operate on a slice of `df_shuffled` (SettingWithCopyWarning).
train_df = df_shuffled[:train_size].copy()
dev_df = df_shuffled[train_size : train_size + dev_size].copy()
test_df = df_shuffled[train_size + dev_size :].copy()

# The three splits must partition the data set exactly.
assert len(train_df) + len(dev_df) + len(test_df) == len(df)
assert len(test_df) == test_size

# Token frequencies over the training reviews (lower-cased, whitespace-split).
word_counts = Counter()
for review in train_df['Review']:
    word_counts.update(review.lower().split())



In [None]:
# build a vocabulary of words
# Minimum number of occurrences in the training reviews for a word to be kept.
MIN_WORD_COUNT = 5

# Token frequencies over the training split (lower-cased, whitespace-split).
word_count = Counter()
for review in train_df['Review']:
    word_count.update(review.lower().split())

# `vocabulary` lists the kept words in first-seen order; `reverse_index` maps each
# kept word to its position in `vocabulary`, so vocabulary[reverse_index[w]] == w.
vocabulary = []
reverse_index = {}
for word, count in word_count.items():
    if count >= MIN_WORD_COUNT:
        # Index by position in `vocabulary`, not by position among all counted
        # words, otherwise skipped rare words leave gaps in the indices.
        reverse_index[word] = len(vocabulary)
        vocabulary.append(word)


In [None]:
# calculate the probability of occurrence and conditional probability of each word
num_documents = len(train_df)
num_positive_documents = len(train_df[train_df['Freshness'] == 'Fresh'])
num_negative_documents = len(train_df[train_df['Freshness'] == 'Rotten'])

# Document frequencies: in how many reviews (overall / per class) each token appears.
# A single pass over the training reviews replaces re-tokenising the whole corpus
# three times for every vocabulary word.
doc_freq = Counter()
positive_doc_freq = Counter()
negative_doc_freq = Counter()
for review, label in zip(train_df['Review'], train_df['Freshness']):
    # A set, so that a word repeated inside one review counts that review once.
    unique_tokens = set(review.lower().split())
    doc_freq.update(unique_tokens)
    if label == 'Fresh':
        positive_doc_freq.update(unique_tokens)
    elif label == 'Rotten':
        negative_doc_freq.update(unique_tokens)

# probability of occurrence: fraction of all training reviews containing the word
word_prob = {word: doc_freq[word] / num_documents for word in vocabulary}

# conditional probability given positive sentiment
word_given_positive_prob = {
    word: positive_doc_freq[word] / num_positive_documents for word in vocabulary
}

# conditional probability given negative sentiment
word_given_negative_prob = {
    word: negative_doc_freq[word] / num_negative_documents for word in vocabulary
}


In [None]:
# implement NBC to classify the reviews in the development set
def classify(review):
    """Label a review 'Fresh' or 'Rotten' with the unsmoothed Naive Bayes model.

    The review is lower-cased and split on whitespace. For every token that is a
    vocabulary word, the log of its per-class conditional probability is added to
    that class's score; the log class prior is added at the end.

    Args:
        review: Review text as a string.

    Returns:
        'Fresh' if the positive score is strictly greater than the negative score,
        otherwise 'Rotten' (ties resolve to 'Rotten').
    """
    log_prob_positive = 0
    log_prob_negative = 0

    # A word never seen in one class has probability 0 there, so log(0) = -inf is
    # expected without smoothing; silence NumPy's divide-by-zero warning for it.
    with np.errstate(divide='ignore'):
        for word in review.lower().split():
            # The probability dicts are keyed by the vocabulary words, so a dict
            # lookup replaces scanning the `vocabulary` list for every token.
            if word in word_given_positive_prob:
                log_prob_positive += np.log(word_given_positive_prob[word])
                log_prob_negative += np.log(word_given_negative_prob[word])

    log_prob_positive += np.log(num_positive_documents / num_documents)
    log_prob_negative += np.log(num_negative_documents / num_documents)

    return 'Fresh' if log_prob_positive > log_prob_negative else 'Rotten'

# `assign` returns a new frame, which avoids pandas' SettingWithCopyWarning when
# `dev_df` is a slice of another frame; re-running the cell just overwrites the column.
dev_df = dev_df.assign(**{'NBC Prediction': dev_df['Review'].apply(classify)})


In [None]:
# evaluate the accuracy of the NBC model on the development set
# Percentage of development reviews whose prediction equals the 'Freshness' label.
accuracy = (dev_df['NBC Prediction'] == dev_df['Freshness']).sum() / len(dev_df) * 100
print(f"Accuracy on development set: {accuracy:.2f}%")


In [None]:
# implement Laplace smoothing to compare the effect of smoothing
def classify_smoothed(review, alpha=1.0):
    """Label a review 'Fresh' or 'Rotten' using add-alpha smoothed probabilities.

    The per-class document count of each vocabulary word is recovered from its
    conditional probability, `alpha` is added to it, and the result is divided by
    the class document count plus `alpha * len(vocabulary)`.

    Args:
        review: Review text as a string.
        alpha: Smoothing strength. With alpha = 0 the estimates are unsmoothed, so
            words unseen in a class contribute log(0) = -inf to that class.

    Returns:
        'Fresh' if the positive score is strictly greater than the negative score,
        otherwise 'Rotten' (ties resolve to 'Rotten').
    """
    # NOTE(review): the probabilities are document frequencies (Bernoulli-style),
    # for which add-alpha smoothing usually divides by N + 2 * alpha; the original
    # N + alpha * |V| denominator is kept unchanged here -- confirm which is intended.
    positive_denominator = num_positive_documents + alpha * len(vocabulary)
    negative_denominator = num_negative_documents + alpha * len(vocabulary)

    log_prob_positive = 0
    log_prob_negative = 0

    # log(0) can only occur for alpha = 0, where -inf is the expected outcome.
    with np.errstate(divide='ignore'):
        for word in review.lower().split():
            # The probability dicts are keyed by the vocabulary words, so a dict
            # lookup replaces scanning the `vocabulary` list for every token.
            if word in word_given_positive_prob:
                positive_count = word_given_positive_prob[word] * num_positive_documents
                negative_count = word_given_negative_prob[word] * num_negative_documents
                log_prob_positive += np.log((positive_count + alpha) / positive_denominator)
                log_prob_negative += np.log((negative_count + alpha) / negative_denominator)

    log_prob_positive += np.log(num_positive_documents / num_documents)
    log_prob_negative += np.log(num_negative_documents / num_documents)

    return 'Fresh' if log_prob_positive > log_prob_negative else 'Rotten'

# evaluate the accuracy of NBC with and without smoothing on the development set
alphas = [0, 0.1, 0.5, 1, 2, 5, 10, 20, 50]  # alpha = 0 is the unsmoothed case
accuracies = []

for alpha in alphas:
    smoothed_predictions = dev_df['Review'].apply(lambda x: classify_smoothed(x, alpha))
    # `assign` returns a new frame, which avoids pandas' SettingWithCopyWarning when
    # `dev_df` is a slice; after the loop the column holds the last alpha's predictions.
    dev_df = dev_df.assign(**{'NBC Prediction Smoothed': smoothed_predictions})
    accuracy = (smoothed_predictions == dev_df['Freshness']).sum() / len(dev_df) * 100
    accuracies.append(accuracy)

# Development-set accuracy (in percent) as a function of the smoothing strength.
fig, ax = plt.subplots()
ax.plot(alphas, accuracies, 'bo-')
ax.set_xlabel('Alpha')
ax.set_ylabel('Accuracy')
ax.set_title('Effect of Laplace smoothing')
plt.show()


In [None]:
# rank the vocabulary by P(word | class), highest first, and keep the ten best per class
top_positive_words = [
    word
    for word, _ in sorted(
        word_given_positive_prob.items(), key=lambda pair: pair[1], reverse=True
    )[:10]
]
top_negative_words = [
    word
    for word, _ in sorted(
        word_given_negative_prob.items(), key=lambda pair: pair[1], reverse=True
    )[:10]
]

print(f"Top 10 words that predict positive reviews: {top_positive_words}")
print(f"Top 10 words that predict negative reviews: {top_negative_words}")


In [None]:
# classify the test set and calculate accuracy
# Uses the objects this notebook actually defines: the held-out `test_df` split and
# the smoothed classifier, with the same alpha = 1.0 the cell asked for.
correct = 0
for review, label in zip(test_df['Review'], test_df['Freshness']):
    predicted_label = classify_smoothed(review, alpha=1.0)
    if predicted_label == label:
        correct += 1

# Reported as a fraction in [0, 1], not a percentage.
test_accuracy = correct / len(test_df)
print(f"Test accuracy: {test_accuracy}")
