In [32]:
import re
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from collections import defaultdict

In [33]:
# Read both polarity files as lists of raw lines (trailing newlines kept).
# latin-1 is used because the corpus contains bytes that are not valid UTF-8.
with open('rt-polarity.pos', encoding='latin-1') as f:
    pos_data = list(f)

with open('rt-polarity.neg', encoding='latin-1') as f:
    neg_data = list(f)

In [34]:
def preprocess_text(text):
    """Normalise one review for bag-of-words modelling.

    The text is lowercased, every character in ``string.punctuation`` is
    deleted (not replaced by a space), and whitespace-separated tokens that
    appear in ``ENGLISH_STOP_WORDS`` are dropped.

    Args:
        text: Raw review string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    lowered = text.lower()
    stripped = re.sub(f'[{re.escape(string.punctuation)}]', '', lowered)

    kept_tokens = []
    for token in stripped.split():
        if token in ENGLISH_STOP_WORDS:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [35]:
# Normalise every review with the shared preprocessing routine.
pos_data = list(map(preprocess_text, pos_data))
neg_data = list(map(preprocess_text, neg_data))

# Positional split per class: first 4000 -> train, next 500 -> validation,
# everything from index 4500 onwards -> test.
train_pos, train_neg = pos_data[:4000], neg_data[:4000]
val_pos, val_neg = pos_data[4000:4500], neg_data[4000:4500]
test_pos, test_neg = pos_data[4500:], neg_data[4500:]

# Each split lists positives first, then negatives; labels are 1 = positive, 0 = negative.
X_train = train_pos + train_neg
X_val = val_pos + val_neg
X_test = test_pos + test_neg

y_train = [1 for _ in train_pos] + [0 for _ in train_neg]
y_val = [1 for _ in val_pos] + [0 for _ in val_neg]
y_test = [1 for _ in test_pos] + [0 for _ in test_neg]

In [36]:
import numpy as np

class NaiveBayesClassifier:
    """Binary multinomial Naive Bayes over whitespace-tokenised sentences.

    Labels must be 0 or 1 (the class-1 prior is computed as ``sum(y) / n``).
    Word likelihoods use add-one (Laplace) smoothing:
    ``(count(word, c) + 1) / (tokens_in_class_c + |vocab|)``.

    Attributes:
        vocab: Set of every distinct token seen during ``fit``.
        word_counts: ``{label: defaultdict(int)}`` of per-class token counts.
        class_totals: ``{label: int}`` total number of tokens per class.
        priors: ``{label: float}`` class prior probabilities.
    """

    def __init__(self):
        self._reset()

    def _reset(self):
        """(Re)initialise all learned state to an empty model."""
        self.vocab = set()
        self.word_counts = {0: defaultdict(int), 1: defaultdict(int)}
        self.class_totals = {0: 0, 1: 0}
        self.priors = {0: 0, 1: 0}

    def fit(self, X_train, y_train):
        """Estimate priors and per-class word counts from labelled sentences.

        Any previously learned state is discarded, so calling ``fit`` again
        (e.g. when a notebook cell is re-run) does not double the counts.

        Args:
            X_train: Iterable of sentences (strings); tokens are obtained with
                ``str.split()``.
            y_train: Sequence of 0/1 labels aligned with ``X_train``.

        Raises:
            ValueError: If ``y_train`` is empty, or if ``X_train`` is sized and
                its length differs from ``len(y_train)``.
        """
        total_examples = len(y_train)
        if total_examples == 0:
            raise ValueError("Cannot fit on an empty training set.")
        # zip() would silently truncate a mismatch while the priors below
        # still use all of y_train, so reject it up front when detectable.
        if hasattr(X_train, '__len__') and len(X_train) != total_examples:
            raise ValueError("X_train and y_train must have the same length.")

        self._reset()
        self.priors[1] = sum(y_train) / total_examples
        self.priors[0] = 1 - self.priors[1]

        for sentence, label in zip(X_train, y_train):
            words = sentence.split()
            self.class_totals[label] += len(words)
            counts = self.word_counts[label]
            for word in words:
                self.vocab.add(word)
                counts[word] += 1

    def predict(self, X_test):
        """Predict a 0/1 label for every sentence in ``X_test``.

        Scores are summed log-probabilities; class 1 is returned only when its
        score is strictly greater, so ties resolve to class 0. Words never seen
        in training contribute the smoothed zero-count probability to both
        classes. The model is not modified by prediction.

        Args:
            X_test: Iterable of sentences (strings).

        Returns:
            List of integer predictions (0 or 1), one per input sentence.
        """
        vocab_size = len(self.vocab)
        # Loop-invariant pieces, computed once instead of per word/sentence.
        denom_pos = self.class_totals[1] + vocab_size
        denom_neg = self.class_totals[0] + vocab_size
        pos_counts = self.word_counts[1]
        neg_counts = self.word_counts[0]
        # A prior of 0 (single-class training data) legitimately yields -inf;
        # silence only that divide warning.
        with np.errstate(divide='ignore'):
            log_prior_pos = np.log(self.priors[1])
            log_prior_neg = np.log(self.priors[0])

        predictions = []
        for sentence in X_test:
            log_prob_pos = log_prior_pos
            log_prob_neg = log_prior_neg

            for word in sentence.split():
                # .get() avoids inserting unseen words into the defaultdicts,
                # which indexing would do as a side effect.
                log_prob_pos += np.log((pos_counts.get(word, 0) + 1) / denom_pos)
                log_prob_neg += np.log((neg_counts.get(word, 0) + 1) / denom_neg)

            predictions.append(1 if log_prob_pos > log_prob_neg else 0)
        return predictions
    
# Train on the training split only, then score validation and test in that order.
nb_classifier = NaiveBayesClassifier()
nb_classifier.fit(X_train, y_train)

y_pred_val, y_pred_test = (
    nb_classifier.predict(split) for split in (X_val, X_test)
)


In [38]:
from sklearn.metrics import confusion_matrix, f1_score

def report_performance(split_name, y_true, y_pred):
    """Print and return confusion-matrix counts and F1 for one data split.

    ``labels=[0, 1]`` pins the confusion matrix to 2x2 even when a class is
    absent from both ``y_true`` and ``y_pred``; without it the matrix can
    collapse to 1x1 and the four-way unpack below would fail.

    Args:
        split_name: Label used in the printed header (e.g. "Validation").
        y_true: Ground-truth 0/1 labels.
        y_pred: Predicted 0/1 labels, aligned with ``y_true``.

    Returns:
        Tuple ``(cm, (tn, fp, fn, tp), f1)``.
    """
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    # For binary labels, ravel() on the 2x2 matrix yields tn, fp, fn, tp.
    tn, fp, fn, tp = cm.ravel()
    f1 = f1_score(y_true, y_pred)

    print(f"{split_name} Set Performance:")
    print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")
    print(f"F1-Score: {f1:.4f}")
    return cm, (tn, fp, fn, tp), f1


cm_val, (TN_val, FP_val, FN_val, TP_val), f1_val = report_performance(
    "Validation", y_val, y_pred_val
)

# Blank line between the two reports.
print()

cm_test, (TN_test, FP_test, FN_test, TP_test), f1_test = report_performance(
    "Test", y_test, y_pred_test
)


Validation Set Performance:
TP: 376, TN: 387, FP: 113, FN: 124
F1-Score: 0.7604

Test Set Performance:
TP: 627, TN: 643, FP: 188, FN: 204
F1-Score: 0.7618
