In [3]:

import numpy as np
import math
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

import pickle

def NB_XGivenY(XTrain, yTrain, a=0.001, b=0.9):
    """
    Compute the probability of P(X|Y).

    :param
        XTrain: numpy array of size [num_samples, feat_dim]
          where num_samples is the number of samples
          and feat_dim is the dimension of features
        yTrain: numpy array of size [num_samples, 1]
        a: default to 0.001
        b: default to 0.9

    :return: 
        D: numpy array of size [2, vocab_size] where
          vocab_size is the size of vocabulary
    """
    samples, vocab = XTrain.shape
    D = np.zeros((2, vocab))

    for i in range(vocab):
        D[0, i] = (np.sum(XTrain[yTrain[:, 0] == 1, i]) + a) / (np.sum(yTrain == 1) + a + b)
        D[1, i] = (np.sum(XTrain[yTrain[:, 0] == 2, i]) + a) / (np.sum(yTrain == 2) + a + b)
    return D


def NB_YPrior(yTrain):
    """
    Compute the probability of P(Y).

    :param
        yTrain: numpy array of size [num_samples, 1]

    :return: 
        p: a scalar for the probability of P(Y = 1)
    """
    p = np.sum(yTrain == 1) / len(yTrain)
    return p


def NB_Classify(D, p, X):
    """
    Predict the labels of X.

    :param
        D: the probability P(X|Y)
        p: the probability P(Y)
        X: numpy array of size [num_samples, feat_dim]
          where num_samples is the number of samples
          and feat_dim is the dimension of features

    :return: 
        y: numpy array of size [num_samples, 1] where
            num_samples is the number of samples
    """
    num_samples, _ = X.shape
    y = np.zeros((num_samples, 1))

    log_dataset = np.log(D)
    log_dataset_inv = np.log(1 - D)
    log_prior = np.log(p)
    log_prior_inv = np.log(1 - p)

    for i in range(num_samples):
        log_likelihood_1 = np.sum(X[i, :] * log_dataset[0, :]) + np.sum((1 - X[i, :]) * log_dataset_inv[0, :]) + log_prior
        log_likelihood_2 = np.sum(X[i, :] * log_dataset[1, :]) + np.sum((1 - X[i, :]) * log_dataset_inv[1, :]) + log_prior_inv
        y[i, 0] = 1 if log_likelihood_1 > log_likelihood_2 else 2

    return y


def NB_ClassificationAccuracy(yHat, yTruth):
    """
    Compute the accuracy of predictions.

    :param
        yHat: numpy array of size [num_samples, 1]
        yTruth: numpy array of size [num_samples, 1]
    
    :return:
        acc: a scalar for the accuracy
    """
    acc = np.sum(yHat == yTruth) / len(yTruth)
    return acc


In [6]:
from sklearn.metrics import confusion_matrix, classification_report

# Loading data from file
with open('hwdata.pkl', 'rb') as file:
    data = pickle.load(file)

word_vocabulary = data['Vocabulary']
training_features = data['XTrain']
training_labels = data['yTrain']
test_features = data['XTest']
test_labels = data['yTest']

# Training Naive Bayes
likelihood_given_y = NB_XGivenY(training_features, training_labels)
prior_prob_y = NB_YPrior(training_labels)

# Classifying
predicted_labels = NB_Classify(likelihood_given_y, prior_prob_y, test_features)

accuracy = NB_ClassificationAccuracy(predicted_labels, test_labels)
print(f"Test Accuracy : {accuracy * 100:.2f}%")

confusion_mat = confusion_matrix(test_labels, predicted_labels)
print("Confusion Matrix:\n", confusion_mat)

# Classification Report 
class_report = classification_report(test_labels, predicted_labels, target_names=['The Economist', 'The Onion'])
print("Report:", class_report)


log_prob_ratio = np.log(likelihood_given_y[0, :]) - np.log(likelihood_given_y[1, :])

sorted_indices = np.argsort(log_prob_ratio)
# Printing the top 10 words for each class
def print_top_words(class_name, indices, vocabulary):
    print(f"Top 10 words that are most indicative of {class_name}:")
    for i in range(10):
        print(f"{i + 1}. {vocabulary[indices[i]]}")

print_top_words('The Economist', sorted_indices, word_vocabulary)
print_top_words('The Onion', sorted_indices[::-1], word_vocabulary)


Test Accuracy : 97.39%
Confusion Matrix:
 [[103   0]
 [  4  46]]
Report:                precision    recall  f1-score   support

The Economist       0.96      1.00      0.98       103
    The Onion       1.00      0.92      0.96        50

     accuracy                           0.97       153
    macro avg       0.98      0.96      0.97       153
 weighted avg       0.97      0.97      0.97       153

Top 10 words that are most indicative of The Economist:
1. ['4enlarg']
2. ['5enlarg']
3. ['percent']
4. ['realiz']
5. ['center']
6. ['myself']
7. ['approxim']
8. ['honor']
9. ['fuck']
10. ['favor']
Top 10 words that are most indicative of The Onion:
1. ['parliament']
2. ['organis']
3. ['favour']
4. ['labour']
5. ['reckon']
6. ['centr']
7. ['neighbour']
8. ['conserv']
9. ['parliamentari']
10. ['boost']
