Code for Feature_Matrix

In [1]:
import numpy as np
import math
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

import pickle

from sklearn.metrics import confusion_matrix, classification_report

def NB_XGivenY(XTrain, yTrain, a=0.001, b=0.9):
    """
    Compute the probability P(X|Y).
    """
    num_samples, vocab_size = XTrain.shape
    D = np.zeros((2, vocab_size))

    for j in range(vocab_size):
        D[0, j] = (np.sum(XTrain[yTrain[:, 0] == 1, j]) + a) / (np.sum(yTrain == 1) + a + b)
        D[1, j] = (np.sum(XTrain[yTrain[:, 0] == 2, j]) + a) / (np.sum(yTrain == 2) + a + b)

    return D

def NB_YPrior(yTrain):
    """
    Compute the probability P(Y).
    """
    p = np.sum(yTrain == 1) / len(yTrain)
    return p

def NB_Classify(D, p, X):
    """
    Predict the labels of X.
    """
    num_samples, _ = X.shape
    y = np.zeros((num_samples, 1))

    log_D = np.log(D)
    log_D_inv = np.log(1 - D)
    log_prior = np.log(p)
    log_prior_inv = np.log(1 - p)

    for i in range(num_samples):
        log_p_y1_given_x = np.sum(X[i, :] * log_D[0, :]) + np.sum((1 - X[i, :]) * log_D_inv[0, :]) + log_prior
        log_p_y2_given_x = np.sum(X[i, :] * log_D[1, :]) + np.sum((1 - X[i, :]) * log_D_inv[1, :]) + log_prior_inv
        y[i, 0] = 1 if log_p_y1_given_x > log_p_y2_given_x else 2

    return y

def NB_ClassificationAccuracy(yHat, yTruth):
    """
    Compute the accuracy of predictions.
    """
    acc = np.sum(yHat == yTruth) / len(yTruth)
    return acc

# Load the data
with open('hwdata.pkl', 'rb') as file:
    data = pickle.load(file)

Vocabulary = data['Vocabulary']
XTrain = data['XTrain']
yTrain = data['yTrain']
XTest = data['XTest']
yTest = data['yTest']

# Train the Naive Bayes classifier
D = NB_XGivenY(XTrain, yTrain)
p = NB_YPrior(yTrain)

# Classify the test data
yHat = NB_Classify(D, p, XTest)

# Calculate and report the accuracy
accuracy = NB_ClassificationAccuracy(yHat, yTest)
print(f"Accuracy on Test Data: {accuracy * 100:.2f}%")

#Confusion matrix
cm = confusion_matrix(yTest, yHat)
print("Confusion Matrix:\n", cm)

#Classification Report (Precise, Reall, F-1 Score)
report = classification_report(yTest, yHat, target_names=['The Economist', 'The Onion'])
print("Classification Report:\n", report)

#Feature Importance Analysis
#Calculate the log probability ratio for each word
log_prob_ratio = np.log(D[0, :]) - np.log(D[1, :])

#Sort words by their log probability ratio
sorted_idx = np.argsort(log_prob_ratio)

#Print the top 10 words
print("Top 10 words that are most indicative of The Economist:")
for i in range(10):
    print(f"{i + 1}. {Vocabulary[sorted_idx[i]]}")

print("Top 10 words that are most indicative of The Onion:")
for i in range(10):
    print(f"{i + 1}. {Vocabulary[sorted_idx[-i - 1]]}")

Accuracy on Test Data: 97.39%
Confusion Matrix:
 [[103   0]
 [  4  46]]
Classification Report:
                precision    recall  f1-score   support

The Economist       0.96      1.00      0.98       103
    The Onion       1.00      0.92      0.96        50

     accuracy                           0.97       153
    macro avg       0.98      0.96      0.97       153
 weighted avg       0.97      0.97      0.97       153

Top 10 words that are most indicative of The Economist:
1. ['4enlarg']
2. ['5enlarg']
3. ['percent']
4. ['realiz']
5. ['center']
6. ['myself']
7. ['approxim']
8. ['honor']
9. ['fuck']
10. ['favor']
Top 10 words that are most indicative of The Onion:
1. ['parliament']
2. ['organis']
3. ['favour']
4. ['labour']
5. ['reckon']
6. ['centr']
7. ['neighbour']
8. ['conserv']
9. ['parliamentari']
10. ['boost']
