# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Constants

In [2]:
# Vocabulary size (number of distinct tokens).
VOCAB_SIZE = 2500

# Every file used in this notebook lives in the same testing directory.
_TESTING_DIR = "SpamData/SpamData/03_Testing"

# Per-token probability tables.
TOKEN_SPAM_PROB_FILE = _TESTING_DIR + "/prob-spam.txt"
TOKEN_HAM_PROB_FILE = _TESTING_DIR + "/prob-nonspam.txt"
TOKEN_ALL_PROB_FILE = _TESTING_DIR + "/prob-all-tokens.txt"

# Test-set feature matrix and its targets.
TEST_FEATURE_MATRIX = _TESTING_DIR + "/test-features.txt"
TEST_TARGET_FILE = _TESTING_DIR + "/test-target.txt"

# Load Data

In [3]:
# Features
X_test = np.loadtxt(TEST_FEATURE_MATRIX, delimiter=" ")
# Target
y_test = np.loadtxt(TEST_TARGET_FILE, delimiter=" ")
# Token probabilities
prob_token_spam = np.loadtxt(TOKEN_SPAM_PROB_FILE, delimiter=" ")
prob_token_ham = np.loadtxt(TOKEN_HAM_PROB_FILE, delimiter=" ")
prob_all_tokens = np.loadtxt(TOKEN_ALL_PROB_FILE, delimiter=" ")

# Fail fast if the files do not belong together. A target file with the wrong
# number of rows would otherwise load silently and only show up later as a
# confusing error or wrong result.
assert X_test.ndim == 2 and X_test.shape[1] == VOCAB_SIZE, (
    f"feature matrix has shape {X_test.shape}, expected (n_rows, {VOCAB_SIZE})"
)
assert y_test.shape == (X_test.shape[0],), (
    f"target has shape {y_test.shape}, expected ({X_test.shape[0]},)"
)
for _label, _probs in (
    ("prob_token_spam", prob_token_spam),
    ("prob_token_ham", prob_token_ham),
    ("prob_all_tokens", prob_all_tokens),
):
    assert _probs.shape == (VOCAB_SIZE,), (
        f"{_label} has shape {_probs.shape}, expected ({VOCAB_SIZE},)"
    )

In [5]:
# Peek at the loaded test targets.
y_test

array([1., 1., 1., ..., 0., 0., 0.])

In [7]:
# Number of target values loaded.
y_test.shape

(1724,)

In [6]:
# Peek at the loaded test feature matrix.
X_test

array([[0., 0., 1., ..., 0., 0., 0.],
       [6., 1., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 4., 0., ..., 0., 0., 0.],
       [2., 2., 0., ..., 0., 0., 0.],
       [1., 9., 1., ..., 0., 0., 0.]])

In [8]:
# (rows, columns) of the feature matrix.
X_test.shape

(1724, 2500)

In [9]:
# Peek at the values loaded from TOKEN_SPAM_PROB_FILE.
prob_token_spam

array([1.21855742e-02, 5.22878010e-03, 6.80580255e-03, ...,
       1.11845564e-05, 5.59227818e-06, 1.45399233e-04])

In [10]:
# Length of the spam token-probability vector.
prob_token_spam.shape

(2500,)

In [11]:
# Peek at the values loaded from TOKEN_HAM_PROB_FILE.
prob_token_ham

array([2.14750593e-02, 1.01423056e-02, 8.00811384e-03, ...,
       1.09646547e-04, 9.78987023e-05, 1.17478443e-05])

In [12]:
# Length of the ham token-probability vector.
prob_token_ham.shape

(2500,)

In [13]:
# Peek at the values loaded from TOKEN_ALL_PROB_FILE.
prob_all_tokens

array([1.78501528e-02, 8.20860051e-03, 7.59580972e-03, ...,
       6.52400835e-05, 5.59200716e-05, 6.29100805e-05])

In [14]:
# Length of the all-tokens probability vector.
prob_all_tokens.shape

(2500,)

In [15]:
# Matrix-vector product of the feature matrix with the spam token
# probabilities; yields one value per row of X_test.
X_test.dot(prob_token_spam)

array([0.43077878, 0.14572918, 0.14635551, ..., 0.57105549, 0.0908913 ,
       0.26089655])

In [16]:
# Confirm the product has one entry per row of X_test.
X_test.dot(prob_token_spam).shape

(1724,)

## Set the Prior

$$P(Spam\,|\,X)\,=\,\frac{P(X\,|\,Spam)\,P(Spam)}{P(X)}$$

In [17]:
# Prior P(Spam) for the Bayes formula in this section.
# NOTE(review): hard-coded; presumably the share of spam in the training
# data — confirm the source, and consider moving this to the Constants cell.
PROB_SPAM = 0.3116

In [25]:
# Element-wise natural log of the spam token probabilities.
# NOTE(review): a zero probability would give -inf here; presumably the
# values were smoothed when the file was produced — confirm.
log_prob_spam = np.log(prob_token_spam)

In [26]:
# Element-wise natural log of the ham token probabilities.
# NOTE(review): a zero probability would give -inf here; presumably the
# values were smoothed when the file was produced — confirm.
log_prob_ham = np.log(prob_token_ham)

In [27]:
# Element-wise natural log of the all-tokens probabilities.
# NOTE(review): a zero probability would give -inf here; presumably the
# values were smoothed when the file was produced — confirm.
log_prob_all_tokens = np.log(prob_all_tokens)

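### Joint probability in log format

Taking logs turns the product of many small per-token probabilities into a sum, which avoids floating-point underflow:

$$\log P(Spam\,|\,X)\,=\,\log P(X\,|\,Spam)\,-\,\log P(X)\,+\,\log P(Spam)$$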

In [28]:
# Log prior plus, for every row of X_test, the sum of
# (log P(token | Spam) - log P(token)) weighted by that row's entries.
joint_log_spam = (
    np.log(PROB_SPAM)
    + np.dot(X_test, log_prob_spam - log_prob_all_tokens)
)

In [29]:
# Inspect the per-row spam scores.
joint_log_spam

array([  24.27580867,    2.15999942,   20.59075715, ..., -374.67511827,
         -9.9024603 , -112.0276256 ])

$$P(Ham\,|\,X)\,=\,\frac{P(X\,|\,Ham)\,(1\,-\,P(Spam))}{P(X)}$$

In [33]:
# Log of the ham prior (1 - P(Spam)) plus, for every row of X_test, the sum of
# (log P(token | Ham) - log P(token)) weighted by that row's entries.
joint_log_ham = (
    np.log(1 - PROB_SPAM)
    + np.dot(X_test, log_prob_ham - log_prob_all_tokens)
)

In [34]:
# Inspect the per-row ham scores.
joint_log_ham

array([-6.09682485e+01, -1.10090532e+01, -3.79678354e+01, ...,
        6.09843402e+01, -5.86285468e-02,  2.44574161e+01])