In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Vocabulary size constant (not referenced by the cells shown in this section).
VOCAB_SIZE = 2500

# Test-set inputs: feature matrix and target values, loaded in the "Load data" cell.
TEST_FEATURE_MATRIX = 'spam-data/test-features.txt'
TEST_TARGET_FILE = 'spam-data/test-target.txt'

# Token probability files (spam / ham / overall), one text file each.
TOKEN_SPAM_PROB_FILE = 'spam-data/token-spam-prob.txt'
TOKEN_HAM_PROB_FILE = 'spam-data/token-ham-prob.txt'
TOKEN_OVERALL_PROB_FILE = 'spam-data/token-overall-prob.txt'

## *Load data*

In [3]:
# Test set: features and target, both read as space-delimited text.
X_test = np.loadtxt(TEST_FEATURE_MATRIX, delimiter=' ')
y_test = np.loadtxt(TEST_TARGET_FILE, delimiter=' ')

# Token probabilities (spam, ham, overall), read the same way in one pass.
prob_token_spam, prob_token_ham, prob_token_overall = (
    np.loadtxt(prob_file, delimiter=' ')
    for prob_file in (
        TOKEN_SPAM_PROB_FILE,
        TOKEN_HAM_PROB_FILE,
        TOKEN_OVERALL_PROB_FILE,
    )
)

---
## *Bayes formula:*
## $P(Spam \, | \, X) = \frac{P(X \, | \, Spam) \, P(Spam)}{P(X)}$

In [4]:
# Set the prior (the prior is an initial 'guess' based on prior knowledge).
PRIOR_PROB_SPAM = 0.3116 # calculated in the training_model notebook
# Convert probabilities to log values: this simplifies the calculation (sums and
# differences replace multiplications and divisions) and spreads the values out
# for better visualization.
# NOTE: the result below is only displayed as the cell output; it is not stored.
np.log(prob_token_spam)

array([ -4.43156992,  -5.25657795,  -4.95782974, ..., -11.48212599,
        -9.23083419, -11.07666089])

## *Joint probability (Spam and Ham)*
### *using the dot product (multiply the whole X_test matrix by the vector of log-probability differences in a single operation)*

In [5]:
# spam
# np.log(0) evaluates to -inf, which would then leak inf/nan values into the
# dot product below. Locate every zero entry instead of relying on a single
# hard-coded index, so the cell keeps working if the probability file changes.
zero_prob_idx = np.where(prob_token_overall == 0)[0]
print(zero_prob_idx) # indices which contain the value 0 (causing problems to apply log)

# Patched in place on purpose: the ham cell reads prob_token_overall as well
# and needs the same nonzero values.
prob_token_overall[zero_prob_idx] = 0.00000000001 # make them nonzero

# Per-email score: sum over tokens of count * (log P(token | Spam) - log P(token)),
# plus the log prior.
joint_log_spam = X_test.dot(np.log(prob_token_spam) - np.log(prob_token_overall)) + np.log(PRIOR_PROB_SPAM)
joint_log_spam[:5]

[872]


array([ -7.08377979,  -0.38828262, -53.7297659 ,   6.83221631,
        -2.67213445])

In [6]:
# ham: P(Ham) = 1 - P(Spam)
# prob_token_overall must contain no zeros here (they are patched in the spam
# cell), otherwise np.log yields -inf for those entries.
ham_log_prior = np.log(1 - PRIOR_PROB_SPAM)
ham_token_log_ratio = np.log(prob_token_ham) - np.log(prob_token_overall)
joint_log_ham = X_test.dot(ham_token_log_ratio) + ham_log_prior
joint_log_ham[:5]

array([ -1.66619644,  -3.01136782,  -5.48219505, -22.98376209,
        -4.63684008])

---