# Naive Bayes Classifier

In [1]:
from scipy.optimize import minimize
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import os
import sys
from collections import Counter
import string

In [2]:
# get all the files organized
ham_dir = ["./enron"+str(i)+"/ham" for i in range(1,6)]
spam_dir = ["./enron"+str(i)+"/spam" for i in range(1,6)]

ham_files = []
spam_files = []

for directory in ham_dir:
    ham_files += [directory +"/"+x for x in os.listdir(directory)]
for directory in spam_dir:
    spam_files += [directory +"/"+x for x in os.listdir(directory)]
    
ham_test_files = ["./enron6/ham/"+x for x in os.listdir("./enron6/ham/")]
spam_test_files = ["./enron6/spam/"+x for x in os.listdir("./enron6/spam/")]

all_files = ham_files + spam_files + ham_test_files + spam_test_files
len(all_files)

33713

In [5]:
#first we need to extract the vocabulary of everything in the training and test set. 
def get_word_list(fileset):
    all_text = ""
    for f in fileset:
        with open(f, 'r') as myfile:
            data=myfile.read().replace('\n', ' ').replace('\r', ' ')
            #print data, "\n\n"
            all_text += " " + data
    return all_text.split()

vocabulary = list(set(get_word_list(all_files))) 

In [6]:
len(vocabulary)

159212

In [7]:
ham_prior = len(ham_files)/float(len(ham_files) + len(spam_files))
spam_prior = 1 - ham_prior

Find P(test_example|spam) = P(word 1 in test_example|spam)P(word 2 in test_example|spam)...P(word n in test_example|spam) but all in log scale and also without laplace smoothing. 

In [8]:
ham_counts = Counter(get_word_list(ham_files))

In [10]:
def get_log_probabilities(f, counts, laplace, len_vocabulary):
    with open(f, 'r') as myfile:
        data=myfile.read().replace('\n', ' ').replace('\r', ' ')
        words = data.split()
        #print words
        s = 0
        n = float(sum(counts.values()))
        if laplace:
            for word in words:
                s += np.log((counts[word] + 1)/float(n + len_vocabulary))
        else:
            for word in words:
                if counts[word] > 0:
                    s += np.log(counts[word]/n)
                else:
                    s = -np.inf
                    
        return s
    
    
c = Counter(["Ch", "Be", "Ch", "Ch", "Ch", "Sh", "Ch", "Ma"])
c1 = Counter(["To", "Ja", "Ch"])
print get_log_probabilities("./toytest/ham/5.txt", c, laplace = True, len_vocabulary=6) + np.log(.75)
print get_log_probabilities("./toytest/ham/5.txt", c1, laplace = True, len_vocabulary=6) + np.log(.25)

-8.10769031284
-8.906681345


In [11]:
ham_counts = Counter(get_word_list(ham_files))
spam_counts = Counter(get_word_list(spam_files))

In [12]:
# so now implement the above. 
############# HAM ##################
l = len(vocabulary)
n_correct = 0
c = 0
for f in ham_test_files:
    #ham log probabilities
    p_ham = np.log(ham_prior) + get_log_probabilities(f, ham_counts, laplace = False, len_vocabulary = l)
    p_spam = np.log(spam_prior) + get_log_probabilities(f, spam_counts, laplace = False, len_vocabulary = l)
    #print p_ham, p_spam
    if p_ham > p_spam:
        n_correct += 1
    elif p_ham == p_spam:
        n_correct += np.random.binomial(1, ham_prior, size =1)
    c += 1 
    #print c

    
print n_correct, "/", len(ham_test_files)    

[971] / 1500


In [13]:
# so now implement the above. 
############# SPAM ##################
l = len(vocabulary)
n_correct = 0
c = 0
for f in spam_test_files:
    #ham log probabilities
    p_ham = np.log(ham_prior) + get_log_probabilities(f, ham_counts, laplace = False, len_vocabulary = l)
    p_spam = np.log(spam_prior) + get_log_probabilities(f, spam_counts, laplace = False, len_vocabulary = l)
    #print p_ham, p_spam
    if p_spam > p_ham:
        n_correct += 1
    elif p_ham == p_spam:
        n_correct += np.random.binomial(1, ham_prior, size =1)
    c += 1 
    #print c

print n_correct, "/", len(spam_test_files)    

[3209] / 4499


In [15]:
# figure out the discriminative words.
def get_log_probabilities2(f, counts, len_vocabulary):
    with open(f, 'r') as myfile:
        data=myfile.read().replace('\n', ' ').replace('\r', ' ')
        words = data.split()
        #print words
        dct = {}
        n = float(sum(counts.values()))
        for word in words:
            dct[word] = np.log((counts[word] + 1)/float(n + len_vocabulary))
            
        return dct

c = Counter(["Ch", "Be", "Ch", "Ch", "Ch", "Sh", "Ch", "Ma"])
c1 = Counter(["To", "Ja", "Ch"])
get_log_probabilities2("./toytest/ham/5.txt", c, len_vocabulary=6)

{'Ch': -0.84729786038720367,
 'Ja': -2.6390573296152589,
 'To': -2.6390573296152589}

In [16]:
#SPAM FILES
l = len(vocabulary)
n_correct = 0
c = 0
dct = {}
for f in spam_test_files:
    d = get_log_probabilities2(f, spam_counts, len_vocabulary=l)
    dct.update(d)

In [17]:
spam_word_relevancies = pd.DataFrame.from_dict(dct, orient = "index")
spam_word_relevancies.columns = ["relevancy"]
spam_word_relevancies.sort_values(by = "relevancy", ascending = False, inplace=True) 
spam_word_relevancies

Unnamed: 0,relevancy
.,-2.942806
",",-3.463570
-,-3.798922
the,-3.821543
to,-4.041874
_,-4.130527
and,-4.188272
of,-4.274561
:,-4.450663
a,-4.496027


In [18]:
#HAM FILES
l = len(vocabulary)
n_correct = 0
c = 0
dct = {}
for f in ham_test_files:
    d = get_log_probabilities2(f, ham_counts, len_vocabulary=l)
    dct.update(d)

In [19]:
ham_word_relevancies = pd.DataFrame.from_dict(dct, orient = "index")
ham_word_relevancies.columns = ["relevancy"]
ham_word_relevancies.sort_values(by = "relevancy", ascending = False, inplace=True) 
ham_word_relevancies

Unnamed: 0,relevancy
-,-2.975587
.,-3.084733
",",-3.294362
the,-3.530385
to,-3.865963
/,-3.876606
:,-3.967353
and,-4.313026
of,-4.389943
a,-4.542910


In [20]:
relevancies = ham_word_relevancies.merge(spam_word_relevancies, how='inner', left_index=True, right_index=True)
relevancies["difference"] = abs(relevancies["relevancy_x"] - relevancies["relevancy_y"])
relevancies.sort_values(by= "difference", ascending= False)

Unnamed: 0,relevancy_x,relevancy_y,difference
enron,-4.631866,-15.043340,10.411474
ect,-5.083544,-12.645445,7.561901
,-7.811059,-15.043340,7.232281
xls,-8.550871,-15.043340,6.492469
eol,-8.742885,-15.043340,6.300456
hpl,-7.692071,-13.944728,6.252657
louise,-7.348902,-13.433902,6.085000
sex,-14.454415,-8.516846,5.937569
713,-7.471552,-13.251581,5.780029
vince,-6.492115,-12.152969,5.660853


In [21]:
# it may be advisable to write a table for every word in vocabulary. 
def vocab_log_probabilites(ham_counts, spam_counts, vocabulary):
    len_vocabulary = len(vocabulary)
    spam_dct = {}
    ham_dct = {}
    n_ham = float(sum(ham_counts.values()))
    n_spam = float(sum(spam_counts.values()))
    for v in vocabulary:
        ham_dct[v] = np.log((ham_counts[v] + 1)/float(n_ham + len_vocabulary))
        spam_dct[v] = np.log((spam_counts[v] + 1)/float(n_spam + len_vocabulary))
    
    ham_word_relevancies = pd.DataFrame.from_dict(ham_dct, orient = "index")
    ham_word_relevancies.columns = ["ham_relevancy"]
    spam_word_relevancies = pd.DataFrame.from_dict(spam_dct, orient = "index")
    spam_word_relevancies.columns = ["spam_relevancy"]
    relevancies = ham_word_relevancies.merge(spam_word_relevancies, how='outer', left_index=True, right_index=True)
    relevancies["difference"] = abs(relevancies["ham_relevancy"] - relevancies["spam_relevancy"])
    relevancies.sort_values(by= "difference", ascending= False, inplace=True)
    return relevancies
    

In [22]:
vlp = vocab_log_probabilites(ham_counts, spam_counts, vocabulary)
vlp

Unnamed: 0,ham_relevancy,spam_relevancy,difference
enron,-4.631866,-15.043340,10.411474
kaminski,-7.074783,-15.043340,7.968558
dynegy,-7.084814,-15.043340,7.958526
pills,-15.553027,-7.654394,7.898633
viagra,-15.553027,-7.742868,7.810160
ect,-5.083544,-12.645445,7.561901
computron,-15.553027,-8.069797,7.483230
cialis,-15.553027,-8.288736,7.264291
ees,-7.810625,-15.043340,7.232715
,-7.811059,-15.043340,7.232281


In [181]:
vlp.loc[vlp.index == "this", ]

Unnamed: 0,ham_relevancy,spam_relevancy,difference
this,-5.373348,-4.963885,0.409463


In [180]:
vlp.loc[vlp.index == "that", ]

Unnamed: 0,ham_relevancy,spam_relevancy,difference
that,-5.137525,-5.4534,0.315875


In [23]:
def rel_counts(word, ham_counts=ham_counts, spam_counts=spam_counts):
    print word
    print "spam count: ", spam_counts[word]
    print "ham count: ",  ham_counts[word]

In [24]:
rel_counts("ect")

ect
spam count:  10
ham count:  35223


In [25]:
rel_counts("enron")

enron
spam count:  0
ham count:  55334
