In [31]:
import os
import re
import string
import math
import pickle
import sys
from random import randint

DATA_DIR = 'enron'
target_names = ['ham', 'spam']

def get_data(DATA_DIR, arq):
    #subfolders = ['enron%d' % i for i in range(1,2)]
    subfolders = ['enron%d' % arq]

    data = []
    target = []
    for subfolder in subfolders:
        # spam
        spam_files = os.listdir(os.path.join(DATA_DIR, subfolder, 'spam'))
        for spam_file in spam_files:
            #original:
            #with open(os.path.join(DATA_DIR, subfolder, 'spam', spam_file), encoding="latin-1") as f:
            with open(os.path.join(DATA_DIR, subfolder, 'spam', spam_file)) as f:
                data.append(f.read())
                target.append(1)

        # ham
        ham_files = os.listdir(os.path.join(DATA_DIR, subfolder, 'ham'))
        for ham_file in ham_files:
            #original:
            #with open(os.path.join(DATA_DIR, subfolder, 'ham', ham_file), encoding="latin-1") as f:
            with open(os.path.join(DATA_DIR, subfolder, 'ham', ham_file)) as f:    
                data.append(f.read())
                target.append(0)

    return data, target

class SpamDetector(object):
    """Implementation of Naive Bayes for binary classification"""
    def clean(self, s):
        return s.translate(None, string.punctuation)
        #original:
        #translator = str.maketrans("", "", string.punctuation)
        #return s.translate(translator)

    def tokenize(self, text):
        text = self.clean(text).lower()
        return re.split("\W+", text)

    def get_word_counts(self, words):
        word_counts = {}
        for word in words:
            word_counts[word] = word_counts.get(word, 0.0) + 1.0
        return word_counts

    def fit(self, X, Y):
        """Fit our classifier
        Arguments:
            X {list} -- list of document contents
            y {list} -- correct labels
        """
        self.log_class_priors = {}
        self.word_counts = {}
        self.vocab = set()

        n = 1.0*len(X)
        self.log_class_priors['spam'] = math.log(sum(1 for label in Y if label == 1) / n)
        self.log_class_priors['ham'] = math.log(sum(1 for label in Y if label == 0) / n)
        self.word_counts['spam'] = {}
        self.word_counts['ham'] = {}

        for x, y in zip(X, Y):
            c = 'spam' if y == 1 else 'ham'
            counts = self.get_word_counts(self.tokenize(x))
            for word, count in counts.items():
                if word not in self.vocab:
                    self.vocab.add(word)
                if word not in self.word_counts[c]:
                    self.word_counts[c][word] = 0.0

                self.word_counts[c][word] += count

    def predict(self, X, use_laplace_smoothing, laplace_smoothing_term, print_unknown_words_spam, print_unknown_words_ham):
        result = []
        laplace_sum_1 = sum(self.word_counts['spam'].values()) + laplace_smoothing_term*len(self.vocab)
        laplace_sum_2 = sum(self.word_counts['ham'].values()) + laplace_smoothing_term*len(self.vocab)

        for x in X:
            counts = self.get_word_counts(self.tokenize(x))
            spam_score = 0
            ham_score = 0

            spam_zero_frequency_word = False # does the email has a zero-frequency word in the spam class? Then P(spam)=0 if not using LS
            ham_zero_frequency_word = False # does the email has a zero-frequency word in the ham class? Then P(ham)=0 if not using LS
            
            for word, _ in counts.items():
                if word not in self.vocab: continue
                                
                if use_laplace_smoothing:
                    log_w_given_spam = math.log( (self.word_counts['spam'].get(word, 0.0) + laplace_smoothing_term) / (laplace_sum_1) )
                    log_w_given_ham = math.log( (self.word_counts['ham'].get(word, 0.0) + laplace_smoothing_term) / (laplace_sum_2) )
                else:
                    if self.word_counts['spam'].get(word, 0.0)==0:
                        if print_unknown_words_spam:
                            print "\tPalavra --%s--- tem contagem zero na classe SPAM" % word
                        spam_zero_frequency_word = True
                        log_w_given_spam = 0.0
                    else:
                        log_w_given_spam = math.log( (self.word_counts['spam'].get(word, 0.0) + laplace_smoothing_term) / (laplace_sum_1) )
                        
                    if self.word_counts['ham'].get(word, 0.0)==0:
                        if print_unknown_words_ham:
                            print "\tPalavra --%s--- tem contagem zero na classe NAO-SPAM" % word
                        ham_zero_frequency_word = True
                        log_w_given_ham = 0.0
                    else:
                        log_w_given_ham = math.log( (self.word_counts['ham'].get(word, 0.0) + laplace_smoothing_term) / (laplace_sum_2) )    

                spam_score += log_w_given_spam
                ham_score += log_w_given_ham

            spam_score += self.log_class_priors['spam']
            ham_score += self.log_class_priors['ham']
            
            if spam_zero_frequency_word:
                spam_score = 0.0 # if no laplace smoothing was used and there was a zero-frequency word in the email, P(spam)=0
            if ham_zero_frequency_word:
                ham_score = 0.0  # if no laplace smoothing was used and there was a zero-frequency word in the email, P(ham)=0
                
            if spam_score == ham_score:
                result.append(randint(0,1))
            else:
                if spam_score > ham_score:
                    result.append(1)
                else:
                    result.append(0)
        return result
        

        
        
        

#------PARAMETROS AJUSTAVEIS-------------
# dataset = 'desbalanceado'
dataset = 'balanceado'

mostra_palavras_mais_frequentes_spams = False
mostra_palavras_mais_frequentes_nao_spams  = False

use_laplace_smoothing = True
laplace_smoothing = 10.0

mostra_palavras_frequencia_zero_dentre_spams = False # acionavel quando nao se utiliza Laplace Smoothing
mostra_palavras_frequencia_zero_dentre_nao_spams  = False # acionavel quando nao se utiliza Laplace Smoothing

#------------------------------------------


if use_laplace_smoothing:
    print "(Utilizando Laplace smoothing %.5f)" % laplace_smoothing
else:
    print "(Nao utilizando Laplace smoothing)"

print "Carregando dataset %s" % dataset
[X, y, MNB] = pickle.load( open( "dataset_enron_%s.p" % dataset, "rb" ) )
nSpams = sum(y)
nTotal = len(y)
print "\tDataset possui %d spams de um total de %d emails (percentual de spams: %.2f%%)" % (nSpams, nTotal, 100*nSpams/nTotal)

if mostra_palavras_mais_frequentes_spams:
    spam_counts = MNB.word_counts['spam']
    most_common_spam_words = sorted(spam_counts.items(), key=lambda item: item[1])
    print "Palavras mais comuns em emails do tipo SPAM:"
    for (palavra, contagem) in most_common_spam_words[-51:-70:-1]:
        print "\t%s: %d vezes" % (palavra, contagem)

if mostra_palavras_mais_frequentes_nao_spams:                
    ham_counts = MNB.word_counts['ham']
    most_common_ham_words = sorted(ham_counts.items(), key=lambda item: item[1])
    print "Palavras mais comuns em emails do tipo NAO-SPAM:"
    for (palavra, contagem) in most_common_ham_words[-40:-69:-1]:
        print "\t%s: %d vezes" % (palavra, contagem)


print "Fazendo predicoes no conjunto de teste..."
pred = MNB.predict(X[:100], use_laplace_smoothing, laplace_smoothing, mostra_palavras_frequencia_zero_dentre_spams, mostra_palavras_frequencia_zero_dentre_nao_spams)
#print "\tDone predicting"
true = y[:100]

emails_corretos = sum(1 for i in range(len(pred)) if pred[i] == true[i])
total_emails    = len(pred)
acuracia = emails_corretos/float(total_emails)
print "Acuracia: %.4f%%\t(%d of %d)" % (100*acuracia, emails_corretos, total_emails)


(Utilizando Laplace smoothing 10.00000)
Carregando dataset balanceado
	Dataset possui 3675 spams de um total de 5175 emails (percentual de spams: 71.00%)
Fazendo predicoes no conjunto de teste...
Acuracia: 100.0000%	(100 of 100)
