In [144]:
import os
import numpy as np
import random
import math
from collections import Counter

In [145]:
def make_Dictionary(train_dir):
    emails = [os.path.join(train_dir,f) for f in os.listdir(train_dir)]
    print(len(emails))
    all_words = []
    for mail in emails:
        with open(mail) as m:
            for i,line in enumerate(m):
                if i == 2:  #Body of email is only 3rd line of text file
                    words = line.split()
                    all_words += words
    dictionary = Counter(all_words)
    dictionary_temp = Counter(all_words)

    # Paste code for non-word removal here(code snippet is given below)
    list_to_remove = dictionary_temp.keys()
    for item in list_to_remove:
        if item.isalpha() == False:    #Determine whether it is punctuation
            del dictionary[item]
        elif len(item) == 1:           #
            del dictionary[item]
    dictionary = dictionary.most_common(3000)
    return dictionary

In [146]:
def extract_features(root_dir,dictionary):
    emails = [os.path.join(root_dir,f) for f in os.listdir(root_dir)]
    all_words = []
    features_matrix = np.zeros((len(emails),3000))
    docID = 0
    for mail in emails:
        with open(mail) as m:
            for i,line in enumerate(m):
                if i == 2:
                    words = line.split()
                    for word in words:
                        wordID = 0
                        for i,d in enumerate(dictionary):
                            if d[0] == word:
                                wordID = i
                                features_matrix[docID,wordID] = words.count(word)
            docID = docID + 1
    return features_matrix



In [147]:
Path = "train-mails"
dir = make_Dictionary(Path)

print(dir[0:5])
train_matrix = extract_features(Path,dir)
Path_test = "test-mails"
dir_test = make_Dictionary(Path_test)
test_matrix = extract_features(Path_test,dir_test)

702
[('order', 1414), ('address', 1293), ('report', 1216), ('mail', 1127), ('send', 1079)]
260


In [148]:
class NaiveBayes:
#Divide the sample
    def separate_by_class(self,data):
        separated = {}
        for i in range(len(data)):
            vector = data[i]
            if (vector[-1] not in separated):
                separated[vector[-1]] = []
            separated[vector[-1]].append(vector)
        return separated
        
#Computational features
    def mean(self,data):
        return sum(data)/float(len(data))
    def stdev(self,data):
        avg = self.mean(data)
        variance = sum([pow(x-avg,2) for x in data])/float(len(data) - 1)
        return math.sqrt(variance)
    def summarize(self,data):
        summaries = [(self.mean(attribute),self.stdev(attribute)) for attribute in zip(*data)]
        del summaries[-1]
        return summaries

#Extract attribute features by category
    def summarize_by_class(self,data):
        separated = self.separate_by_class(data)
        summarizes = {}
        keyList = list(separated.keys())
        for classValue in keyList:
            summarizes[classValue] = self.summarize(separated[classValue])
        return summarizes

    def calculate_probability(self,x,mean,stdev):
        if stdev == 0:
            return 0
        exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
        return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

    def calculate_class_probabilities(self,summaries, inputVector):
        probabilities = {}
        keyList = list(summaries.keys())
        for classValue in keyList:
            probabilities[classValue] = 1
            for i in range(len(summaries[classValue])):
                mean,stdev = summaries[classValue][i]
                x = inputVector[i]
                probabilities[classValue] *= self.calculate_probability(x, mean, stdev)
        return probabilities

    def predict(self,summaries,inputVector):
        probabilities = self.calculate_class_probabilities(summaries,inputVector)
        bestLabel, bestProb = None, -1
        keyList = list(probabilities.keys())
        for classValue in keyList:
            if bestLabel is None or probabilities[classValue] > bestProb:
                bestProb = probabilities[classValue]
                bestLabel = classValue
        return bestLabel
    def get_prediction(self,summaries, testSet):
        predictions = []
        for i in range(len(testSet)):
            result = self.predict(summaries, testSet[i])
            predictions.append(result)
        return predictions

    def get_accuracy(self,data,predictions):
        correct = 0
        for x in range(len(data)):
            if data[x][-1] == predictions[x]:
                correct += 1
        return (correct/float(len(data))) * 100

In [152]:
train_labels = np.zeros(train_matrix.shape[0])
train_labels[351:] = 1
# print(train_labels)
# print(train_matrix.shape)
NB = NaiveBayes()
train_labels = train_labels[:,np.newaxis]
data_test = np.hstack((train_matrix,train_labels))
summaries = NB.summarize_by_class(data_test)
predictions = NB.get_prediction(summaries,data_test)
print(predictions)
accuracy = NB.get_accuracy(data_test,predictions)
print(accuracy)

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

In [154]:
print(summaries)

), (0.0, 0.0), (0.008547008547008548, 0.11921599813973957), (0.0, 0.0), (0.014245014245014245, 0.11866850501381614), (0.0, 0.0), (0.011396011396011397, 0.10629360892514285), (0.011396011396011397, 0.10629360892514311), (0.0, 0.0), (0.0, 0.0), (0.014245014245014245, 0.11866850501381618), (0.008547008547008548, 0.09218551132454918), (0.022792022792022793, 0.42700841014689944), (0.019943019943019943, 0.14000406994491132), (0.011396011396011397, 0.1062936089251428), (0.0, 0.0), (0.011396011396011397, 0.13043242316470613), (0.002849002849002849, 0.05337605126836252), (0.022792022792022793, 0.1986933302328976), (0.02564102564102564, 0.2867015908532811), (0.005698005698005698, 0.07537722256574367), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.02849002849002849, 0.16660560542018937), (0.019943019943019943, 0.14000406994491138), (0.0, 0.0), (0.02564102564102564, 0.20542103640523895), (0.02849002849002849, 0.2710566609986067), (0.019943019943019943, 0.21948250083450327), (0.005698005698005698, 0.07537

In [160]:
separated = NB.separate_by_class(data_test)
print(separated.keys())

dict_keys([0.0, 1.0])
