In [26]:
from typing import List
from collections import Counter
from naive_bayes import Message, NaiveBayesClassifier
import glob
from machineLearning import split_data, recall, precision
import random

In [7]:
path = 'spam_data/*/*'
data: List[Message] = []

In [8]:
for filename in glob.glob(path):
    is_spam = 'ham' not in filename
    with open(filename, errors='ignore') as email_file:
        for line in email_file:
            if line.startswith("Subject:"):
                subject = line.lstrip("Subject: ")
                data.append(Message(subject, is_spam))
                break

In [9]:
print(data[:4])

[Message(text='Re: New Sequences Window\n', is_spam=False), Message(text='[zzzzteana] RE: Alexander\n', is_spam=False), Message(text='[zzzzteana] Moscow bomber\n', is_spam=False), Message(text="[IRR] Klez: The Virus That  Won't Die\n", is_spam=False)]


# Spliting data with 85 of train data

In [34]:
random.seed(0)
train_messages, test_messages = split_data(data, 0.85)

In [35]:
model = NaiveBayesClassifier()

In [36]:
model.train(train_messages)
prediction = [(message, model.predict(message.text))
                for message in test_messages]

In [37]:
confusion_matrix = Counter((message.is_spam, spam_probability > 0.5)
                            for message, spam_probability in prediction)

In [38]:
print(confusion_matrix)

Counter({(False, False): 408, (True, True): 56, (True, False): 19, (False, True): 12})


In [39]:
print(f"precision is {precision(73, 19)}")
print(f"Recall is {recall(73, 29)}")

precision is 0.7934782608695652
Recall is 0.7156862745098039
