# Spam filter implementation


### Read all files from folder into a list

In [1]:
import os
import codecs
 
def read_in(folder):
    """Read every visible regular file in ``folder`` and return the contents.

    Args:
        folder: Path of the directory to read. A trailing path separator is
            optional.

    Returns:
        A list of strings, one per file, ordered by file name. Entries whose
        name starts with "." and entries that are not regular files (for
        example sub-directories) are skipped.
    """
    a_list = []
    # os.listdir() returns names in arbitrary order; sorting makes the result
    # identical on every run and platform.
    for a_file in sorted(os.listdir(folder)):
        if a_file.startswith("."):
            continue
        # os.path.join works whether or not ``folder`` ends with a separator.
        path = os.path.join(folder, a_file)
        if not os.path.isfile(path):
            continue
        # codecs.open does no newline translation, so the text is returned as
        # stored; the context manager closes the file even if read() fails.
        with codecs.open(path, "r", encoding="ISO-8859-1", errors="ignore") as f:
            a_list.append(f.read())
    return a_list

### Read the spam and ham folders and check their sizes

In [14]:
# Folder that contains the "spam/" and "ham/" sub-folders. Set the ENRON_DIR
# environment variable to point somewhere else without editing this cell.
ENRON_DIR = os.environ.get(
    "ENRON_DIR",
    "c:/src/matej/manning-ml-as-microservice/part 2/enron/enron1/enron1/",
)
# The trailing "" makes os.path.join end each path with a separator, which
# read_in needs when it builds file paths by plain concatenation.
spam_list = read_in(os.path.join(ENRON_DIR, "spam", ""))
ham_list = read_in(os.path.join(ENRON_DIR, "ham", ""))
# Sanity checks: number of messages per class and one sample of each.
print(len(spam_list))
print(len(ham_list))
print(spam_list[0])
print(ham_list[0])

1500
3672
Subject: dobmeos with hgh my energy level has gone up! Stukm
Introducing
Doctor - formulated
Hgh
Human growth hormone - also called hgh
Is referred to in medical science as the master hormone. It is very plentiful
When we are young, but near the age of twenty - one our bodies begin to produce
Less of it. By the time we are forty nearly everyone is deficient in hgh,
And at eighty our production has normally diminished at least 90 - 95%.
Advantages of hgh:
- increased muscle strength
- loss in body fat
- increased bone density
- lower blood pressure
- quickens wound healing
- reduces cellulite
- improved vision
- wrinkle disappearance
- increased skin thickness texture
- increased energy levels
- improved sleep and emotional stability
- improved memory and mental alertness
- increased sexual potency
- resistance to common illness
- strengthened heart muscle
- controlled cholesterol
- controlled mood swings
- new hair growth and color restore
Read
More at this website
Unsubscrib

### Merge the spam and ham lists into a single list and shuffle

In [15]:
import random
 
# Pair every message with its class label: all spam first, then all ham.
all_emails = [
    (email_content, label)
    for label, emails in (("spam", spam_list), ("ham", ham_list))
    for email_content in emails
]
# Fixed seed so the shuffled order is the same on every run.
random.seed(42)
random.shuffle(all_emails)
print(f"Dataset size = {len(all_emails)} emails")

Dataset size = 5172 emails


### Try the word tokenizer from NLTK

In [21]:
import nltk
from nltk import word_tokenize
 
def tokenize(input):
    """Return the tokens that ``word_tokenize`` produces for ``input``.

    Args:
        input: The text to split.

    Returns:
        A new list holding the tokens in the order they were produced.
    """
    return list(word_tokenize(input))
    
# Named ``sample_sentence`` rather than ``input`` so the built-in input()
# stays usable in every later cell of the notebook.
sample_sentence = "What's the best way to split a sentence into words?"
print(tokenize(sample_sentence))

['What', "'s", 'the', 'best', 'way', 'to', 'split', 'a', 'sentence', 'into', 'words', '?']


### Extract and normalize the features

In [26]:
def get_features(text):
    """Build a presence-only feature dictionary for ``text``.

    The text is lower-cased and tokenized with ``word_tokenize``. Each
    distinct token becomes a key mapped to ``True``; repeated tokens collapse
    into one key.

    Args:
        text: The raw text to convert.

    Returns:
        A dict of ``{token: True}`` in first-occurrence order.
    """
    lowered = text.lower()
    return {token: True for token in word_tokenize(lowered)}
 
# Convert every (text, label) pair into a (feature dict, label) pair.
all_features = [
    (get_features(pair[0]), pair[1])
    for pair in all_emails
]

# Quick demonstration on a short sentence.
print(get_features("Participate In Our New Lottery NOW!"))
# Size of the data set, then the feature count and features of the first
# email, then the feature count of the email at index 99.
print(len(all_features))
print(len(all_features[0][0]))
print(all_features[0][0])
print(len(all_features[99][0]))

{'participate': True, 'in': True, 'our': True, 'new': True, 'lottery': True, 'now': True, '!': True}
5172
27
{'subject': True, ':': True, 'bloodline': True, ',': True, 'ahead': True, 'of': True, 'the': True, 'street': True, 'microcap': True, 'alert': True, 'when': True, 'living': True, 'with': True, 'sheriff': True, 'is': True, 'obsequious': True, 'blood': True, 'clot': True, 'beyond': True, 'deficit': True, 'reach': True, 'an': True, 'understanding': True, 'toward': True, '.': True, '[': True, '3': True}
53


### Train the classifier

In [28]:
from nltk import NaiveBayesClassifier, classify
 
def train(features, proportion):
    """Split ``features`` in order and fit a Naive Bayes classifier.

    Args:
        features: Sequence of ``(feature_dict, label)`` pairs.
        proportion: Fraction of ``features``, counted from the start, used
            for training. The remainder becomes the test set.

    Returns:
        A tuple ``(train_set, test_set, classifier)``.
    """
    cut = int(len(features) * proportion)
    training_part = features[:cut]
    held_out_part = features[cut:]
    print(f"Training set size = {len(training_part)} emails")
    print(f"Test set size = {len(held_out_part)} emails")
    model = NaiveBayesClassifier.train(training_part)
    return training_part, held_out_part, model
 
# Train on the first 80% of the shuffled emails; the rest is the test set.
train_set, test_set, classifier = train(features=all_features, proportion=0.8)

Training set size = 4137 emails
Test set size = 1035 emails


In [29]:
def evaluate(train_set, test_set, classifier):
    """Print the classifier's accuracy on both sets and its top features.

    Args:
        train_set: Labelled feature pairs the classifier was trained on.
        test_set: Labelled feature pairs held out from training.
        classifier: The trained classifier to assess.

    Returns:
        None. All results are printed.
    """
    train_accuracy = classify.accuracy(classifier, train_set)
    print(f"Accuracy on the training set = {train_accuracy}")
    test_accuracy = classify.accuracy(classifier, test_set)
    print(f"Accuracy of the test set = {test_accuracy}")
    # Show the 50 features the classifier ranks as most informative.
    classifier.show_most_informative_features(50)
 
# Report accuracy on both splits and list the most informative features.
evaluate(train_set=train_set, test_set=test_set, classifier=classifier)

Accuracy on the training set = 0.9613246313753928
Accuracy of the test set = 0.9420289855072463
Most Informative Features
               forwarded = True              ham : spam   =    198.3 : 1.0
                    2004 = True             spam : ham    =    143.8 : 1.0
                     nom = True              ham : spam   =    125.8 : 1.0
            prescription = True             spam : ham    =    122.9 : 1.0
                    pain = True             spam : ham    =     98.8 : 1.0
                  health = True             spam : ham    =     82.7 : 1.0
                     ect = True              ham : spam   =     76.8 : 1.0
                    2001 = True              ham : spam   =     75.8 : 1.0
                featured = True             spam : ham    =     74.7 : 1.0
              nomination = True              ham : spam   =     72.1 : 1.0
             medications = True             spam : ham    =     69.9 : 1.0
                  differ = True             spam : ha