In [2]:
import codecs
import os

def read_in(folder):
    files = os.listdir(folder)
    a_list = []
    for a_file in files:
        if not a_file.startswith("."):
            f = codecs.open(os.path.join(folder, a_file), "r", encoding = "ISO-8859-1", errors="ignore")
            a_list.append(f.read())
            f.close()

    return a_list

In [3]:
spam_list = read_in("enron1/spam")
ham_list = read_in("enron1/ham")

In [4]:
import random

all_emails = [(email_content, "spam") for email_content in spam_list]
all_emails += [(email_content, "ham") for email_content in ham_list]
random.seed(42)
random.shuffle(all_emails)
print(f"Dataset size = {len(all_emails)} emails")

Dataset size = 5172 emails


In [5]:
import nltk
from nltk import word_tokenize
nltk.download("punkt_tab")

def get_features(text):
    features = {}
    word_list = [word for word in word_tokenize(text.lower())]
    for word in word_list:
        features[word] = True
    return features

all_features = [(get_features(email), label) for (email, label) in all_emails]

print(get_features("Participate in Our New Lottery NOW!"))
print(len(all_features), len(all_features[0][0]), len(all_features[99][0]))


[nltk_data] Downloading package punkt_tab to /home/marc/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


{'participate': True, 'in': True, 'our': True, 'new': True, 'lottery': True, 'now': True, '!': True}
5172 104 75


In [8]:
from nltk import NaiveBayesClassifier, classify

def train(features, proportion):
    train_size = int(len(features) * proportion)
    train_set = features[:train_size]
    test_set = features[train_size:]
    print(f"Training set size = {len(train_set)} emails")
    print(f"Test set size = {len(test_set)} emails")

    classifier = NaiveBayesClassifier.train(train_set)
    return train_set, test_set, classifier

train_set, test_set, classifier = train(all_features, 0.8)


Training set size = 4137 emails
Test set size = 1035 emails


In [12]:
def evaluate(train_set, test_set, classifier):
    print(f"Accuracy on the training set = {classify.accuracy(classifier, train_set)}")
    print(f"Accuracy on the test set = {classify.accuracy(classifier, test_set)}")
    classifier.show_most_informative_features(50)

evaluate(train_set, test_set, classifier)


Accuracy on the training set = 0.9584239787285472
Accuracy on the test set = 0.9400966183574879
Most Informative Features
               forwarded = True              ham : spam   =    196.6 : 1.0
                    2004 = True             spam : ham    =    140.5 : 1.0
            prescription = True             spam : ham    =    124.5 : 1.0
                     nom = True              ham : spam   =    124.4 : 1.0
                    pain = True             spam : ham    =    106.8 : 1.0
                   cheap = True             spam : ham    =     92.4 : 1.0
                    spam = True             spam : ham    =     92.4 : 1.0
                     sex = True             spam : ham    =     81.1 : 1.0
                featured = True             spam : ham    =     77.9 : 1.0
                     ect = True              ham : spam   =     75.4 : 1.0
              nomination = True              ham : spam   =     73.7 : 1.0
                creative = True             spam : ha

In [13]:
from nltk.text import Text

def concordance(data_list, search_word):
    for email in data_list:
        word_list = [word for word in word_tokenize(email.lower())]
        text_list = Text(word_list)
        if search_word in word_list:
            text_list.concordance(search_word)

print("STOCKS in HAM:")
concordance(ham_list, "stocks")

print("\n\nSTOCKS in SPAM:")
concordance(spam_list, "stocks")


STOCKS in HAM:
Displaying 1 of 1 matches:
ur member directory . * follow your stocks and news headlines , exchange files
Displaying 1 of 1 matches:
ur member directory . * follow your stocks and news headlines , exchange files
Displaying 1 of 1 matches:
ur member directory . * follow your stocks and news headlines , exchange files
Displaying 1 of 1 matches:
ad my portfolio is diversified into stocks that have lost even more money than


STOCKS in SPAM:
Displaying 1 of 1 matches:
ne trade thursday ! go fcdh . penny stocks are considered highiy specuiative a
Displaying 2 of 2 matches:
ng their gains . select gold mining stocks are the hot flyers of the otc . his
is letter cautions that micro - cap stocks are high - risk investments and tha
Displaying 3 of 3 matches:
his email pertaining to investing , stocks , securities must be understood as 
ntative before deciding to trade in stocks featured within this email . none o
 lose money from investing in penny stocks . if you wish to stop fu

In [15]:
test_spam_list = ["Participate in our new lottery!", "Try out this new medicine"]
test_ham_list = ["See the minutes from the last meeting attached", "Investors are coming to our office on Monday"]

test_emails = [(email_content, "spam") for email_content in test_spam_list]
test_emails += [(email_content, "ham") for email_content in test_ham_list]
new_test_set = [(get_features(email), label) for (email, label) in test_emails]
evaluate(train_set, new_test_set, classifier)


Accuracy on the training set = 0.9584239787285472
Accuracy on the test set = 1.0
Most Informative Features
               forwarded = True              ham : spam   =    196.6 : 1.0
                    2004 = True             spam : ham    =    140.5 : 1.0
            prescription = True             spam : ham    =    124.5 : 1.0
                     nom = True              ham : spam   =    124.4 : 1.0
                    pain = True             spam : ham    =    106.8 : 1.0
                   cheap = True             spam : ham    =     92.4 : 1.0
                    spam = True             spam : ham    =     92.4 : 1.0
                     sex = True             spam : ham    =     81.1 : 1.0
                featured = True             spam : ham    =     77.9 : 1.0
                     ect = True              ham : spam   =     75.4 : 1.0
              nomination = True              ham : spam   =     73.7 : 1.0
                creative = True             spam : ham    =     71.5

In [17]:
for email in test_spam_list:
    print(email)
    print(classifier.classify(get_features(email)))
for email in test_ham_list:
    print(email)
    print(classifier.classify(get_features(email)))
    

Participate in our new lottery!
spam
Try out this new medicine
spam
See the minutes from the last meeting attached
ham
Investors are coming to our office on Monday
ham


In [None]:
while True:
    email = input("Type in your email here (or press 'Enter'): ")
    if len(email) == 0:
        break
    else:
        prediction = classifier.classify(get_features(email))
        print(f"This email is likely {prediction}\n")


Type in your email here (or press 'Enter'):  Great prices


This email is likely spam

