# Get the data

## Download data files

In [1]:
import os
import urllib
import tarfile
import urllib.request

download_root = "https://spamassassin.apache.org/old/publiccorpus/"
file_names = ["20030228_easy_ham.tar.bz2", "20030228_easy_ham_2.tar.bz2",
              "20030228_hard_ham.tar.bz2", "20030228_spam.tar.bz2",
              "20030228_spam_2.tar.bz2"]
store_path = os.path.join("data")

In [2]:
def fetch_data(root_url=download_root, file_names=file_names,
               store_path=store_path):
    
    # make directory storing emails
    os.makedirs(store_path, exist_ok=True)
    
    # download files 
    for file in file_names:
        file_url = os.path.join(download_root, file)
        path = os.path.join(store_path, file)
        urllib.request.urlretrieve(file_url, path)
    
    # extract emails
    for file in file_names:
        path = os.path.join(store_path, file)
        with tarfile.open(path, 'r') as f:
            f.extractall(path=store_path)

## Parse emails

In [3]:
# get file names of emails
email_folders = ["hard_ham", "easy_ham", "easy_ham_2", 
                 "spam", "spam_2"]
ham_names = {}
for ham in email_folders[:3]:
    ham_path = os.path.join(store_path, ham)
    names = [name for name in sorted(os.listdir(ham_path)) if len(name) > 20]
    ham_names[ham] = names
    
spam_names = {}
for spam in email_folders[3:]:
    spam_path = os.path.join(store_path, spam)
    names = [name for name in sorted(os.listdir(spam_path)) if len(name) > 20]
    spam_names[spam] = names

# parse emails
import email
import email.policy

def load_email(directory, filename, spam_path=store_path):
    path = os.path.join(spam_path, directory)
    with open(os.path.join(path, filename), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)
hams = []
for ham in email_folders[:3]:
    emails = [load_email(ham, filename=name) for name in ham_names[ham]]
    hams.extend(emails)
spams = []
for spam in email_folders[3:]:
    emails = [load_email(spam, filename=name) for name in spam_names[spam]]
    spams.extend(emails)

In [4]:
len(hams), len(spams), len(spams) / (len(hams) + len(spams))

(4150, 1897, 0.31370927732760046)

Accuracy of random guess is 70%, so we must do better than that.

# Take a look at the emails

**headers**

In [5]:
hams[1].items()

[('Return-Path', '<malcolm-sweeps@mrichi.com>'),
 ('Delivered-To', 'rod@arsecandle.org'),
 ('Received', '(qmail 16821 invoked by uid 505); 7 May 2002 14:37:01 -0000'),
 ('Received',
  'from malcolm-sweeps@mrichi.com by blazing.arsecandle.org\t by uid 500 with qmail-scanner-1.10 (F-PROT: 3.12. Clear:0. Processed in 0.260914 secs); 07 May 2002 14:37:01 -0000'),
 ('Delivered-To', 'rod-3ds@arsecandle.org'),
 ('Received', '(qmail 16811 invoked by uid 505); 7 May 2002 14:37:00 -0000'),
 ('Received',
  'from malcolm-sweeps@mrichi.com by blazing.arsecandle.org\t by uid 502 with qmail-scanner-1.10 (F-PROT: 3.12. Clear:0. Processed in 0.250416 secs); 07 May 2002 14:37:00 -0000'),
 ('Received',
  'from bocelli.siteprotect.com (64.41.120.21)  by h0090272a42db.ne.client2.attbi.com with SMTP; 7 May 2002 14:36:59 -0000'),
 ('Received',
  'from mail.mrichi.com ([208.33.95.187])\tby bocelli.siteprotect.com (8.9.3/8.9.3) with SMTP id JAA14328;\tTue, 7 May 2002 09:37:01 -0500'),
 ('From', 'malcolm-sweeps

In [6]:
hams[1]["Subject"]

'Malcolm in the Middle Sweepstakes Prize Notification'

**Contents**

In [7]:
print(hams[1].get_content()[:600])

May 7, 2002


Dear rod-3ds@arsecandle.org:


Congratulations!  On behalf of Frito-Lay, Inc., we are pleased to advise you
 that you've won Fourth Prize in the 3D's(R) Malcolm in the Middle(TM)
 Sweepstakes.   Fourth Prize consists of 1 manufacturer's coupon redeemable at
 participating retailers for 1 free bag of 3D's(R) brand snacks (up to 7 oz.
 size), with an approximate retail value of $2.59 and an expiration date of
 12/31/02.

Follow these instructions to claim your prize:

1.	Print out this email message.

2.	Complete ALL of the information requested.  Print clearly and legibly.  Sign
 


## Get email structure

There are some emails that have multiple parts.

In [8]:
hams[7].get_payload()

[<email.message.EmailMessage at 0x7fe5e373bb10>,
 <email.message.EmailMessage at 0x7fe5e54bcd10>]

In [9]:
from collections import Counter 

def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return "multipart({})".format(", ".join([
            get_email_structure(sub_email)
            for sub_email in payload
        ]))
    else:
        return email.get_content_type()

def structure_counter(emails):
    structures = [get_email_structure(email) for email in emails]
    return Counter(structures)

In [10]:
structure_counter(hams).most_common()

[('text/plain', 3832),
 ('text/html', 120),
 ('multipart(text/plain, application/pgp-signature)', 101),
 ('multipart(text/plain, text/html)', 63),
 ('multipart(text/plain, text/plain)', 5),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/x-pkcs7-signature)', 2),
 ('multipart(text/html)', 2),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 2),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, multipart(text/plain))', 2),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  2),
 ('multipart(text/plain, image/bmp)', 1),
 ('multipart(multipart(text/plain, text/html))', 1),
 ('multipart(text/plain, image/png, image/png)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif, image/gif, image/gif, image/gif, image/gif, image/gif, image/gif, image/gif, image/gif, image/gif, image/gif, image/jpeg, image/gif, image/gif, image/gif, image/gif, image/gif, image/gif)',
  1),
 ('multipart(text/pla

In [11]:
structure_counter(spams).most_common()

[('text/plain', 816),
 ('text/html', 772),
 ('multipart(text/plain, text/html)', 159),
 ('multipart(text/html)', 49),
 ('multipart(text/plain)', 44),
 ('multipart(multipart(text/html))', 23),
 ('multipart(multipart(text/plain, text/html))', 5),
 ('multipart(text/plain, application/octet-stream)', 3),
 ('multipart(text/html, text/plain)', 3),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/plain, application/octet-stream, text/plain)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart/alternative', 2),
 ('multipart(text/html, image/jpeg)', 2),
 ('multipart(multipart(text/plain), application/octet-stream)', 2),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(multipart(text/plain, text/html), image/jpeg, image/jpeg, image/jpeg, image/jpeg, image/jpeg)',
  1),
 ('multipart(multipart(text/plain, 

It seems that most hams are plain text, while spams are more often html.

## Split emails into train and test set

In [12]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split

In [13]:
X = np.array(hams+spams)
y = np.array([0] * len(hams) + [1] * len(spams))

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=44, 
                                                    stratify=y)

In [15]:
X_train.shape, X_test.shape

((4837,), (1210,))

# Preprocessing emails

## Email to text

**Parse HTML**

In [16]:
from bs4 import BeautifulSoup

def html_to_plain_text(html):
    soup = BeautifulSoup(html, "lxml")
    strings = ""
    for i in soup.find_all():
        if i.string:
            strings += i.string + "\n"
    return strings

**Turn email to plain text**

In [17]:
def email_to_text(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except: # in case of encoding issues
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        else:
            html = content
    if html:
        return html_to_plain_text(html)

In [18]:
example_spam = email_to_text(spams[10])
print(example_spam)

Cellular Phone Accessories All At Below Wholesale Prices!

http://202.101.163.34:81/sites/merchant/sales/

Hands Free Ear Buds 1.99! 
Phone Holsters 1.98! 
Booster Antennas Only $0.99
Phone Cases 1.98! 
Car Chargers 1.98! 
Face Plates As Low As 0.99! 
Lithium Ion Batteries As Low As 6.94! 

http://202.101.163.34:81/sites/merchant/sales/

Click Below For Accessories On All NOKIA, MOTOROLA LG, NEXTEL, 
SAMSUNG, QUALCOMM, ERICSSON, AUDIOVOX PHONES At Below 
WHOLESALE PRICES!

http://202.101.163.34:81/sites/merchant/sales/

***If You Need Assistance Please Call Us (732) 751-1457***


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To be removed from future mailings please send your remove 
request to: removemenow68994@btamail.net.cn 
Thank You and have a super day :)




## Replace url with "URL"

In [19]:
example_spam

'Cellular Phone Accessories All At Below Wholesale Prices!\n\nhttp://202.101.163.34:81/sites/merchant/sales/\n\nHands Free Ear Buds 1.99! \nPhone Holsters 1.98! \nBooster Antennas Only $0.99\nPhone Cases 1.98! \nCar Chargers 1.98! \nFace Plates As Low As 0.99! \nLithium Ion Batteries As Low As 6.94! \n\nhttp://202.101.163.34:81/sites/merchant/sales/\n\nClick Below For Accessories On All NOKIA, MOTOROLA LG, NEXTEL, \nSAMSUNG, QUALCOMM, ERICSSON, AUDIOVOX PHONES At Below \nWHOLESALE PRICES!\n\nhttp://202.101.163.34:81/sites/merchant/sales/\n\n***If You Need Assistance Please Call Us (732) 751-1457***\n\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\nTo be removed from future mailings please send your remove \nrequest to: removemenow68994@btamail.net.cn \nThank You and have a super day :)\n\n'

In [20]:
import re
url_pattern = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
example_spam = re.sub(url_pattern, "URL", example_spam)
example_spam

'Cellular Phone Accessories All At Below Wholesale Prices!\n\nURL\n\nHands Free Ear Buds 1.99! \nPhone Holsters 1.98! \nBooster Antennas Only $0.99\nPhone Cases 1.98! \nCar Chargers 1.98! \nFace Plates As Low As 0.99! \nLithium Ion Batteries As Low As 6.94! \n\nURL\n\nClick Below For Accessories On All NOKIA, MOTOROLA LG, NEXTEL, \nSAMSUNG, QUALCOMM, ERICSSON, AUDIOVOX PHONES At Below \nWHOLESALE PRICES!\n\nURL\n\n***If You Need Assistance Please Call Us (732) 751-1457***\n\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\nTo be removed from future mailings please send your remove \nrequest to: removemenow68994@btamail.net.cn \nThank You and have a super day :)\n\n'

## Tokenize

In [21]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /Users/hongpeiyi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [22]:
example_spam_tokenized = word_tokenize(example_spam)
example_spam_tokenized[:10]

['Cellular',
 'Phone',
 'Accessories',
 'All',
 'At',
 'Below',
 'Wholesale',
 'Prices',
 '!',
 'URL']

## Stemming

In [23]:
import nltk

In [24]:
def stemming_email(tokenized_email):
    stemmer = nltk.PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in tokenized_email]
    return " ".join(stemmed_words)

In [25]:
stemmed_eamil = stemming_email(example_spam_tokenized)
stemmed_eamil

'cellular phone accessori all At below wholesal price ! url hand free ear bud 1.99 ! phone holster 1.98 ! booster antenna onli $ 0.99 phone case 1.98 ! car charger 1.98 ! face plate As low As 0.99 ! lithium ion batteri As low As 6.94 ! url click below for accessori On all nokia , motorola LG , nextel , samsung , qualcomm , ericsson , audiovox phone At below wholesal price ! url ***if you need assist pleas call Us ( 732 ) 751-1457*** ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ To be remov from futur mail pleas send your remov request to : removemenow68994 @ btamail.net.cn thank you and have a super day : )'

## Write a sklearn estimator to transform our email

In [26]:
from sklearn.base import BaseEstimator, TransformerMixin

class EmailToTokenizedStemmed(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls:
                url_pattern = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
                text = re.sub(url_pattern, "URL", text)
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'[^a-zA-Z0-9]+', ' ', text, flags=re.M)
            text = word_tokenize(text)
            text = stemming_email(text)
            X_transformed.append(text)
        return np.array(X_transformed)

## Count words

In [27]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## Make Pipeline

In [28]:
from sklearn.pipeline import Pipeline

email_pipeline = Pipeline([
    ("Tokenizing and Stemming", EmailToTokenizedStemmed()),
    ("Count Vectorizing", CountVectorizer()),
    ("passthrough", None)
])

# tf-idf transformation
email_pipeline2 = Pipeline([
    ("Tokenizing and Stemming", EmailToTokenizedStemmed()),
    ("tf-idf Vectorizing", TfidfVectorizer()),
    ("passthrough", None)
])

In [29]:
X_train_processed = email_pipeline2.fit_transform(X_train)

In [30]:
X_test_processed = email_pipeline2.transform(X_test)

In [31]:
X_train_processed

<4837x34009 sparse matrix of type '<class 'numpy.float64'>'
	with 575215 stored elements in Compressed Sparse Row format>

In [32]:
X_test_processed

<1210x34009 sparse matrix of type '<class 'numpy.float64'>'
	with 135619 stored elements in Compressed Sparse Row format>

___

# Modeling

In [59]:
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost
from scipy.stats import uniform, randint, loguniform
import joblib

strat_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

### Naive Bayes

In [58]:
nb = MultinomialNB()
nb_cv_score = cross_val_score(nb, X_train_processed, 
                              y_train, cv=strat_cv, 
                              scoring="accuracy")

In [47]:
nb_cv_score.mean(), nb_cv_score.std()

(0.8751330356073439, 0.012127122581478123)

### Logistic regression

In [61]:
for c in [1, 5, 10, 20, 30, 50, 100, 200, 500]:
    logit = LogisticRegression(max_iter=5000, C=c)
    logit_cv_score = cross_val_score(logit, X_train_processed, 
                                     y_train, cv=strat_cv, 
                                     scoring="accuracy")
    print(f"C={c:3}, cross-validation accuracy: {logit_cv_score.mean():.4}, with std {logit_cv_score.std():.4}")

C=  1, cross-validation accuracy: 0.9597, with std 0.008905
C=  5, cross-validation accuracy: 0.9758, with std 0.007975
C= 10, cross-validation accuracy: 0.9816, with std 0.006035
C= 20, cross-validation accuracy: 0.983, with std 0.00591
C= 30, cross-validation accuracy: 0.9824, with std 0.005414
C= 50, cross-validation accuracy: 0.9835, with std 0.005921
C=100, cross-validation accuracy: 0.9828, with std 0.005067
C=200, cross-validation accuracy: 0.9824, with std 0.00426
C=500, cross-validation accuracy: 0.9802, with std 0.004998


In [62]:
for c in range(30, 41, 2):
    logit = LogisticRegression(max_iter=5000, C=c)
    logit_cv_score = cross_val_score(logit, X_train_processed, 
                                     y_train, cv=strat_cv, 
                                     scoring="accuracy")
    print(f"C={c:2}, cross-validation accuracy: {logit_cv_score.mean():.4}, with std {logit_cv_score.std():.4}")

C=30, cross-validation accuracy: 0.9824, with std 0.005414
C=32, cross-validation accuracy: 0.9824, with std 0.005414
C=34, cross-validation accuracy: 0.9824, with std 0.005414
C=36, cross-validation accuracy: 0.9822, with std 0.005251
C=38, cross-validation accuracy: 0.9822, with std 0.005645
C=40, cross-validation accuracy: 0.983, with std 0.005838


In [73]:
logit = LogisticRegression(max_iter=5000, C=36) # for test set
logit.fit(X_train_processed, y_train)

LogisticRegression(C=36, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=5000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## SVM

In [178]:
svc = SVC()
svc_params = {'C': loguniform(1e0, 1e3),
              'gamma': loguniform(1e-4, 1e-3),
              'kernel': ['rbf'],
              'class_weight':['balanced', None]}
svc_grid = RandomizedSearchCV(svc, svc_params, 
                              n_jobs=-1, cv=strat_cv,
                              n_iter=20)

In [179]:
svc_grid.fit(X_train_processed, y_train)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=42, shuffle=True),
                   error_score=nan,
                   estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                                 class_weight=None, coef0=0.0,
                                 decision_function_shape='ovr', degree=3,
                                 gamma='scale', kernel='rbf', max_iter=-1,
                                 probability=False, random_state=None,
                                 shrinking=True, tol=0.001, verbose=False),
                   iid='deprecated', n_iter=20, n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f896ce62150>,
                                        'class_weight': ['balanced', None],
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f896ce62e90>,
                                        'kernel': ['rbf']},
               

In [181]:
svc_best = svc_grid.best_estimator_

In [182]:
joblib.dump(svc_best, "tmp/svc.pkl")

['tmp/svc.pkl']

In [54]:
svc = joblib.load("tmp/svc.pkl")

### Random Forest

In [93]:
rf = RandomForestClassifier(n_jobs=-1, oob_score=True, 
                            n_estimators=1000, random_state=42,
                            max_depth=100)
rf.fit(X_train_processed, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=100, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=True, random_state=42, verbose=0,
                       warm_start=False)

In [94]:
rf.oob_score_

0.9712631796568121

In [95]:
joblib.dump(rf, "tmp/rf.pkl")

['tmp/rf.pkl']

In [96]:
rf = joblib.load("tmp/rf.pkl")

## Xgboost

In [65]:
xgb = xgboost.XGBClassifier()
xgb_params = {"n_estimators":randint(1, 1000),
              "learning_rate": uniform(0.001, 3),
              "max_depth":randint(1, 100)}
xgb_grid = RandomizedSearchCV(xgb, xgb_params, 
                              n_iter=20, scoring="accuracy",
                              n_jobs=-1, cv=strat_cv)

In [66]:
xgb_grid.fit(X_train_processed, y_train)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=42, shuffle=True),
                   error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None,
                                           objective='bin...
                   param_distributions={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe5987db0d0>,
                                        'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe5987

In [74]:
joblib.dump(xgb_grid.best_estimator_, "tmp/xgb.pkl")

['tmp/xgb.pkl']

In [84]:
xgb = joblib.load('tmp/xgb.pkl')