In [1]:
import os

# Root directory under which downloaded archives are extracted.
DATASETS_DIR = 'datasets'
# NOTE(review): not referenced in the visible cells; presumably where fitted
# models get saved later in the notebook - confirm.
MODELS_DIR = 'models'
# Cache directory for the downloaded .tar.bz2 archives themselves.
TAR_DIR = os.path.join(DATASETS_DIR, 'tar')

# Archives from the SpamAssassin public corpus (one spam set, two ham sets).
SPAM_URL = 'https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2'
EASY_HAM_URL = 'https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham_2.tar.bz2'
HARD_HAM_URL = 'https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2'

In [2]:
from urllib.request import urlretrieve
import tarfile
import shutil

def download_dataset(url):
    """Download a tar archive into ``TAR_DIR`` and unpack it under ``DATASETS_DIR``.

    The archive is cached as ``TAR_DIR/<text after the last '/' of url>`` and
    is only fetched when no readable tar file exists at that location. Any
    previously extracted copy of the archive's first member is deleted before
    extraction, and a ``cmds`` file inside the extracted directory (if there
    is one) is removed afterwards.

    Parameters
    ----------
    url : str
        Location of the tar archive to fetch.

    Returns
    -------
    str
        ``DATASETS_DIR`` joined with the name of the archive's first member.
    """
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(TAR_DIR, exist_ok=True)

    filename = url.rsplit('/', 1)[-1]
    tarpath = os.path.join(TAR_DIR, filename)

    # Fetch when the cached file is missing or is not a readable archive
    # (for example, truncated by an interrupted download). is_tarfile()
    # closes the file it probes and only treats tar read errors as "not an
    # archive", so unrelated failures and Ctrl-C are no longer swallowed.
    if not (os.path.isfile(tarpath) and tarfile.is_tarfile(tarpath)):
        urlretrieve(url, tarpath)

    with tarfile.open(tarpath) as tar:
        dirname = os.path.join(DATASETS_DIR, tar.getnames()[0])
        # Start from a clean directory so stale files from an earlier
        # extraction cannot linger.
        if os.path.isdir(dirname):
            shutil.rmtree(dirname)
        # The archive comes from the network: when this interpreter supports
        # extraction filters, use the 'data' filter so members cannot be
        # written outside DATASETS_DIR.
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(path=DATASETS_DIR, filter='data')
        else:
            tar.extractall(path=DATASETS_DIR)

    # Drop the 'cmds' file, if present, so the directory holds only messages.
    cmds_path = os.path.join(dirname, 'cmds')
    if os.path.isfile(cmds_path):
        os.remove(cmds_path)

    return dirname

In [3]:
# Fetch (unless already cached) and unpack each corpus; every call returns the
# path of the directory that was extracted under DATASETS_DIR.
spam_dir = download_dataset(SPAM_URL)
easy_ham_dir = download_dataset(EASY_HAM_URL)
hard_ham_dir = download_dataset(HARD_HAM_URL)

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

In [9]:
def _list_email_filenames(directory, min_name_length=21):
    """Return the sorted entries of ``directory`` with sufficiently long names.

    Only names of at least ``min_name_length`` characters are kept.
    NOTE(review): the cut-off presumably separates message files from short
    auxiliary entries in the corpus directories - confirm against the data.
    """
    return [
        name
        for name in sorted(os.listdir(directory))
        if len(name) >= min_name_length
    ]


easy_ham_filenames = _list_email_filenames(easy_ham_dir)
hard_ham_filenames = _list_email_filenames(hard_ham_dir)
spam_filenames = _list_email_filenames(spam_dir)

In [10]:
# Number of easy-ham file names that passed the length filter.
len(easy_ham_filenames)

1400

In [11]:
# Number of hard-ham file names that passed the length filter.
len(hard_ham_filenames)

250

In [12]:
# Number of spam file names that passed the length filter.
len(spam_filenames)

1396

In [17]:
# Path returned by download_dataset for the spam archive.
spam_dir

'datasets/spam_2'

In [20]:
import email
import email.policy

def load_email(is_spam, filename, spam_path='datasets', directory=None):
    """Parse one message file from the extracted corpus.

    Parameters
    ----------
    is_spam : bool
        Chooses the default sub-directory: ``"spam_2"`` when true,
        ``"easy_ham_2"`` otherwise. Ignored when ``directory`` is given.
    filename : str
        Name of the message file inside the chosen sub-directory.
    spam_path : str, optional
        Root directory that contains the corpus sub-directories.
    directory : str, optional
        Explicit sub-directory of ``spam_path`` to read from, for corpora
        other than the two defaults.

    Returns
    -------
    email.message.EmailMessage
        The message parsed from the file's bytes with ``email.policy.default``.
    """
    # Import the submodules explicitly: `import email` on its own does not
    # load `email.parser`, so `email.parser.BytesParser` only resolves when
    # some other module happens to have imported it already.
    from email import policy
    from email.parser import BytesParser

    if directory is None:
        directory = "spam_2" if is_spam else "easy_ham_2"
    # Binary mode lets the parser handle the message's own encodings.
    with open(os.path.join(spam_path, directory, filename), "rb") as f:
        return BytesParser(policy=policy.default).parse(f)

In [21]:
# Parse every listed file into a message object, ham first and then spam.
easy_ham_emails = [
    load_email(is_spam=False, filename=fname)
    for fname in easy_ham_filenames
]
spam_emails = [
    load_email(is_spam=True, filename=fname)
    for fname in spam_filenames
]

In [24]:
# Content of the first easy-ham message, with surrounding whitespace stripped.
easy_ham_emails[0].get_content().strip()

'Date:        Tue, 20 Aug 2002 17:27:47 -0500\n    From:        Chris Garrigues <cwg-exmh@DeepEddy.Com>\n    Message-ID:  <1029882468.3116.TMDA@deepeddy.vircio.com>\n\n\n  | I\'m hoping that all people with no additional sequences will notice are\n  | purely cosmetic changes.\n\nWell, first, when exmh (the latest one with your changes) starts, I get...\n\ncan\'t read "flist(totalcount,unseen)": no such element in array\n    while executing\n"if {$flist(totalcount,$mhProfile(unseen-sequence)) > 0} {\n\tFlagInner spool iconspool labelup\n    } else {\n\tFlagInner down icondown labeldown\n    }"\n    (procedure "Flag_MsgSeen" line 3)\n    invoked from within\n"Flag_MsgSeen"\n    (procedure "MsgSeen" line 8)\n    invoked from within\n"MsgSeen $msgid"\n    (procedure "MsgShow" line 12)\n    invoked from within\n"MsgShow $msgid"\n    (procedure "MsgChange" line 17)\n    invoked from within\n"MsgChange 4862 show"\n    invoked from within\n"time [list MsgChange $msgid $show"\n    (procedure "M

In [28]:
# Content of the spam message at index 10, with surrounding whitespace stripped.
spam_emails[10].get_content().strip()

'Yes we do purchase uncollected Judicial Judgements!!!            st10                           .           \n\nIf you, your company or an acquaintance have an uncollected Judicial Judgement then please call us and find out how we can help you receive the money that the court states you are rightfully due.\n\nWe have strong interest in acquiring uncollected Judicial Judgements in your City and Area.\n\nJ T C is the largest firm in the world specializing in the purchase and collection of Judicial Judgements.\n\nCurrently we are processing over 455 million dollars worth of judgements in the United States alone. We have associate offices in virtually every city in the US and in most foreign countries.\n\nYou have nothing to lose and everything to gain by calling. There is absolutely no cost to you.\n\nWe can be reached Toll free at 1-888-557-5744. in the US or if you are in Canada call 1-310-842-3521. You can call 24 hours per day.\n\nThank you for your time.\n\n\n\n\n\n\n\n+++++++++++++

In [29]:
def get_email_structure(email):
    """Describe the MIME layout of a message as a string.

    A plain string is handed back unchanged. A message whose payload is a
    list of parts yields ``"multipart(<part>, <part>, ...)"``, with each part
    described recursively; any other message yields its content type.
    """
    if isinstance(email, str):
        return email

    parts = email.get_payload()
    if not isinstance(parts, list):
        # Leaf message: its own content type is the whole description.
        return email.get_content_type()

    described_parts = map(get_email_structure, parts)
    return "multipart({})".format(", ".join(described_parts))

In [31]:
from collections import Counter
def structures_counter(emails):
    """Count how many of ``emails`` share each structure string.

    Returns a ``Counter`` keyed by the value of ``get_email_structure`` for
    each message.
    """
    return Counter(get_email_structure(message) for message in emails)

In [32]:
# Structure tally for the easy-ham messages, most frequent first.
structures_counter(easy_ham_emails).most_common()

[('text/plain', 1343),
 ('multipart(text/plain, application/pgp-signature)', 35),
 ('multipart(text/plain, text/html)', 12),
 ('text/html', 2),
 ('multipart(text/plain, application/x-patch)', 1),
 ('multipart(multipart(text/plain, multipart(text/plain), text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(multipart(text/plain, text/html), image/jpeg, image/gif, image/gif, image/gif, image/gif)',
  1),
 ('multipart(text/plain, application/ms-tnef)', 1),
 ('multipart(text/plain, text/plain, text/plain)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1)]

In [33]:
# Structure tally for the spam messages, most frequent first.
structures_counter(spam_emails).most_common()

[('text/plain', 597),
 ('text/html', 589),
 ('multipart(text/plain, text/html)', 114),
 ('multipart(text/html)', 29),
 ('multipart(text/plain)', 25),
 ('multipart(multipart(text/html))', 18),
 ('multipart(multipart(text/plain, text/html))', 5),
 ('multipart(text/plain, application/octet-stream, text/plain)', 3),
 ('multipart(text/html, text/plain)', 2),
 ('multipart(text/html, image/jpeg)', 2),
 ('multipart(multipart(text/plain), application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(multipart(text/plain, text/html), image/jpeg, image/jpeg, image/jpeg, image/jpeg, image/jpeg)',
  1),
 ('multipart(multipart(text/plain, text/html), image/jpeg, image/jpeg, image/jpeg, image/jpeg, image/gif)',
  1),
 ('text/plain charset=us-ascii', 1),
 ('multipart(multipart(text/html), image/gif)', 1),
 ('multipart(multipart(text/plain, text/html), application/octet-stream, application/octet-stream, applic