In [457]:
%matplotlib inline
import numpy as np
import nltk

In [40]:
from urllib.request import urlretrieve
import os
import tarfile

DOWNLOAD_ROOT = 'https://spamassassin.apache.org/old/publiccorpus/'
SPAM_FILE = '20030228_spam.tar.bz2'
HAM_FILE  = '20030228_easy_ham.tar.bz2'
DATA_PATH = 'datasets'

def fetch_spam_data():
    if not os.path.exists(DATA_PATH):
        os.makedirs(DATA_PATH)
    
    spam_path = os.path.join(DATA_PATH, SPAM_FILE)
    ham_path  = os.path.join(DATA_PATH, HAM_FILE)
    
    if not os.path.isfile(spam_path):
        urlretrieve(DOWNLOAD_ROOT + SPAM_FILE, spam_path)
            
    if not os.path.isfile(ham_path):
        urlretrieve(DOWNLOAD_ROOT + HAM_FILE, ham_path)
            
    file = tarfile.open(spam_path)
    file.extractall(path=DATA_PATH)
    file.close()
    
    file = tarfile.open(ham_path)
    file.extractall(path=DATA_PATH)
    file.close()

In [41]:
fetch_spam_data()

In [42]:
SPAM_DIR = os.path.join(DATA_PATH, 'spam')
HAM_DIR  = os.path.join(DATA_PATH, 'easy_ham')

spam_filenames = [filename for filename in sorted(os.listdir(SPAM_DIR)) if len(filename) > 10]
ham_filenames  = [filename for filename in sorted(os.listdir(HAM_DIR))  if len(filename) > 10]

In [43]:
import email

def parse_email(filename, is_spam=False):
    with open(os.path.join(SPAM_DIR if is_spam else HAM_DIR, filename), encoding='latin-1') as file:
        return email.message_from_file(file)

In [44]:
spam_emails = [parse_email(filename, is_spam=True)  for filename in spam_filenames]
ham_emails  = [parse_email(filename, is_spam=False) for filename in ham_filenames]

In [45]:
ham_emails[1604].get_content_type()

'multipart/signed'

In [46]:
from collections import Counter

content_types = [email.get_content_type() for email in ham_emails + spam_emails]
ctype_counter = Counter(content_types)
ctype_counter

Counter({'multipart/alternative': 56,
         'multipart/mixed': 53,
         'multipart/related': 12,
         'multipart/report': 2,
         'multipart/signed': 68,
         'text/html': 183,
         'text/plain': 2626})

In [162]:
def get_message_content(email):
    payload = email.get_payload()
    if isinstance(payload, str):
        return payload
    content = ''
    for part in payload:
        if isinstance(part, str):
            return ''.join(payload)
        content += get_message_content(part)
    return content

In [163]:
get_message_content(ham_emails[1604])

"Error 230 occurs when you report a signature, but the server doesn't\nknow about the signature, so it wants the full content.  It's\nbasically an optimization. =20\n\nBeyond that, I'm not sure how to interpret that output.. what version?\nVipul?\n\n--jordan\n\nOn Wed, Oct 02, 2002 at 11:59:01PM -0400, Rose, Bobby wrote:\n# What does this mean?  I set up procmailrc for a spamtrap but I'm getting\n# an error.  I also am reporting to pyzor and dcc and they aren't\n# registering an error.  What's weird is that it works sometimes.\n#=20\n# .\n# Oct 02 23:46:11.470523 report[14051]: [ 4] honor.cloudmark.com >> 20\n# Oct 02 23:46:11.470805 report[14051]: [ 6] response to sent.3\n# -res=3D1\n# err=3D230\n# .\n# Oct 02 23:46:11.471825 report[14051]: [ 5] mail 1, orig_email, special\n# case eng 1: Server accept\n# ed report.\n# Oct 02 23:46:11.472228 report[14051]: [ 8] mail 1.0, eng 4: err 230 -\n# server wants mail\n#=20\n#=20\n# -------------------------------------------------------\n# This

In [49]:
len(ham_emails), len(spam_emails)

(2500, 500)

In [314]:
from sklearn.model_selection import train_test_split

np.random.seed(42)

X = np.array(ham_emails + spam_emails)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

print(X.shape)
print(y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

(3000,)
(3000,)


In [315]:
X_train.shape, X_test.shape

((2400,), (600,))

In [316]:
y_train.shape, y_test.shape

((2400,), (600,))

In [317]:
html_emails = list(filter(lambda e: e.get_content_type() == 'text/html', ham_emails + spam_emails))
html_email = html_emails[0]

In [318]:
html_text = get_message_content(html_email)
html_text

'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">\n<HTML><HEAD>\n<META content=3D"text/html; charset=3Dwindows-1252" http-equiv=3DContent-T=\nype>\n<META content=3D"MSHTML 5.00.2314.1000" name=3DGENERATOR></HEAD>\n<BODY><!-- Inserted by Calypso -->\n<TABLE border=3D0 cellPadding=3D0 cellSpacing=3D2 id=3D_CalyPrintHeader_ r=\nules=3Dnone \nstyle=3D"COLOR: black; DISPLAY: none" width=3D"100%">\n  <TBODY>\n  <TR>\n    <TD colSpan=3D3>\n      <HR color=3Dblack noShade SIZE=3D1>\n    </TD></TR></TD></TR>\n  <TR>\n    <TD colSpan=3D3>\n      <HR color=3Dblack noShade SIZE=3D1>\n    </TD></TR></TBODY></TABLE><!-- End Calypso --><!-- Inserted by Calypso=\n --><FONT \ncolor=3D#000000 face=3DVERDANA,ARIAL,HELVETICA size=3D-2><BR></FONT></TD><=\n/TR></TABLE><!-- End Calypso --><FONT color=3D#ff0000 \nface=3D"Copperplate Gothic Bold" size=3D5 PTSIZE=3D"10">\n<CENTER>Save up to 70% on Life Insurance.</CENTER></FONT><FONT color=3D#ff=\n0000 \nface=3D"Copperplate Gothic Bold" size=3D5 P

In [420]:
import re
import html
from urlextract import URLExtract
from nltk.stem.porter import PorterStemmer

extractor = URLExtract()
stemmer = PorterStemmer()

def remove_html_tags(text):
    text = re.sub('<.*?>', ' ', text, flags=re.M | re.S)
    text = re.sub('\n', ' ', text)
    text = re.sub('\+', ' ', text)
    text = re.sub('=|%|<|>', ' ', text)
    return html.unescape(text)

def remove_punctuation(text):
    return re.sub('\.|,|;|:|\?|!|-|\'|\/', ' ', text)

def remove_brackets(text):
    return re.sub('\[|\]', '', text)

def replace_urls(text):
    urls = extractor.find_urls(remove_brackets(text))
    for url in urls:
        text = text.replace(url, ' URL ')
    return text

def replace_numbers(text):
    return re.sub('\d+(:?\d+)*(\.\d+)*', ' NUMBER ', text)

def replace_emails(text):
    return re.sub('[\w\.-]+@[\w\.-]+', ' EMAIL ', text)

def remove_stem_words(tokens):
    return [stemmer.stem(token) for token in tokens]

In [421]:
remove_html_tags(html_text)

"                                                                                    Save up to 70  on Life Insurance.     Why Spend More Than You Have To?     Life Quote Savings                                                 Ensurin  g your        family's financial security is very important. Life Quote Savings ma  kes        buying life insurance simple and affordable. We Provide FREE Access   to The        Very Best Companies and The Lowest Rates.                                                        Life Quote Savings  is FAST, EAS  Y and              SAVES you money! Let us help you get started with the best val  ues in              the country on new coverage. You can SAVE hundreds or even tho  usands              of dollars by requesting a FREE quote from Lifequote Savings.   Our              service will take you less than 5 minutes to complete. Shop an  d              compare. SAVE up to 70  on all types of Life insurance!                                              Click 

In [422]:
remove_punctuation(remove_html_tags(html_text))

'                                                                                    Save up to 70  on Life Insurance      Why Spend More Than You Have To      Life Quote Savings                                                 Ensurin  g your        family s financial security is very important  Life Quote Savings ma  kes        buying life insurance simple and affordable  We Provide FREE Access   to The        Very Best Companies and The Lowest Rates                                                         Life Quote Savings  is FAST  EAS  Y and              SAVES you money  Let us help you get started with the best val  ues in              the country on new coverage  You can SAVE hundreds or even tho  usands              of dollars by requesting a FREE quote from Lifequote Savings    Our              service will take you less than 5 minutes to complete  Shop an  d              compare  SAVE up to 70  on all types of Life insurance                                               Click 

In [423]:
replace_numbers(remove_punctuation(remove_html_tags(html_text)))

'                                                                                    Save up to  NUMBER   on Life Insurance      Why Spend More Than You Have To      Life Quote Savings                                                 Ensurin  g your        family s financial security is very important  Life Quote Savings ma  kes        buying life insurance simple and affordable  We Provide FREE Access   to The        Very Best Companies and The Lowest Rates                                                         Life Quote Savings  is FAST  EAS  Y and              SAVES you money  Let us help you get started with the best val  ues in              the country on new coverage  You can SAVE hundreds or even tho  usands              of dollars by requesting a FREE quote from Lifequote Savings    Our              service will take you less than  NUMBER  minutes to complete  Shop an  d              compare  SAVE up to  NUMBER   on all types of Life insurance                                  

In [424]:
replace_urls(replace_numbers(remove_punctuation(remove_html_tags(html_text))))

'                                                                                    Save up to  NUMBER   on Life Insurance      Why Spend More Than You Have To      Life Quote Savings                                                 Ensurin  g your        family s financial security is very important  Life Quote Savings ma  kes        buying life insurance simple and affordable  We Provide FREE Access   to The        Very Best Companies and The Lowest Rates                                                         Life Quote Savings  is FAST  EAS  Y and              SAVES you money  Let us help you get started with the best val  ues in              the country on new coverage  You can SAVE hundreds or even tho  usands              of dollars by requesting a FREE quote from Lifequote Savings    Our              service will take you less than  NUMBER  minutes to complete  Shop an  d              compare  SAVE up to  NUMBER   on all types of Life insurance                                  

In [425]:
phrase = "Check out this link spamassasin@gmail.com, be quick!"
replace_emails(phrase)

'Check out this link  EMAIL , be quick!'

In [427]:
phrase = "If you are in receipt of this email in error and or wish to be removed from our list".split()
remove_stem_words(remove_stop_words(phrase))

['If', 'receipt', 'email', 'error', 'wish', 'remov', 'list']

In [446]:
from sklearn.base import BaseEstimator, TransformerMixin

class WordsToCounter(BaseEstimator, TransformerMixin):
    def __init__(self, 
                 remove_html_tags=True,
                 remove_punctuation=True,
                 replace_urls=True,
                 replace_numbers=True,
                 replace_emails=True,
                 to_lowercase=True,
                 stem_words=True):
        self.remove_html_tags = remove_html_tags
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.replace_emails = replace_emails
        self.to_lowercase = to_lowercase
        self.stem_words = stem_words
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            raw_text = get_message_content(email)
            if self.to_lowercase:
                raw_text = raw_text.lower()
            if self.remove_html_tags:
                raw_text = remove_html_tags(raw_text)
            if self.replace_urls:
                raw_text = replace_urls(raw_text)
            if self.replace_numbers:
                raw_text = replace_numbers(raw_text)
            if self.replace_emails:
                raw_text = replace_emails(raw_text)
            if self.remove_punctuation:
                raw_text = remove_punctuation(raw_text)
            tokens = raw_text.split()
            if self.stem_words:
                tokens = remove_stem_words(tokens)
            X_transformed.append(Counter(tokens))
        return np.array(X_transformed)

In [465]:
from scipy.sparse.csr import csr_matrix

class WordsToVector(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=5000):
        self.vocabulary_size = vocabulary_size
        
    def fit(self, X, y=None):
        counter = Counter()
        for document in X:
            for word, count in document.items():
                counter[word] += count
        self.vocabulary_ = counter.most_common()[:self.vocabulary_size]
        self.words_ = [occurence[0] for occurence in self.vocabulary_]
        return self
    
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, document in enumerate(X):
            for word, count in document.items():
                col = self.words_.index(word) if word in self.words_ else 0
                cols.append(col)
                data.append(1)
                rows.append(row)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size))

In [466]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('counter', WordsToCounter()),
    ('vectorize', WordsToVector())
])

X_train_transformed = pipeline.fit_transform(X_train)
X_train_transformed.shape

(2400, 5000)

In [467]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression()
scores = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=True)
scores

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s finished


array([0.985  , 0.98625, 0.98875])

In [468]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier()
scores = cross_val_score(forest_clf, X_train_transformed, y_train, cv=3, verbose=True)
scores

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s finished


array([0.95375, 0.9675 , 0.975  ])

In [469]:
X_test_transformed = pipeline.fit_transform(X_test)

In [470]:
log_clf = LogisticRegression()
log_clf.fit(X_train_transformed, y_train)

print("Score on test set: {:.2f}".format(log_clf.score(X_test_transformed, y_test)))

Score on test set: 0.86


In [471]:
from sklearn.metrics import classification_report

y_log_clf_pred = log_clf.predict(X_test_transformed)

# overall f1 score is very low :/
print(classification_report(y_test, y_log_clf_pred))

             precision    recall  f1-score   support

          0       0.87      0.98      0.92       505
          1       0.66      0.20      0.31        95

avg / total       0.83      0.86      0.82       600

