In [69]:
import os 
import tarfile
import urllib.request
import email
import email.policy
from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
import re
from html import unescape
import nltk
import urlextract
from sklearn.base import BaseEstimator, TransformerMixin
from scipy import sparse
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score

In [3]:
DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets", "spam")

In [5]:
def fetch_spam_data(
        ham_url=HAM_URL,
        spam_url=SPAM_URL,
        spam_path=SPAM_PATH
):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in (("ham.tar.bz2", ham_url), ("spam.tar.bz2", spam_url)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=spam_path)
        tar_bz2_file.close()

    return spam_path

In [6]:
fetch_spam_data()

'datasets/spam'

In [8]:
HAM_DIR = os.path.join(SPAM_PATH, "easy_ham")
SPAM_DIR = os.path.join(SPAM_PATH, "spam")

In [9]:
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]

In [10]:
len(ham_filenames), len(spam_filenames)

(2500, 500)

In [12]:
def load_email(
        is_spam,
        filename,
        spam_path=SPAM_PATH
):
    directory = "spam" if is_spam else "easy_ham"
    with open(os.path.join(spam_path, directory, filename), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [13]:
ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]

In [14]:
print(
    ham_emails[0].get_content().strip(),
)

Date:        Wed, 21 Aug 2002 10:54:46 -0500
    From:        Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>
    Message-ID:  <1029945287.4797.TMDA@deepeddy.vircio.com>


  | I can't reproduce this error.

For me it is very repeatable... (like every time, without fail).

This is the debug log of the pick happening ...

18:19:03 Pick_It {exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace} {4852-4852 -sequence mercury}
18:19:03 exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury
18:19:04 Ftoc_PickMsgs {{1 hit}}
18:19:04 Marking 1 hits
18:19:04 tkerror: syntax error in expression "int ...

Note, if I run the pick command by hand ...

delta$ pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace  4852-4852 -sequence mercury
1 hit

That's where the "1 hit" comes from (obviously).  The version of nmh I'm
using is ...

delta$ pick -version
pick -- nmh-1.0.4 [compiled on fuchsia.cs.mu.OZ.AU at Sun Mar 17 14:55:56 

In [15]:
print(
    spam_emails[0].get_content().strip(),
)

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML><HEAD>
<META content="text/html; charset=windows-1252" http-equiv=Content-Type>
<META content="MSHTML 5.00.2314.1000" name=GENERATOR></HEAD>
<BODY><!-- Inserted by Calypso -->
<TABLE border=0 cellPadding=0 cellSpacing=2 id=_CalyPrintHeader_ rules=none 
style="COLOR: black; DISPLAY: none" width="100%">
  <TBODY>
  <TR>
    <TD colSpan=3>
      <HR color=black noShade SIZE=1>
    </TD></TR></TD></TR>
  <TR>
    <TD colSpan=3>
      <HR color=black noShade SIZE=1>
    </TD></TR></TBODY></TABLE><!-- End Calypso --><!-- Inserted by Calypso --><FONT 
color=#000000 face=VERDANA,ARIAL,HELVETICA size=-2><BR></FONT></TD></TR></TABLE><!-- End Calypso --><FONT color=#ff0000 
face="Copperplate Gothic Bold" size=5 PTSIZE="10">
<CENTER>Save up to 70% on Life Insurance.</CENTER></FONT><FONT color=#ff0000 
face="Copperplate Gothic Bold" size=5 PTSIZE="10">
<CENTER>Why Spend More Than You Have To?
<CENTER><FONT color=#ff0000 face="Copp

In [16]:
def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return "multipart({})".format(", ".join([
            get_email_structure(sub_email)
            for sub_email in payload
        ]))
    else:
        return email.get_content_type()

In [18]:
def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures

In [19]:
structures_counter(ham_emails).most_common()

[('text/plain', 2408),
 ('multipart(text/plain, application/pgp-signature)', 66),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [20]:
structures_counter(spam_emails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 1),
 ('multipart(text/html, text/plain)', 1),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

In [21]:
for header, value in spam_emails[0].items():
    print(header, ":", value)

Return-Path : <12a1mailbot1@web.de>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received : from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received : from dd_it7 ([210.97.77.167])	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623	for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100
From : 12a1mailbot1@web.de
Received : from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To : dcek1a1@netsgo.com
Subject : Life Insurance - Why Pay More?
Date : Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version : 1.0
Message-ID : <0103c1042001882DD_IT7@dd_it7>
Content-Type : text/html; charset="iso-8859-1"
Content-Transfer-Encoding : qu

In [24]:
spam_emails[0]["Subject"]

'Life Insurance - Why Pay More?'

In [65]:
X = np.array(ham_emails + spam_emails, dtype=object)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
print([0] * len(ham_emails) + [1] * len(spam_emails))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [41]:
def html_to_plain_text(html):
    text = re.sub(r'<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub(r'<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub(r'<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

In [42]:
html_spam_emails = [email for email in X[y == 1]
                    if get_email_structure(email) == "text/html"]

sample_html_spam = html_spam_emails[7]

In [43]:
print(
    sample_html_spam.get_content().strip()[:1000], 
    "..."
)


<HTML><HEAD><TITLE>MILFhunter</TITLE>
<META http-equiv=Content-Type
content="text/html; charset=windows-1252"><HTTP-EQUIV="PRAGMA"
CONTENT="NO-CACHE">
<SCRIPT language=JavaScript>
<!-- hide from old browsers
	function loadPage(pageURL){
	location.href = pageURL.options[pageURL.selectedIndex].value
	}
//-->
</SCRIPT>

<META content="MSHTML 6.00.2716.2200" name=GENERATOR></HEAD>
<BODY text=#eaebec vLink=#ffffcc aLink=#ffffff link=#ffffcc bgColor=#647481
leftMargin=0 background="http://www.fromyou2.com/nasty/milf/bg.jpg"
topMargin=0>
<CENTER><BR>
  <CENTER>
    <CENTER>
      <FONT face=verdana><BR>
      </FONT>
      <CENTER>
        <TABLE cellPadding=15 bgColor=gray>
          <TBODY>
          <TR>
            <TD>
              <CENTER>
                <font color="black" face="verdana"><A
      onmouseover="window.status='MILFhunter.com - Do you know where your mom is?';return true"
      href="http://www.fromyou2.com/nasty/milf/milf/bindex.htm"><IMG
      src="http://www.fromyou2.

In [44]:
print(
    html_to_plain_text(sample_html_spam.get_content())[:1000], 
    "..."
)


                 HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                 HYPERLINK MILF HUNTER
                Do you know where your mom is?
                 HYPERLINK
                MORE SAMPLE PICS      MORE SAMPLE MOVIES      LIST OF MILFs
         
         HYPERLINK CLICK
          HERE to enlarge your PENIS 3-4 inches NATURALLY!!
         
         
         HYPERLINK Click
          Here to be removed
 ...


In [45]:
def email_to_text(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except:
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        else:
            html = content
    if html:
        return html_to_plain_text(html)

In [46]:
print(email_to_text(sample_html_spam)[:1000], "...")


                 HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                     HYPERLINK
                 HYPERLINK MILF HUNTER
                Do you know where your mom is?
                 HYPERLINK
                MORE SAMPLE PICS      MORE SAMPLE MOVIES      LIST OF MILFs
         
         HYPERLINK CLICK
          HERE to enlarge your PENIS 3-4 inches NATURALLY!!
         
         
         HYPERLINK Click
          Here to be removed
 ...


In [49]:
stemmer = nltk.PorterStemmer()

for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Compulsive"):
    print(word, "=>", stemmer.stem(word))

Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls


In [53]:
url_extract = urlextract.URLExtract()
print(url_extract.find_urls("Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s"))

['github.com', 'https://youtu.be/7Pq-S557XQU?t=3m32s']


In [55]:
class EmailToWordCounterTransformer(
    BaseEstimator,
    TransformerMixin
):
    def __init__(
            self,
            strip_headers=True,
            lower_case=True,
            remove_punctuation=True,
            replace_urls=True,
            replace_numbers=True,
            stemming=True
    ):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming


    def fit(self, X, y=None):
        return self     
    
    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls and url_extract is not None:
                urls = list(set(url_extract.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)

                for url in urls:
                    text = text.replace(url, " URL ")

            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', 'NUMBER', text)

            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)

            word_counts = Counter(text.split())

            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts

            X_transformed.append(word_counts)

        return np.array(X_transformed)

In [56]:
x_few = X[:3]
x_few_wordcounts = EmailToWordCounterTransformer().fit_transform(x_few)
print(x_few_wordcounts)

[Counter({'number': 42, 'the': 15, 'pick': 9, 'lbrace': 6, 'rbrace': 6, 'i': 5, 'of': 5, 'list': 5, 'from': 4, 'is': 4, 'sequenc': 4, 'hit': 4, 'com': 3, 'thi': 3, 'inbox': 3, 'subject': 3, 'ftp': 3, 'mercuri': 3, 'command': 3, 'delta': 3, 'that': 3, 'version': 3, 'date': 2, 'deepeddi': 2, 't': 2, 'error': 2, 'exec': 2, 's': 2, 'come': 2, 'nmh': 2, 'use': 2, 'on': 2, 'url': 2, 'and': 2, 'mh_profil': 2, 'one': 2, 'exmh': 2, 'worker': 2, 'wed': 1, 'aug': 1, 'chri': 1, 'garrigu': 1, 'cwg': 1, 'numberfanumberd': 1, 'messag': 1, 'id': 1, 'tmda': 1, 'vircio': 1, 'can': 1, 'reproduc': 1, 'for': 1, 'me': 1, 'it': 1, 'veri': 1, 'repeat': 1, 'like': 1, 'everi': 1, 'time': 1, 'without': 1, 'fail': 1, 'debug': 1, 'log': 1, 'happen': 1, 'pick_it': 1, 'ftoc_pickmsg': 1, 'mark': 1, 'tkerror': 1, 'syntax': 1, 'in': 1, 'express': 1, 'int': 1, 'note': 1, 'if': 1, 'run': 1, 'by': 1, 'hand': 1, 'where': 1, 'obvious': 1, 'm': 1, 'compil': 1, 'at': 1, 'sun': 1, 'mar': 1, 'ict': 1, 'relev': 1, 'part': 1, 'my

In [61]:
class WordCounterToVectorTransformer(
    BaseEstimator,
    TransformerMixin
):
    
    def __init__(
            self,
            vocabulary_size=1000
    ):
        self.vocabulary_size = vocabulary_size

    def fit(self, X, y=None):
        total_count = Counter()
        
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)

        most_common = total_count.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {
            word: index + 1
            for index, (word, count) in enumerate(most_common)
        }
        return self
    
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        
        return sparse.csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))

In [62]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
x_few_vectors = vocab_transformer.fit_transform(x_few_wordcounts)
print(x_few_vectors.toarray())

[[173  15  42   2   1   5   9   4   2   1   4]
 [ 86   5   4   3   3   3   0   2   3   3   2]
 [200  16   5  10  11   5   0   2   3   4   1]]


In [63]:
vocab_transformer.vocabulary_

{'the': 1,
 'number': 2,
 'and': 3,
 'to': 4,
 'of': 5,
 'pick': 6,
 'from': 7,
 's': 8,
 'a': 9,
 'is': 10}

In [66]:
process_pipeline = Pipeline([
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer())
])

X_train_transformed = process_pipeline.fit_transform(X_train)

In [67]:
log_clf = LogisticRegression(
    solver="lbfgs",
    max_iter=1000,
    random_state=42
)

score = cross_val_score(
    log_clf,
    X_train_transformed,
    y_train,
    cv=3,
    verbose=3
)

[CV] END ................................ score: (test=0.981) total time=   0.2s
[CV] END ................................ score: (test=0.984) total time=   0.1s
[CV] END ................................ score: (test=0.990) total time=   0.3s


In [68]:
score.mean()

np.float64(0.985)

In [71]:
X_test_transformed = process_pipeline.transform(X_test)

log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_test_transformed)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

Precision: 96.88%
Recall: 97.89%
