In [66]:
import os
DOWNLOAD_ROOT = "https://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + '20030228_easy_ham.tar.bz2'
SPAM_PATH = os.path.join("datasets","spam")
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"

In [67]:
import urllib
import tarfile
def fetch_housing_data(spam_url = SPAM_URL,spam_path = SPAM_PATH, ham_url = HAM_URL):
    os.makedirs(spam_path, exist_ok=True)
    for file_name, url in (('ham.tar.bz2',ham_url),("spam.tar.bz2",spam_url)):
        tgz_path = os.path.join(spam_path, file_name)
        urllib.request.urlretrieve(url,tgz_path)
        spamtgz = tarfile.open(tgz_path)
        spamtgz.extractall(path=spam_path)
        spamtgz.close()
fetch_housing_data()

### Converting from MIME

In [68]:
import glob
ham_files = glob.glob('datasets/spam/easy_ham/*')
spam_files = glob.glob('datasets/spam/spam/*')
print(len(ham_files),len(spam_files))

2501 501


In [69]:
SPAM_PATH

'datasets/spam'

In [70]:
import email
import email.policy

def load_emails(is_spam,file_name, spam_path = SPAM_PATH):
    directory = "spam" if is_spam else "easy_ham"
    with open(os.path.join(spam_path, directory ,file_name),'rb') as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)


In [71]:
hams_emails = [load_emails(is_spam=False,file_name= name.split('/')[-1]) for name in ham_files]
spams_emails = [load_emails(is_spam=True,file_name= name.split('/')[-1]) for name in spam_files]

In [72]:

print(hams_emails[1].get_content().strip())

http://news.bbc.co.uk/1/hi/entertainment/showbiz/2308581.stm

Tuesday, 8 October, 2002, 07:55 GMT 08:55 UK
Lennon killer seeks parole again

The man who shot dead former Beatle John Lennon is making another bid for
early release from prison - the day before what would have been Lennon's
62nd birthday.
Mark David Chapman, 47, was jailed for life after he admitted killing the
superstar outside his New York apartment building in 1980.
It is the second time in two years that Chapman has sought parole from
Attica state prison.
At a 2000 hearing, he argued that he was no longer a danger to society and
had overcome the psychological problems which led him to shoot the
ex-Beatle.
Chapman had said that a voice in his head told him to shoot the star.
Shot dead
Lennon was shot four times as he emerged from a limousine outside his New
York City apartment on 8 December 1980.
He and his wife Yoko Ono were returning from a late-night recording session
during which time they had been working on Walkin

In [73]:
def get_email_structure(email):
    if isinstance(email, str):
        return email

    payload  = email.get_payload()
    if isinstance(payload,list):
        return "multipart{}".format(", ".join([get_email_structure(sub_email) for sub_email in payload]))
    else:
        return email.get_content_type()

In [74]:
from collections import Counter

def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    
    return structures

In [75]:
structures_counter(hams_emails).most_common()

[('text/plain', 2409),
 ('multiparttext/plain, application/pgp-signature', 66),
 ('multiparttext/plain, text/html', 8),
 ('multiparttext/plain, text/plain', 4),
 ('multiparttext/plain', 3),
 ('multiparttext/plain, application/octet-stream', 2),
 ('multiparttext/plain, application/x-pkcs7-signature', 1),
 ('multiparttext/plain, video/mng', 1),
 ('multiparttext/plain, multiparttext/plain, text/plain, multipartmultiparttext/plain, application/x-pkcs7-signature',
  1),
 ('multiparttext/plain, application/x-java-applet', 1),
 ('multiparttext/plain, multiparttext/plain, text/plain, text/rfc822-headers',
  1),
 ('multiparttext/plain, application/ms-tnef, text/plain', 1),
 ('multipartmultiparttext/plain, text/plain, text/plain, application/pgp-signature',
  1),
 ('multiparttext/plain, text/enriched', 1),
 ('multiparttext/plain, multiparttext/plain', 1)]

In [76]:
structures_counter(spams_emails).most_common()

[('text/plain', 219),
 ('text/html', 183),
 ('multiparttext/plain, text/html', 45),
 ('multiparttext/html', 20),
 ('multiparttext/plain', 19),
 ('multipartmultiparttext/html', 5),
 ('multiparttext/plain, image/jpeg', 3),
 ('multiparttext/html, application/octet-stream', 2),
 ('multipart/alternative', 1),
 ('multipartmultiparttext/plain, text/html, image/gif', 1),
 ('multiparttext/html, text/plain', 1),
 ('multiparttext/plain, application/octet-stream', 1),
 ('multipartmultiparttext/html, application/octet-stream, image/jpeg', 1)]

In [77]:
for header, value in spams_emails[0].items():
    print(header,":",value)

Return-Path : <antheaygd@chinchilla.freeserve.co.uk>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (jalapeno [127.0.0.1])	by zzzzason.org (Postfix) with ESMTP id B141E16F03	for <zzzz@localhost>; Wed, 25 Sep 2002 00:17:07 +0100 (IST)
Received : from jalapeno [127.0.0.1]	by localhost with IMAP (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Wed, 25 Sep 2002 00:17:07 +0100 (IST)
Received : from webnote.net (mail.webnote.net [193.120.211.219]) by    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g8ONBYC27124 for    <zzzz@jmason.org>; Wed, 25 Sep 2002 00:11:34 +0100
Received : from mail1.codetel.net.do (m30exfe1.codetel.net.do    [196.3.81.56]) by webnote.net (8.9.3/8.9.3) with ESMTP id AAA21947;    Wed, 25 Sep 2002 00:12:07 +0100
Received : from mail-in.pol.net.uk ([64.32.101.241]) by    mail1.codetel.net.do with Microsoft SMTPSVC(5.0.2195.5329); Tue,    24 Sep 2002 19:11:08 -0400
From : Ingrid Marksberry <antheaygd@chinchilla.freeserve.co.uk>
To : M

In [80]:
spams_emails[2]["Subject"]

'Free Shipping on all orders at Blair.com'

In [84]:
import numpy as np
from sklearn.model_selection import train_test_split

X = np.array(hams_emails + spams_emails, dtype= object)
y = np.array([0] * len(hams_emails) + [1] * len(spams_emails))

In [88]:
X_train, X_test, y_train, y_test = train_test_split( X, y,test_size= 0.2,random_state=42)

In [89]:
import re
from html import unescape

def html_to_plain_text(html):
    text = re.sub('.*?', '', html, flags=re.M | re.S | re.I)
    text = re.sub('', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub('<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

In [90]:
html_spam_emails = [email for email in X_train[y_train==1]
                    if get_email_structure(email) == "text/html"]
sample_html_spam = html_spam_emails[7]
print(sample_html_spam.get_content().strip()[:1000], "...")

<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=euc-kr">
<title>요즘 뜨는 직종 Best 5 : 금융 / IT / 방송 / 뷰티 / 인테리어</title>
<meta name="generator" content="Namo WebEditor v5.0">
</head>
<BODY text=black vLink=purple aLink=red link=blue bgColor=white>
<TABLE cellSpacing=0 cellPadding=0 width=672 border=0>
<TBODY>
<TR>
<TD width=7 bgColor=#ffffff><IMG height=1
src="http://image.hanmail.net/hanmail/general/trans.gif" width=7><BR></TD>
<TD vAlign=top width="99%" bgColor=#ffffff><IMG height=105
src="http://image.hanmail.net/hanmail/s_img/recruit/sp_top01.gif"
width=221><IMG height=105
src="http://image.hanmail.net/hanmail/s_img/recruit/sp_top02.gif"
width=442>
<TABLE cellSpacing=0 cellPadding=0 width="100%" border=0>
<TBODY>
<TR>
<TD width=5><IMG height=8
src="http://image.hanmail.net/hanmail/s_img/recruit/g_raound01.gif"
width=5><BR></TD>
<TD width="97%" bgColor=#cbd940></TD>
<TD width=13><IMG height=8
src="http://image.hanmail.net/hanmail/s_img/recruit/g_raound02.gif"
wid

In [91]:
print(html_to_plain_text(sample_html_spam.get_content())[:1000], "...")

 HYPERLINK  ...


In [92]:
def email_to_text(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except: # in case of encoding issues
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        else:
            html = content
    if html:
        return html_to_plain_text(html)

In [93]:
print(email_to_text(sample_html_spam)[:100], "...")


 HYPERLINK  ...


In [94]:
try:
    import nltk

    stemmer = nltk.PorterStemmer()
    for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Compulsive"):
        print(word, "=>", stemmer.stem(word))
except ImportError:
    print("Error: stemming requires the NLTK module.")
    stemmer = None

Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls


In [95]:
try:
    import urlextract # may require an Internet connection to download root domain names
    
    url_extractor = urlextract.URLExtract()
    print(url_extractor.find_urls("Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s"))
except ImportError:
    print("Error: replacing URLs requires the urlextract module.")
    url_extractor = None

['github.com', 'https://youtu.be/7Pq-S557XQU?t=3m32s']


In [97]:
from sklearn.base import BaseEstimator, TransformerMixin

class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts = Counter(text.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)

In [98]:
X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

array([Counter({'is': 7, 'number': 5, 's': 4, 'guarante': 4, 'a': 4, 'on': 3, 'at': 3, 'but': 3, 'i': 3, 'that': 3, 'no': 3, 'm': 3, 'the': 3, 'linux': 3, 'also': 2, 'about': 2, 'french': 2, 'consum': 2, 'law': 2, 'without': 2, 'so': 2, 'softwar': 2, 'deliv': 2, 'in': 2, 'these': 2, 'if': 2, 'thi': 2, 'not': 2, 'all': 2, 'with': 2, 'least': 2, 'which': 2, 'get': 2, 'you': 2, 'ie': 2, 'mon': 1, 'oct': 1, 'numberam': 1, 'ciaran': 1, 'johnston': 1, 'wrote': 1, 'there': 1, 'some': 1, 'stuff': 1, 'forbid': 1, 'sale': 1, 'of': 1, 'anyth': 1, 'as': 1, 'breach': 1, 'franc': 1, 'didn': 1, 't': 1, 'realli': 1, 'follow': 1, 'my': 1, 'bit': 1, 'iffi': 1, 'day': 1, 'true': 1, 'doe': 1, 'it': 1, 'nullifi': 1, 'microsoft': 1, 'adob': 1, 'and': 1, 'winzip': 1, 'licenc': 1, 'amongst': 1, 'most': 1, 'other': 1, 'claim': 1, 'liabil': 1, 'say': 1, 'fault': 1, 'they': 1, 'are': 1, 'honest': 1, 'appar': 1, 'angl': 1, 'e': 1, 'sell': 1, 'product': 1, 'sold': 1, 'servic': 1, 'licens': 1, 'what': 1, 'rememb': 

In [100]:
from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    def fit(self, X, y=None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}
        return self
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))

In [101]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors

<3x11 sparse matrix of type '<class 'numpy.int64'>'
	with 29 stored elements in Compressed Sparse Row format>

In [102]:
X_few_vectors.toarray()

array([[141,   3,   4,   7,   5,   3,   3,   4,   2,   2,   3],
       [148,  10,   5,   2,   6,   7,   3,   2,   4,   1,   0],
       [ 89,   9,   4,   3,   0,   1,   1,   0,   0,   3,   3]])

In [103]:
vocab_transformer.vocabulary_

{'the': 1,
 'a': 2,
 'is': 3,
 'number': 4,
 'i': 5,
 'that': 6,
 's': 7,
 'in': 8,
 'if': 9,
 'linux': 10}

In [104]:
from sklearn.pipeline import Pipeline

preprocess_pipeline = Pipeline([
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer()),
])

X_train_transformed = preprocess_pipeline.fit_transform(X_train)

In [105]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)
score.mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ................................ score: (test=0.986) total time=   0.5s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV] END ................................ score: (test=0.986) total time=   0.5s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.0s remaining:    0.0s


[CV] END ................................ score: (test=0.978) total time=   0.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.6s finished


0.9833390553474822

In [106]:
from sklearn.metrics import precision_score, recall_score

X_test_transformed = preprocess_pipeline.transform(X_test)

log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_test_transformed)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

Precision: 99.01%
Recall: 96.15%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
