In [1]:
import os
import tarfile
import urllib

In [2]:
DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets", "spam")

In [3]:
def fetch_spam_data(spam_url=SPAM_URL, spam_path=SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in (("ham.tar.bz2", HAM_URL), ("spam.tar.bz2", SPAM_URL)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=SPAM_PATH)
        tar_bz2_file.close()

In [None]:
fetch_spam_data()

In [None]:
HAM_DIR = os.path.join(SPAM_PATH, "easy_ham")
SPAM_DIR = os.path.join(SPAM_PATH, "spam")
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]

In [None]:
import email
import email.policy

In [None]:
def load_email(is_spam, filename, spam_path=SPAM_PATH):
    directory = "spam" if is_spam else "easy_ham"
    with open(os.path.join(spam_path, directory, filename), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [None]:
ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]

In [None]:
print(ham_emails[1].get_content().strip())

In [18]:
print(spam_emails[6].get_content().strip())

Help wanted.  We are a 14 year old fortune 500 company, that is
growing at a tremendous rate.  We are looking for individuals who
want to work from home.

This is an opportunity to make an excellent income.  No experience
is required.  We will train you.

So if you are looking to be employed from home with a career that has
vast opportunities, then go:

http://www.basetel.com/wealthnow

We are looking for energetic and self motivated people.  If that is you
than click on the link and fill out the form, and one of our
employement specialist will contact you.

To be removed from our link simple go to:

http://www.basetel.com/remove.html


4139vOLW7-758DoDY1425FRhM1-764SMFc8513fCsLl40


In [19]:
def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return "multipart({})".format(", ".join([
            get_email_structure(sub_email)
            for sub_email in payload
        ]))
    else:
        return email.get_content_type()

In [20]:
from collections import Counter

def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures

In [21]:
structures_counter(ham_emails).most_common()
structures_counter(spam_emails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 1),
 ('multipart(text/html, text/plain)', 1),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

In [22]:
import numpy as np

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
X = np.array(ham_emails + spam_emails)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=4, random_state=42)

In [26]:
import re
from html import unescape

def html_to_plain_text(html):
    text = re.sub('<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub('<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub('<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)


In [27]:
html_spam_emails = [email for email in X_train[y_train==1]
                    if get_email_structure(email) == "text/html"]
sample_html_spam = html_spam_emails[7]
print(sample_html_spam.get_content().strip()[:1000], "...")

<html><body><pre>

______________________________________________________________________
______________________________________________________________________

LOWEST RATES IN 45 YEARS -
FILL OUT OUR SHORT APPLICATION FOR AN UNBELIEVABLE 3.50 - 5.0 % MORTGAGE
(APR)

  () HOME REFINANCING
  () HOME IMPROVEMENT
  () DEBT-CONSOLIDATION
  () CASH-OUT

Please Click <a href="http://210.192.106.2/mg/index.html">HERE</a>
for our short application.

The following are NO problem and will not stop you from getting the
financing you need:

  *** Can't show income
  *** Self-Employed
  *** Credit Problems
  *** Recent Bankruptcy
  *** Unconventional Loan

We are a direct lender and we have hundreds of loan programs available.
If we don't have a program that works for you, we have hundreds of wholesale
relationships with other lenders. So no matter which of our
50 states you live in, we likely have a program that could meet your
needs.

Please Click <a href="http://210.192.106.2/mg/index.html">HER

In [28]:
print(html_to_plain_text(sample_html_spam.get_content())[:1000], "...")


______________________________________________________________________
______________________________________________________________________
LOWEST RATES IN 45 YEARS -
FILL OUT OUR SHORT APPLICATION FOR AN UNBELIEVABLE 3.50 - 5.0 % MORTGAGE
(APR)
  () HOME REFINANCING
  () HOME IMPROVEMENT
  () DEBT-CONSOLIDATION
  () CASH-OUT
Please Click  HYPERLINK HERE
for our short application.
The following are NO problem and will not stop you from getting the
financing you need:
  *** Can't show income
  *** Self-Employed
  *** Credit Problems
  *** Recent Bankruptcy
  *** Unconventional Loan
We are a direct lender and we have hundreds of loan programs available.
If we don't have a program that works for you, we have hundreds of wholesale
relationships with other lenders. So no matter which of our
50 states you live in, we likely have a program that could meet your
needs.
Please Click  HYPERLINK HERE
for our short application.
* We DO NOT resell or disseminate your email address. You are NOT
re

In [29]:
def email_to_text(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except: # in case of encoding issues
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        else:
            html = content
    if html:
        return html_to_plain_text(html)

In [30]:
import nltk
from sklearn.base import BaseEstimator, TransformerMixin

url_extractor = None

stemmer = nltk.PorterStemmer()

class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts = Counter(text.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)

In [31]:
X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

array([Counter({'number': 30, 'the': 20, 'you': 14, 'i': 14, 'in': 12, 'to': 11, 'thi': 11, 'and': 11, 'a': 7, 'your': 7, 'are': 6, 'day': 6, 't': 5, 'for': 5, 'is': 5, 'over': 5, 'of': 5, 'don': 4, 'with': 4, 'be': 4, 'we': 4, 'receiv': 4, 'call': 3, 'or': 3, 's': 3, 'fl': 3, 'not': 3, 'have': 3, 'by': 3, 'email': 3, 'want': 2, 'get': 2, 'player': 2, 'on': 2, 'one': 2, 'again': 2, 'do': 2, 'first': 2, 've': 2, 'last': 2, 'sent': 2, 'my': 2, 'back': 2, 'l': 2, 'money': 2, 'c': 2, 'me': 2, 'free': 2, 'lead': 2, 'train': 2, 'will': 2, 'fax': 2, 'immedi': 2, 'mail': 2, 'send': 2, 'remov': 2, 'subject': 2, 'from': 2, 'powerhous': 1, 'gift': 1, 'program': 1, 'miss': 1, 'founder': 1, 'major': 1, 'onc': 1, 'where': 1, 'privat': 1, 'invit': 1, 'expert': 1, 'fastest': 1, 'way': 1, 'huge': 1, 'cash': 1, 'flow': 1, 'ever': 1, 'conceiv': 1, 'leverag': 1, 'into': 1, 'question': 1, 'here': 1, 'either': 1, 'wealthi': 1, 'which': 1, 'am': 1, 'toss': 1, 'financi': 1, 'lifelin': 1, 'sake': 1, 'hope': 1,

In [32]:
from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    def fit(self, X, y=None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.most_common_ = most_common
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}
        return self
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))

In [33]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors

<3x11 sparse matrix of type '<class 'numpy.longlong'>'
	with 31 stored elements in Compressed Sparse Row format>

In [34]:
X_few_vectors.toarray()

array([[300,  20,  11,  14,  30,  11,  14,   7,  12,   4,   5],
       [108,   4,   2,   0,   5,   1,   4,   1,   0,   1,   1],
       [270,  17,  13,   7,   2,   5,   1,   5,   2,   7,   6]],
      dtype=int64)

In [35]:
from sklearn.pipeline import Pipeline

In [36]:
preprocess_pipeline = Pipeline([
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer()),
])

In [37]:
preprocess_pipeline

Pipeline(memory=None,
         steps=[('email_to_wordcount',
                 EmailToWordCounterTransformer(lower_case=True,
                                               remove_punctuation=True,
                                               replace_numbers=True,
                                               replace_urls=True, stemming=True,
                                               strip_headers=True)),
                ('wordcount_to_vector',
                 WordCounterToVectorTransformer(vocabulary_size=1000))],
         verbose=False)

In [38]:
X_train_transformed = preprocess_pipeline.fit_transform(X_train)

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [40]:
log_clf = LogisticRegression(solver="lbfgs", random_state=42)

In [41]:
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)
score.mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV]  ................................................................
[CV] .................................... , score=0.987, total=   0.2s
[CV]  ................................................................
[CV] .................................... , score=0.986, total=   0.2s
[CV]  ................................................................
[CV] .................................... , score=0.992, total=   0.2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.6s finished


0.9883189803029482

In [42]:
from sklearn.metrics import precision_score, precision_score 

In [43]:
X_test_transformed = preprocess_pipeline.transform(X_test)
X_test_transformed

<4x1001 sparse matrix of type '<class 'numpy.longlong'>'
	with 288 stored elements in Compressed Sparse Row format>

In [44]:
log_clf = LogisticRegression(solver="lbfgs", random_state=42)
log_clf = LogisticRegression(max_iter=1000)
log_clf.fit(X_train_transformed, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [45]:
y_pred = log_clf.predict(X_test_transformed)
y_pred

array([0, 0, 0, 0])

NameError: name 'sfsf' is not defined