In [5]:
import tarfile
import urllib
import os
from pathlib import Path
import email
import email.policy
from collections import Counter

# Extract the Data

In [6]:
def fetch_data():
    root_url="http://spamassassin.apache.org/old/publiccorpus/"
    ham_url=root_url+"20030228_easy_ham.tar.bz2"
    spam_url=root_url+"20030228_spam.tar.bz2"

    spam_path=Path()/"data"/"spam"
    spam_path.mkdir(parents=True, exist_ok=True)

    for dir_name, tar_name, url in (("easy_ham", "ham", ham_url), 
                                    ("spam", "spam", spam_url)):
        if not (spam_path/dir_name).is_dir():
            path=(spam_path/tar_name).with_suffix(".tar.bz2")
            print("Downloading", path)
            urllib.request.urlretrieve(url, path)

            tar_file=tarfile.open(path)
            tar_file.extractall(path=spam_path)
            tar_file.close()

    return [spam_path/dir_name for dir_name in ("easy_ham", "spam")]

In [7]:

ham_dir, spam_dir=fetch_data()

ham_filenames=[f for f in sorted(ham_dir.iterdir()) if len(f.name)>20]
spam_filenames=[f for f in sorted(spam_dir.iterdir()) if len(f.name)>20]

In [8]:
len(ham_filenames)

2500

In [9]:
len(spam_filenames)

500

In [10]:
def load_email(filepath):
    with open(filepath, "rb") as file:
        return email.parser.BytesParser(policy=email.policy.default).parse(file)

In [11]:
ham_emails=[load_email(filepath) for filepath in ham_filenames]
spam_emails=[load_email(filepath) for filepath in spam_filenames]

In [12]:
print(ham_emails[5].get_content().strip())

> I just had to jump in here as Carbonara is one of my favourites to make and 
> ask 
> what the hell are you supposed to use instead of cream? 

Isn't it just basically a mixture of beaten egg and bacon (or pancetta, 
really)? You mix in the raw egg to the cooked pasta and the heat of the pasta 
cooks the egg. That's my understanding.

Martin

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


In [13]:
print(spam_emails[5].get_content().strip())

A POWERHOUSE GIFTING PROGRAM You Don't Want To Miss! 
 
  GET IN WITH THE FOUNDERS! 
The MAJOR PLAYERS are on This ONE
For ONCE be where the PlayerS are
This is YOUR Private Invitation

EXPERTS ARE CALLING THIS THE FASTEST WAY 
TO HUGE CASH FLOW EVER CONCEIVED
Leverage $1,000 into $50,000 Over and Over Again

THE QUESTION HERE IS:
YOU EITHER WANT TO BE WEALTHY 
OR YOU DON'T!!!
WHICH ONE ARE YOU?
I am tossing you a financial lifeline and for your sake I 
Hope you GRAB onto it and hold on tight For the Ride of youR life!

Testimonials

Hear what average people are doing their first few days:
�We've received 8,000 in 1 day and we are doing that over and over again!' Q.S. in AL
 �I'm a single mother in FL and I've received 12,000 in the last 4 days.� D. S. in FL
�I was not sure about this when I sent off my $1,000 pledge, but I got back $2,000 the very next day!� L.L. in KY
�I didn't have the money, so I found myself a partner to work this with. We have received $4,000 over the last 2 days

In [14]:
def get_email_stracture(email):
    if isinstance(email, str):
        return email
    payload=email.get_payload()

    if isinstance(payload, list):
        multipart=', '.join([get_email_stracture(sub_email) for sub_email in payload])
        return f'multipart({multipart})'
    else:
        return email.get_content_type()

In [15]:
def stractures_counter(emails):
    stractures = Counter()
    for email in emails:
        starcture=get_email_stracture(email)
        stractures[starcture] += 1
    return stractures

In [16]:
stractures_counter(ham_emails).most_common()

[('text/plain', 2408),
 ('multipart(text/plain, application/pgp-signature)', 66),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [17]:
stractures_counter(spam_emails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 1),
 ('multipart(text/html, text/plain)', 1),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

# Split the Data

In [36]:
import numpy as np
from sklearn.model_selection import train_test_split

import re
from html import unescape

import nltk
import urlextract

In [21]:
X=np.array(ham_emails+spam_emails, dtype=object)
y=np.array([0]*len(ham_emails) + [1]*len(spam_emails))

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
def html_to_text(html):
    text=re.sub('<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text=re.sub('<a\s.*?>', 'HYPERLINK', text, flags=re.M | re.S | re.I)
    text=re.sub('<.*?>', '', text, flags=re.M | re.S)
    text=re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

In [31]:
def email_to_text(email):
    html=None
    for part in email.walk():
        ctype=part.get_content_type()
        if not ctype in ('text/plain', 'text/html'):
            continue
        try:
            content=part.get_content()
        except:
            content=str(part.get_payload())
        if ctype=='text/plain':
            return content
        else:
            html=content
    if html:
        return html_to_text(html)

In [41]:
stemmer=nltk.PorterStemmer()
url_extractor=urlextract.URLExtract()

# Preprocessing PipeLine

### Creating Custom Transformers

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.sparse import csr_matrix

In [42]:
class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_header=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_header=strip_header
        self.lower_case=lower_case
        self.remove_punctuation=remove_punctuation
        self.replace_urls=replace_urls
        self.replace_numbers=replace_numbers
        self.stemming=stemming
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_transformed=[]
        for email in X:
            text =email_to_text(email) or ''

            if self.lower_case:
                text=text.lower()
            if self.replace_urls and url_extractor is not None:
                urls=list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text=text.replace(url, " URL ")
            if self.replace_numbers:
                text=re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts=Counter(text.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts=Counter()
                for word, count in word_counts.items():
                    stemmed_word=stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts=stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)    


In [45]:
class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size=vocabulary_size

    def fit(self, X, y=None):
        total_count=Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)
        most_common=total_count.most_common()[:self.vocabulary_size]
        self.vocabulary_={word: index+1 for index, (word, count) in enumerate(most_common)}
        return self
    
    def transform(self, X, y=None):
        rows=[]
        columns=[]
        data=[]
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                columns.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, columns)), shape=(len(X), self.vocabulary_size+1))

### Creating the PipeLine

In [48]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score

In [49]:
preprocess_pipeline=Pipeline([
    ('email_to_wordcount', EmailToWordCounterTransformer()),
    ('wordcount_to_vector', WordCounterToVectorTransformer())
])

X_train_transformed=preprocess_pipeline.fit_transform(X_train)

In [51]:
log_clf=LogisticRegression(max_iter=1000, random_state=42)
score=cross_val_score(log_clf, X_train_transformed, y_train, cv=3)
print(score)
print(score.mean())

[0.98125 0.98125 0.99125]
0.9845833333333333


In [52]:
X_test_transformed=preprocess_pipeline.transform(X_test)

log_clf=LogisticRegression(max_iter=1000, random_state=42)
log_clf.fit(X_train_transformed, y_train)

y_pred=log_clf.predict(X_test_transformed)

print(f'Precision: {precision_score(y_test, y_pred):.2%}')
print(f'Recall: {recall_score(y_test, y_pred):.2%}')

Precision: 94.90%
Recall: 97.89%
