In [8]:
# %reset
import os
import shutil
import tarfile
import urllib.request

# s.replace(os.sep,ntpath.sep)

ROOT = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(""))), "machine-learning")
DATASET_P = os.path.join(ROOT, "datasets/spam_classifier")
DESTINATION_P = os.path.join(DATASET_P, "data")
DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"

dir_names = {
    "easy_ham": "20021010_easy_ham.tar.bz2",
    "easy_ham2": "20030228_easy_ham.tar.bz2",
    "hard_ham": "20021010_hard_ham.tar.bz2",
    "hard_ham2": "20030228_hard_ham.tar.bz2",
    "spam": "20021010_spam.tar.bz2",
    "spam2": "20030228_spam.tar.bz2",
    "spam3": "20030228_spam_2.tar.bz2",
    "spam4": "20050311_spam_2.tar.bz2",
}

def get_url(file_name: str) -> str:
    return DOWNLOAD_ROOT + file_name

def get_spam_data(urls: dict[str, str], destination_path: str, cleanup: bool = False):
    if not downloaded(destination_path):
        extraction_path = destination_path + "/raw"
        if not os.path.isdir(destination_path):
            os.makedirs(destination_path)
        if not os.path.isdir(extraction_path):
            os.makedirs(extraction_path)
        for key, file_name in urls.items():
            path = os.path.join(extraction_path, key + ".tar.bz2")
            if not os.path.isfile(path):
                urllib.request.urlretrieve(get_url(file_name), path)
            tar_bz_file = tarfile.open(path)
            tar_bz_file.extractall(path=destination_path)
            tar_bz_file.close()
        if cleanup:
            shutil.rmtree(extraction_path)
    else:
        return

def downloaded(path: str) -> bool: 
    # if os.path.isdir(path) and len(os.listdir(path)) > 0:
    #     return True
    return False 

In [9]:
get_spam_data(dir_names, DESTINATION_P, cleanup=False)

In [10]:
import email
import email.policy

def load_email(data_path: str):
    with open(data_path, "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

def load_email_to_dict(root_dir: str, dir_names: dict[str, str]) -> dict[str, list[str, email.message.EmailMessage]]:
    ret_dict = {}
    for dir in os.listdir(root_dir):
        if dir != "raw":
            path = root_dir + "/" + dir
            ret_dict[dir] = [load_email(os.path.join(path, name)) for name in [name for name in sorted(os.listdir(path)) if len(name) > 20]]
    return ret_dict


In [11]:
email_dict = load_email_to_dict(DESTINATION_P, dir_names)

In [12]:
print(email_dict["spam_2"][0].get_content().strip())

Greetings!

You are receiving this letter because you have expressed an interest in 
receiving information about online business opportunities. If this is 
erroneous then please accept my most sincere apology. This is a one-time 
mailing, so no removal is necessary.

If you've been burned, betrayed, and back-stabbed by multi-level marketing, 
MLM, then please read this letter. It could be the most important one that 
has ever landed in your Inbox.

MULTI-LEVEL MARKETING IS A HUGE MISTAKE FOR MOST PEOPLE

MLM has failed to deliver on its promises for the past 50 years. The pursuit 
of the "MLM Dream" has cost hundreds of thousands of people their friends, 
their fortunes and their sacred honor. The fact is that MLM is fatally 
flawed, meaning that it CANNOT work for most people.

The companies and the few who earn the big money in MLM are NOT going to 
tell you the real story. FINALLY, there is someone who has the courage to 
cut through the hype and lies and tell the TRUTH about MLM.



In [13]:
def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return "multipart({})".format(", ".join([
            get_email_structure(sub_email)
            for sub_email
            in payload
        ]))
    else:
        return email.get_content_type()

In [14]:
from collections import Counter

def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] +=1
    return structures

In [15]:
structures_counter(email_dict["easy_ham"]).most_common()

[('text/plain', 4861),
 ('multipart(text/plain, application/pgp-signature)', 138),
 ('multipart(text/plain, text/html)', 16),
 ('multipart(text/plain, text/plain)', 8),
 ('multipart(text/plain)', 6),
 ('multipart(text/plain, application/octet-stream)', 4),
 ('multipart(text/plain, text/enriched)', 2),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 2),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  2),
 ('multipart(text/plain, video/mng)', 2),
 ('multipart(text/plain, multipart(text/plain))', 2),
 ('multipart(text/plain, application/x-pkcs7-signature)', 2),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  2),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  2),
 ('multipart(text/plain, application/x-java-applet)', 2)]

In [16]:
structures_counter(email_dict["hard_ham"]).most_common()

[('text/html', 238),
 ('text/plain', 167),
 ('multipart(text/plain, text/html)', 81),
 ('multipart(text/html)', 4),
 ('multipart(text/plain, image/bmp)', 2),
 ('multipart(multipart(text/plain, text/html))', 2),
 ('multipart(text/plain, application/x-pkcs7-signature)', 2),
 ('multipart(text/plain, image/png, image/png)', 2),
 ('multipart(multipart(text/plain, text/html), image/gif, image/gif, image/gif, image/gif, image/gif, image/gif, image/gif, image/gif, image/gif, image/gif, image/gif, image/jpeg, image/gif, image/gif, image/gif, image/gif, image/gif, image/gif)',
  1),
 ('multipart(text/plain, text/plain)', 1)]

In [17]:
structures_counter(email_dict["spam"]).most_common()

[('text/plain', 440),
 ('text/html', 364),
 ('multipart(text/plain, text/html)', 90),
 ('multipart(text/html)', 39),
 ('multipart(text/plain)', 38),
 ('multipart(multipart(text/html))', 10),
 ('multipart(text/plain, image/jpeg)', 6),
 ('multipart(text/html, application/octet-stream)', 4),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/html, text/plain)', 2),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 2),
 ('multipart(multipart(text/plain, text/html), image/gif)', 2),
 ('multipart/alternative', 2)]

In [18]:
def print_email_header(email: email.message.EmailMessage):
    for k, v in email.items():
        print(f"{k}: {v}")

In [19]:
test_sub = email_dict["spam_2"][0]
print_email_header(test_sub)

Return-Path: <ilug-admin@linux.ie>
Delivered-To: yyyy@localhost.netnoteinc.com
Received: from localhost (localhost [127.0.0.1])	by phobos.labs.netnoteinc.com (Postfix) with ESMTP id 9E1F5441DD	for <jm@localhost>; Tue,  6 Aug 2002 06:48:09 -0400 (EDT)
Received: from phobos [127.0.0.1]	by localhost with IMAP (fetchmail-5.9.0)	for jm@localhost (single-drop); Tue, 06 Aug 2002 11:48:09 +0100 (IST)
Received: from lugh.tuatha.org (root@lugh.tuatha.org [194.125.145.45]) by    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g72LqWv13294 for    <jm-ilug@jmason.org>; Fri, 2 Aug 2002 22:52:32 +0100
Received: from lugh (root@localhost [127.0.0.1]) by lugh.tuatha.org    (8.9.3/8.9.3) with ESMTP id WAA31224; Fri, 2 Aug 2002 22:50:17 +0100
Received: from bettyjagessar.com (w142.z064000057.nyc-ny.dsl.cnc.net    [64.0.57.142]) by lugh.tuatha.org (8.9.3/8.9.3) with ESMTP id WAA31201 for    <ilug@linux.ie>; Fri, 2 Aug 2002 22:50:11 +0100
Received: from 64.0.57.142 [202.63.165.34] by bettyjagessar.com   

In [20]:
test_sub["Subject"]

'[ILUG] STOP THE MLM INSANITY'

In [21]:
import numpy as np
from sklearn.model_selection import train_test_split

def simplify_dict(email_dict: dict, keys_map: dict[str, list[str]]) -> dict[list]:
    ret_dict = {}
    for k, v in keys_map.items():
        if len(v) == 1:
            ret_dict[k] = v
        else:
            concatenated = []
            for sub in v:
                concatenated.extend(email_dict[sub])
            ret_dict[k] = concatenated
    return ret_dict

map_keys = {
    "ham": [k for k in email_dict.keys() if k not in ["spam", "spam_2"]],
    "spam": ["spam", "spam_2"]
}
d = simplify_dict(email_dict, map_keys)

X = np.array(d["ham"] + d["spam"], dtype=object)
y = np.array([0] * len(d["ham"]) + [1] * len(d["spam"]))

X_tr, X_tst, y_tr, y_tst = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
import re
from html import unescape

def html_to_plain_text(html):
    text = re.sub('<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub('<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub('<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

  text = re.sub('<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)


In [23]:
html_spam_emails = [email for email in X_tr[y_tr == 1] if get_email_structure(email) == "text/html"]
html_spam_emails

[<email.message.EmailMessage at 0x10ff6e480>,
 <email.message.EmailMessage at 0x11e88d9d0>,
 <email.message.EmailMessage at 0x11ee238c0>,
 <email.message.EmailMessage at 0x11f5630b0>,
 <email.message.EmailMessage at 0x11f4b8440>,
 <email.message.EmailMessage at 0x11f140d70>,
 <email.message.EmailMessage at 0x11f465c40>,
 <email.message.EmailMessage at 0x10ffd2ea0>,
 <email.message.EmailMessage at 0x11f4674d0>,
 <email.message.EmailMessage at 0x11f467e30>,
 <email.message.EmailMessage at 0x10ffd16d0>,
 <email.message.EmailMessage at 0x11c1fe7e0>,
 <email.message.EmailMessage at 0x10fde34d0>,
 <email.message.EmailMessage at 0x10fef9be0>,
 <email.message.EmailMessage at 0x11f486960>,
 <email.message.EmailMessage at 0x11e827650>,
 <email.message.EmailMessage at 0x11c09a7e0>,
 <email.message.EmailMessage at 0x11c007a40>,
 <email.message.EmailMessage at 0x11c5a3e00>,
 <email.message.EmailMessage at 0x11c02f3b0>,
 <email.message.EmailMessage at 0x11ee16d80>,
 <email.message.EmailMessage at 0x

In [24]:
sample = html_spam_emails[5]
print(sample.get_content().strip()[:1000])

<html><head><meta http-equiv=Content-Language content=tr><meta http-equiv=Content-Type content="text/html; charset=windows-1254"><title>TR Rehber 6</title></head>
<body background="http://xyzlm22.sitemynet.com/back1.gif>" topmargin=1 leftmargin=1 bgcolor="#CECFFF">
      <p align="center">
      <img border="2" src="http://xyzlm22.sitemynet.com/banner1.gif" align="center" width="610" height="81"><div align="center">
  <center>
  <table border=1 cellspacing=1 style="border-collapse: collapse" bordercolor="#111111" width=612 id=AutoNumber1 height=77 bgcolor="#CECFFF" background="http://xyzlm22.sitemynet.com/back1.gif"><tr>
    <td width=606 height=16 bgcolor="#00FFFF"><p align=center><b><font face=Arial color="#FF0000"> <marquee scrolldelay=100 scrollamount=9>REKLAMLARINIZA SERVET ÖDEMEYiN!!! UCUZ, KOLAY VE KALICI REKLAMLARINIZ iCiN BiZi ARAYINIZ...</marquee></font></b></td></tr><tr>
      <td width=606 height=43 bgcolor="#00FFFF"><p align=center><font face=Tahoma size=2 color="#000000">

In [25]:
print(html_to_plain_text(sample.get_content())[:1000], "...")


" topmargin=1 leftmargin=1 bgcolor="#CECFFF">
     REKLAMLARINIZA SERVET ÖDEMEYiN!!! UCUZ, KOLAY VE KALICI REKLAMLARINIZ iCiN BiZi ARAYINIZ...
    Bu ileti size islerinizi kolaylastirmak, satislarinizi arttirmak, kisacasi
  milyonlara sesinizi duyurmak icin gönderilmistir. Bu türden tanitimlarla
  ilgilenmiyorsaniz bu iletiyi, "ilgilenecegini düsündügünüz" tanidiklariniza
  gönderiniz. Bu iletinin size ve tanidiklariniza kazandiracaklarina
  inanamayacaksiniz!.Herseye ragmen bizden e-mail almak istemiyor ve satisa sundugumuz listelerden cikmak istiyorsaniz
  bu iletiyi bos olarak cevaplamaniz yeterli olacaktir.
  Tüm önerilerinizi dikkate alip degerlendiriyoruz.. Lütfen olumlu olumsuz
  elestirileriniz icin web sayfamizdaki iletisim formunu kullaniniz..
 
      TR Rehber 8.0 Güncellenme Tarihi 10 Temmuz 2002'dir.. TR Rehber ve World Rehberi aldiginizda bir sonraki güncellemede  herhangi bir ücret ödemeyeceksiniz.. Daha önceden alis-veris de bulundugumuz müsterilerimize %50 indirim imk

In [26]:
def email_to_text(email):
    html = None
    for part in email.walk():
        ctype = email.get_content_type()
        if ctype not in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except:
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content 
        else:
            html = content
    if html:
        return html_to_plain_text(html)

In [27]:
from nltk import PorterStemmer

stemmer = PorterStemmer()

In [28]:
import urlextract
from sklearn.base import BaseEstimator, TransformerMixin

class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(
            self,
            strip_headers=True,
            lower_case=True,
            replace_urls=True,
            replace_numbers=True,
            remove_punctuation=True,
            stemming=True,
            url_extractor=None,
            stemmer=None
    ) -> None:
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.remove_punctuation = remove_punctuation
        self.stemming = stemming

        self.url_extractor = url_extractor or urlextract.URLExtract()
        self.stemmer = stemmer or PorterStemmer()
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls and self.url_extractor is None:
                urls=list(set(self.url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                text = re.sub(r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?", "NUMBER", text)
            if self.remove_punctuation:
                text = re.sub(r"\W+", " ", text, flags=re.M)
            word_counts = Counter(text.split())
            if self.stemming and self.stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = self.stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)


In [29]:
X_try = X_tr[:3]
X_try_wc = EmailToWordCounterTransformer().fit_transform(X_try)
X_try_wc

array([Counter({'and': 10, 'number': 8, 'of': 7, 'agent': 6, 'the': 6, 'to': 6, 'in': 6, 'activebuddi': 5, 'interact': 5, 'technolog': 4, 'a': 4, 'for': 4, 'messag': 4, 'is': 4, 'smarterchild': 4, 'say': 4, 'seattl': 4, 'buddyscript': 3, 'an': 3, 'with': 3, 'server': 3, 'weather': 3, 'wa': 3, 'need': 2, 'product': 2, 'deploy': 2, 's': 2, 'solut': 2, 'provid': 2, 'custom': 2, 'constitu': 2, 'by': 2, 'it': 2, 'your': 2, 'convers': 2, 'softwar': 2, 'end': 2, 'mike': 2, 'you': 2, 'current': 2, 'condit': 2, 'f': 2, 'just': 2, 'type': 2, 'forecast': 2, 'kind': 1, 'interest': 1, 'applic': 1, 'mix': 1, 'telnet': 1, 'eliza': 1, 'wonder': 1, 'if': 1, 'knownow': 1, 'thi': 1, 'as': 1, 'partner': 1, 'http': 1, 'www': 1, 'com': 1, 'index': 1, 'shtml': 1, 'inc': 1, 'ha': 1, 'develop': 1, 'patent': 1, 'build': 1, 'power': 1, 'busi': 1, 'that': 1, 'leverag': 1, 'function': 1, 'conveni': 1, 'appeal': 1, 'instant': 1, 'effici': 1, 'immedi': 1, 'mean': 1, 'commun': 1, 'proven': 1, 'marketplac': 1, 'smash'

In [30]:
from scipy.sparse import csr_matrix

class WordCountToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000) -> None:
        self.vocabulary_size = vocabulary_size
    def fit(self, X, y=None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)
        # basically build the vocabulary based on the most common words in the attribute dataset
        # can experiment with other methods such as tf-idf
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.vocabulary_ = { word: index + 1 for index, (word, count) in enumerate(most_common)}
        return self
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size+1))

In [31]:
vocab_transformer = WordCountToVectorTransformer(vocabulary_size=10)
X_try_vc = vocab_transformer.fit_transform(X_try_wc)
X_try_vc

<3x11 sparse matrix of type '<class 'numpy.int64'>'
	with 960 stored elements in Compressed Sparse Row format>

In [32]:
X_try_vc.toarray()

array([[ 206,    6,    6,   10,    8,    6,    7,    4,    4,    1,    4],
       [ 164,   12,   11,    4,    6,    7,    5,    8,    2,    4,    1],
       [1901,  109,   34,   48,  176,   70,   54,   38,   23,   23,   34]])

In [33]:
vocab_transformer.vocabulary_

{'the': 1,
 'in': 2,
 'and': 3,
 'number': 4,
 'to': 5,
 'of': 6,
 'a': 7,
 'for': 8,
 'that': 9,
 'is': 10}

In [34]:
from sklearn.pipeline import Pipeline

preprocess_pipeline = Pipeline([
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", WordCountToVectorTransformer(vocabulary_size=10000))
])

X_tr_transformed = preprocess_pipeline.fit_transform(X_tr)

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
score = cross_val_score(log_clf, X_tr_transformed, y_tr, cv=3, verbose=3)

score.mean()

[CV] END ................................ score: (test=0.948) total time=   1.0s
[CV] END ................................ score: (test=0.958) total time=   1.8s
[CV] END ................................ score: (test=0.960) total time=   0.9s


0.9554969117389537

In [36]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

X_tst_transformed = preprocess_pipeline.fit_transform(X_tst)
log_clf.fit(X_tr_transformed, y_tr)
y_pred = log_clf.predict(X_tst_transformed)

print(f"F1 Score: {f1_score(y_tst, y_pred)}")
print(f"Precision: {precision_score(y_tst, y_pred)}")
print(f"Recall: {recall_score(y_tst, y_pred)}")
print(f"Accuracy: {accuracy_score(y_tst, y_pred)}")

F1 Score: 0.495364238410596
Precision: 0.6071428571428571
Recall: 0.41834451901565994
Accuracy: 0.7603773584905661


In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_estimators": [i for i in range(10, 100)]
}

grid_cv = GridSearchCV(RandomForestClassifier(), param_grid=param_grid)
grid_cv.fit(X_tr_transformed, y_tr)
y_grid_pred = grid_cv.predict(X_tst_transformed)

In [38]:
print(f"F1 Score: {f1_score(y_tst, y_grid_pred)}")
print(f"Precision: {precision_score(y_tst, y_grid_pred)}")
print(f"Recall: {recall_score(y_tst, y_grid_pred)}")
print(f"Accuracy: {accuracy_score(y_tst, y_grid_pred)}")

F1 Score: 0.36661698956780925
Precision: 0.5491071428571429
Recall: 0.2751677852348993
Accuracy: 0.7327044025157232
