# Spam Classifier
***Based on the the classifier from Oreilly's Hnads-on Machine Learning with Scikit-Learn and Tensorflow***

In [1]:
from platform import python_version
print(python_version())

3.8.3


In [2]:
import os 
import tarfile
from six.moves import urllib

In [3]:
URL_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = URL_ROOT + "20021010_easy_ham.tar.bz2"
SPAM_URL = URL_ROOT + "20030228_spam.tar.bz2"

DIR_ROOT = "Datasets/spamnham"

In [4]:
def fetch_file(url_root=URL_ROOT, dir_root=DIR_ROOT):
    if not os.path.isdir(DIR_ROOT): 
        os.makedirs(DIR_ROOT)
    if not os.path.isdir(os.path.join(DIR_ROOT, "extracted")):
        os.makedirs(os.path.join(DIR_ROOT, "extracted"))
    for url, filename in ((SPAM_URL, "spam"), (HAM_URL, "ham")):
        path = os.path.join(dir_root, filename)
        if not os.path.isdir(path):
            urllib.request.urlretrieve(url, path)
        tar = tarfile.open(path)
        tar.extractall(path="Datasets/spamnham/extracted")
        tar.close()
    


In [5]:
fetch_file()

In [6]:
import email
import email.policy

def get_email(dir_root=os.path.join(DIR_ROOT, "extracted")):
    pre_ham_files = os.listdir(os.path.join(dir_root, "easy_ham_2"))
    pre_spam_files = os.listdir(os.path.join(dir_root, "spam"))
    ham_files = []
    spam_files = []
    for ham in pre_ham_files: 
        if len(ham)>20:
            ham_files.append(ham)
    for spam in pre_spam_files: 
        if len(spam)>20:
            spam_files.append(spam)
            
    ham_emails=[]
    spam_emails=[]
    
    for file in ham_files:
        filepath = os.path.join(dir_root, "easy_ham_2", file)
        with open(filepath, 'rb') as f: 
            ham_emails.append(email.parser.BytesParser(policy=email.policy.default).parse(f))
    
    for file in spam_files:
        filepath = os.path.join(dir_root, "spam", file)
        with open(filepath, 'rb') as f: 
            spam_emails.append(email.parser.BytesParser(policy=email.policy.default).parse(f))
    
    return ham_emails, spam_emails
        

In [7]:
ham_emails, spam_emails = get_email()

In [8]:
type(ham_emails[0])

email.message.EmailMessage

In [9]:
print(ham_emails[0].get_content())

On Sun, 21 Jul 2002 10:51:51 -0500
Brian Fahrlander <kilroy@kamakiriad.com> wrote:

> [branching the thread, here]
> 
>     I found another Ximian repository- I don't know if it works yet...
> 
> rpm     http://gstreamer.net/releases/redhat/ redhat-73-i386 deps
> rpm-src http://gstreamer.net/releases/redhat/ redhat-73-i386 deps
> 
> rpm     http://gstreamer.net/releases/redhat/ redhat-73-i386 deps ximian
> rpm-src http://gstreamer.net/releases/redhat/ redhat-73-i386 deps ximian
> 
> rpm     http://gstreamer.net/releases/redhat/ redhat-73-i386 deps gnomehide
> rpm-src http://gstreamer.net/releases/redhat/ redhat-73-i386 deps gnomehide
> 
>     These guys are EXTREMELY apt-friendly.  Unlike most multimedia projects, they seem to prefer RPM/Apt over the older methods.  Isn't that cool?
> 
>     Hey- how would I have known to "apt-get install gnome-session" to kick all this off?
> 
> ------------------------------------------------------------------------
> Brian Fahrländer              Li

In [10]:
def get_structure(email):
    if isinstance(email, str): 
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return "multipart({})".format(",".join([get_structure(sub_email) for sub_email in payload]))
    else: 
        return email.get_content_type()
    

In [11]:
from collections import Counter

def count_structures(emails):
    count = Counter()
    for email in emails:
        structure = get_structure(email)
        count[structure] += 1
    return count

In [12]:
out = count_structures(spam_emails)
out

Counter({'text/plain': 218,
         'multipart(text/plain,text/html)': 45,
         'multipart(text/plain)': 19,
         'text/html': 183,
         'multipart(text/html)': 20,
         'multipart(multipart(text/plain,text/html),image/gif)': 1,
         'multipart(multipart(text/html))': 5,
         'multipart/alternative': 1,
         'multipart(text/plain,image/jpeg)': 3,
         'multipart(text/html,application/octet-stream)': 2,
         'multipart(text/plain,application/octet-stream)': 1,
         'multipart(multipart(text/html),application/octet-stream,image/jpeg)': 1,
         'multipart(text/html,text/plain)': 1})

In [13]:
from sklearn.model_selection import train_test_split
import numpy as np

X = np.array(ham_emails + spam_emails)
y = np.array([0]*len(ham_emails) + [1]*len(spam_emails))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 15)

  X = np.array(ham_emails + spam_emails)


In [14]:
from bs4 import BeautifulSoup

def html_to_text(html):
    soup = BeautifulSoup(html, 'html.parser')
    return soup.text

In [15]:
def email_to_text(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except: # in case of encoding issues
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        else:
            html = content
    if html:
        return html_to_text(html)

In [16]:
for email in ham_emails:
    if not type(email_to_text(email)) == 'str':
        print(type(email_to_text(email)))

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


In [17]:
html_emails = []
for email in spam_emails: 
    if get_structure(email) == 'text/html':
        html_emails.append(email)

In [18]:
out = email_to_text(html_emails[10])
print(out)



Toy



ABC's Good Morning America ranks it the #1 Christmas Toy of the season! 
"The new 3-inch mini remote control cars are out of stock everywhere! 
    Parents are searching frantically but having no luck. There are millions of 
    kids expecting these for the Holiday season, lets hope somebody gets them 
    in or Santa may be in trouble!" Dianne Sawyer, Nov 2002
Sold Out in all stores accross the country. Retail price is $59.99. We have 
    limited stock and Free shipping for only $29.95!
Check out this Years Hottest Toy!
 
unsubscribe 
    forever 






# Creating Pipeline

In [19]:
from sklearn.base import BaseEstimator, TransformerMixin
import nltk
import re
import urlextract

class EmailToCountTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            url_extractor = urlextract.URLExtract()
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts = Counter(text.split())
            stemmer = nltk.PorterStemmer()
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)
                

In [20]:
from scipy.sparse import csr_matrix
class CountToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    def fit(self, X, y=None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.most_common_ = most_common
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}
        return self
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))

In [21]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ("email to word count", EmailToCountTransformer()),
    ("count to csr_matrix", CountToVectorTransformer())
])

# Training the model

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X = pipeline.fit_transform(X_train)
type(X)

scipy.sparse.csr.csr_matrix

In [23]:
logReg = LogisticRegression(solver ="liblinear", random_state=42)
score = cross_val_score(logReg, X, y_train, cv=3, verbose=3)
score.mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV]  ................................................................
[CV] .................................... , score=0.972, total=   0.1s
[CV]  ................................................................
[CV] .................................... , score=0.970, total=   0.0s
[CV]  ................................................................
[CV] .................................... , score=0.974, total=   0.1s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s finished


0.972369696449964

In [24]:
from sklearn.metrics import precision_score, recall_score

X_test_transformed = pipeline.transform(X_test)

log_clf = LogisticRegression(solver="liblinear", random_state=42)
log_clf.fit(X, y_train)

y_pred = log_clf.predict(X_test_transformed)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

Precision: 98.13%
Recall: 94.59%
