# Building Spam-Email Classifier

*  A spam classifier labels incoming emails as spam or non-spam (ham), which helps system administrators filter mail and block spam.

### Step 1- Get the SpamAssassin Dataset from their Website

In [1]:
#Importing necessary libraries
#fetch data
import os
import tarfile
import urllib
import urllib.request
#parse email
import email
import email.parser
import email.policy
#Explore emails
from collections import Counter
#preprocessing
import re
from html import unescape
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
#model building & evaluation
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score,accuracy_score
import warnings
# Silence every warning category for the rest of the notebook to keep cell output short.
# NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

In [2]:
# Download locations of the SpamAssassin public corpus archives (ham = non-spam)
# and the local directory where the archives are stored and extracted.
DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets", "spam")

In [3]:
#define a function to fetch data from spamassassin website
def fetch_spam_data(spam_url=SPAM_URL, spam_path=SPAM_PATH, ham_url=HAM_URL):
    '''Download the ham and spam archives and extract them into ``spam_path``.

    Parameters
    ----------
    spam_url : str
        URL of the spam tarball.
    spam_path : str
        Local directory that receives both the downloaded archives and
        their extracted contents. Created if it does not exist.
    ham_url : str
        URL of the ham (non-spam) tarball.

    An archive that is already present on disk is not downloaded again,
    but it is extracted on every call.
    '''
    os.makedirs(spam_path, exist_ok=True)
    for filename, url in (("ham.tar.bz2", ham_url), ("spam.tar.bz2", spam_url)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        # NOTE(review): extractall() trusts the member paths stored in the archive;
        # only use this with archives from a trusted source.
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(path) as tar_bz2_file:
            # Extract next to the archive (the caller-supplied directory).
            tar_bz2_file.extractall(path=spam_path)

In [4]:
# Download the archives (skipped when already on disk) and extract them under SPAM_PATH.
fetch_spam_data()

In [5]:
#load data
HAM_DIR = os.path.join(SPAM_PATH, 'easy_ham')
SPAM_DIR = os.path.join(SPAM_PATH, 'spam')
# Keep only entries whose name is longer than 20 characters, in sorted order.
ham_filenames = sorted(name for name in os.listdir(HAM_DIR) if len(name) > 20)
spam_filenames = sorted(name for name in os.listdir(SPAM_DIR) if len(name) > 20)
               
              

### Step 2 - Parse the Emails

In [6]:
#define a function to parse email
def load_email(is_spam,filename,spam_path=SPAM_PATH):
    '''Read one message file in binary mode and return the parsed email object.

    The file is looked up in the "spam" sub-directory of ``spam_path`` when
    ``is_spam`` is true, otherwise in "easy_ham".
    '''
    subfolder = "easy_ham"
    if is_spam:
        subfolder = "spam"
    message_path = os.path.join(spam_path, subfolder, filename)
    with open(message_path, "rb") as raw_file:
        parser = email.parser.BytesParser(policy=email.policy.default)
        return parser.parse(raw_file)

In [7]:
#get parsed emails as lists
# One parsed message per file name, in the same order as the file-name lists.
ham_emails=[load_email(is_spam=False,filename=name) for name in ham_filenames]
spam_emails=[load_email(is_spam=True,filename=name) for name in spam_filenames]

In [8]:
# Number of loaded messages, shown as a (ham, spam) tuple.
print((len(ham_emails),len(spam_emails)))

(2500, 500)


### Step 3 - Explore the Emails

In [9]:
#check content: print the body of one sample ham email, with surrounding whitespace stripped
print(ham_emails[2].get_content().strip())

Man Threatens Explosion In Moscow 

Thursday August 22, 2002 1:40 PM
MOSCOW (AP) - Security officers on Thursday seized an unidentified man who
said he was armed with explosives and threatened to blow up his truck in
front of Russia's Federal Security Services headquarters in Moscow, NTV
television reported.
The officers seized an automatic rifle the man was carrying, then the man
got out of the truck and was taken into custody, NTV said. No other details
were immediately available.
The man had demanded talks with high government officials, the Interfax and
ITAR-Tass news agencies said. Ekho Moskvy radio reported that he wanted to
talk with Russian President Vladimir Putin.
Police and security forces rushed to the Security Service building, within
blocks of the Kremlin, Red Square and the Bolshoi Ballet, and surrounded the
man, who claimed to have one and a half tons of explosives, the news
agencies said. Negotiations continued for about one and a half hours outside
the building, ITAR-

In [10]:
# Print the body of one sample spam email for comparison with the ham sample.
print(spam_emails[6].get_content().strip())

Help wanted.  We are a 14 year old fortune 500 company, that is
growing at a tremendous rate.  We are looking for individuals who
want to work from home.

This is an opportunity to make an excellent income.  No experience
is required.  We will train you.

So if you are looking to be employed from home with a career that has
vast opportunities, then go:

http://www.basetel.com/wealthnow

We are looking for energetic and self motivated people.  If that is you
than click on the link and fill out the form, and one of our
employement specialist will contact you.

To be removed from our link simple go to:

http://www.basetel.com/remove.html


4139vOLW7-758DoDY1425FRhM1-764SMFc8513fCsLl40


In [11]:
#check type of email structures: single part / multipart, text/plain, text/html, etc.
def get_email_structure(email):
    '''Return a string describing the MIME layout of ``email``.

    A string argument is returned unchanged. A message whose payload is a
    list of sub-parts is rendered as "multipart(<part>, <part>, ...)" with
    each part described recursively. Any other message is described by its
    content type.
    '''
    if isinstance(email, str):
        return email
    parts = email.get_payload() #get message objects
    if not isinstance(parts, list):
        return email.get_content_type()
    described = []
    for part in parts:
        described.append(get_email_structure(part))
    return "multipart({})".format(", ".join(described))

In [12]:
#define a function to count different email structures.
def structures_counter(emails):
    '''Return a Counter mapping each email structure string to how many emails have it.'''
    return Counter(get_email_structure(message) for message in emails)

In [13]:
# Structure -> count for the ham emails, converted to a plain dict for display.
ham_count=dict(structures_counter(ham_emails))
ham_count

{'text/plain': 2408,
 'multipart(text/plain, application/pgp-signature)': 66,
 'multipart(text/plain, text/html)': 8,
 'multipart(text/plain, text/enriched)': 1,
 'multipart(text/plain, application/ms-tnef, text/plain)': 1,
 'multipart(text/plain)': 3,
 'multipart(text/plain, application/octet-stream)': 2,
 'multipart(text/plain, text/plain)': 4,
 'multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)': 1,
 'multipart(text/plain, video/mng)': 1,
 'multipart(text/plain, multipart(text/plain))': 1,
 'multipart(text/plain, application/x-pkcs7-signature)': 1,
 'multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)': 1,
 'multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))': 1,
 'multipart(text/plain, application/x-java-applet)': 1}

In [14]:
# Structure -> count for the spam emails, converted to a plain dict for display.
spam_count=dict(structures_counter(spam_emails))
spam_count

{'text/html': 183,
 'text/plain': 218,
 'multipart(text/plain, application/octet-stream)': 1,
 'multipart(text/html)': 20,
 'multipart(text/plain, text/html)': 45,
 'multipart(text/plain)': 19,
 'multipart(text/html, text/plain)': 1,
 'multipart(text/html, application/octet-stream)': 2,
 'multipart(multipart(text/html))': 5,
 'multipart(text/plain, image/jpeg)': 3,
 'multipart(multipart(text/html), application/octet-stream, image/jpeg)': 1,
 'multipart(multipart(text/plain, text/html), image/gif)': 1,
 'multipart/alternative': 1}

In [15]:
#structures found only in spam emails (set difference of the structure keys)
spam_only=set(spam_count.keys())-set(ham_count.keys())
spam_only

{'multipart(multipart(text/html))',
 'multipart(multipart(text/html), application/octet-stream, image/jpeg)',
 'multipart(multipart(text/plain, text/html), image/gif)',
 'multipart(text/html)',
 'multipart(text/html, application/octet-stream)',
 'multipart(text/html, text/plain)',
 'multipart(text/plain, image/jpeg)',
 'multipart/alternative',
 'text/html'}

In [16]:
#structures found in both spam & non-spam emails (set intersection of the structure keys)
set(spam_count.keys())&set(ham_count.keys())

{'multipart(text/plain)',
 'multipart(text/plain, application/octet-stream)',
 'multipart(text/plain, text/html)',
 'text/plain'}

### Observations:
* Both classes contain single-part as well as multipart emails.
* The vast majority of non-spam emails are single-part text/plain: 2408 of the 2500 non-spam examples (~96%) fall into this category.
* About 44% of the spam examples (218 of 500) are also single-part text/plain. This is the most common type in the entire dataset.
* All the single-part text/html emails in the dataset are spam. This could be a good indicator for finding spam emails.
* All of the multipart non-spam emails contain a text/plain subpart, and most of the multipart spam emails contain a text/html subpart.
* There are also some structures common to both classes, so structure alone cannot separate them. Next we will build a model that distinguishes the two classes from the email text.

### Step 4 - Preprocess the Data

In [87]:
#define dependent & independent variables.
# dtype=object stores each parsed message as one opaque element, so NumPy does not
# try to inspect the message objects as nested sequences.
X=np.array(ham_emails+spam_emails, dtype=object)
# Labels: 0 for ham, 1 for spam, in the same order as X.
y=np.array([0]*len(ham_emails)+[1]*len(spam_emails))
print(X.shape,y.shape)

(3000,) (3000,)


In [89]:
#train_test_split
# Hold out 20% of the emails as a test set; random_state is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(2400,) (2400,)
(600,) (600,)


In [91]:
#preprocessing
#html to plain text conversion using regex
def html_to_plain_text(html):
    '''Convert an HTML string to rough plain text using regular expressions.

    Steps, in order: drop the whole <head>...</head> section, replace every
    opening <a ...> tag with the word HYPERLINK, strip all remaining tags,
    collapse runs of blank lines into a single newline, and finally decode
    HTML entities such as &amp; or &nbsp;.
    '''
    # Raw strings keep backslash sequences such as \s intact for the regex engine.
    text = re.sub(r'<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)  # drop the head section
    text = re.sub(r'<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)  # <a ...> -> HYPERLINK
    text = re.sub(r'<.*?>', '', text, flags=re.M | re.S)  # strip every remaining tag
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)  # collapse blank lines
    return unescape(text)

In [19]:
# Collect the training spam emails whose structure is exactly "text/html" and pick one sample.
html_spam_emails = [email for email in X_train[y_train==1]
                    if get_email_structure(email) == "text/html"]
sample_html_spam = html_spam_emails[7]
# Preview only the first 1000 characters of the raw HTML body.
print(sample_html_spam.get_content().strip()[:1000], "...")

<HTML><HEAD><TITLE></TITLE><META http-equiv="Content-Type" content="text/html; charset=windows-1252"><STYLE>A:link {TEX-DECORATION: none}A:active {TEXT-DECORATION: none}A:visited {TEXT-DECORATION: none}A:hover {COLOR: #0033ff; TEXT-DECORATION: underline}</STYLE><META content="MSHTML 6.00.2713.1100" name="GENERATOR"></HEAD>
<BODY text="#000000" vLink="#0033ff" link="#0033ff" bgColor="#CCCC99"><TABLE borderColor="#660000" cellSpacing="0" cellPadding="0" border="0" width="100%"><TR><TD bgColor="#CCCC99" valign="top" colspan="2" height="27">
<font size="6" face="Arial, Helvetica, sans-serif" color="#660000">
<b>OTC</b></font></TD></TR><TR><TD height="2" bgcolor="#6a694f">
<font size="5" face="Times New Roman, Times, serif" color="#FFFFFF">
<b>&nbsp;Newsletter</b></font></TD><TD height="2" bgcolor="#6a694f"><div align="right"><font color="#FFFFFF">
<b>Discover Tomorrow's Winners&nbsp;</b></font></div></TD></TR><TR><TD height="25" colspan="2" bgcolor="#CCCC99"><table width="100%" border="0" 

In [23]:
# The same sample after HTML-to-text conversion (first 1000 characters).
print(html_to_plain_text(sample_html_spam.get_content())[:1000], "...")


OTC
 Newsletter
Discover Tomorrow's Winners 
For Immediate Release
Cal-Bay (Stock Symbol: CBYI)
Watch for analyst "Strong Buy Recommendations" and several advisory newsletters picking CBYI.  CBYI has filed to be traded on the OTCBB, share prices historically INCREASE when companies get listed on this larger trading exchange. CBYI is trading around 25 cents and should skyrocket to $2.66 - $3.25 a share in the near future.
Put CBYI on your watch list, acquire a position TODAY.
REASONS TO INVEST IN CBYI
A profitable company and is on track to beat ALL earnings estimates!
One of the FASTEST growing distributors in environmental & safety equipment instruments.
Excellent management team, several EXCLUSIVE contracts.  IMPRESSIVE client list including the U.S. Air Force, Anheuser-Busch, Chevron Refining and Mitsubishi Heavy Industries, GE-Energy & Environmental Research.
RAPIDLY GROWING INDUSTRY
Industry revenues exceed $900 million, estimates indicate that there could be as much as $25 billi

In [92]:
#extract the text of an email, converting html to plain text when needed
def email_to_text(email):
    '''Return the text content of a parsed email, or None if it has none.

    The message parts are walked in order. The first text/plain part found
    is returned as-is. If there is no text/plain part, the last text/html
    part seen is converted with html_to_plain_text() and returned. Parts of
    any other content type are ignored.
    '''
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if ctype not in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except Exception: # in case of encoding issues; does not swallow KeyboardInterrupt/SystemExit
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        html = content
    if html:
        return html_to_plain_text(html)
    return None

In [97]:
#define email to word counter transformer
# No URL extractor is configured, so the URL replacement step below is skipped.
url_extractor = None
stemmer = nltk.PorterStemmer()

class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    '''Transform each email into a Counter of word occurrences.

    Parameters
    ----------
    strip_headers : bool
        Stored for configuration purposes; not read by transform().
    lower_case : bool
        Lower-case the text before any other processing.
    remove_punctuation : bool
        Replace every run of non-word characters with a single space.
    replace_urls : bool
        Replace URLs with the token " URL " (only when the module-level
        ``url_extractor`` is set).
    replace_numbers : bool
        Replace each number with the token "NUMBER".
    stemming : bool
        Merge counts of words that share the same stem (only when the
        module-level ``stemmer`` is set).
    '''
    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
    def fit(self, X, y=None):
        '''Nothing is learned from the data; returns self.'''
        return self
    def transform(self, X, y=None):
        '''Return an object array holding one word-count Counter per email in X.'''
        X_transformed = []
        for email in X:
            # Emails without any text part yield None; treat them as empty text.
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(text)))
                # Replace longer URLs first so a shorter URL that is a prefix
                # of a longer one does not break the longer match.
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                # Integer part, optional fraction, optional exponent. The fraction
                # and the exponent are independent, so "3.14" is a single NUMBER.
                text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts = Counter(text.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        # Explicit object dtype: one Counter per element, also for empty input.
        return np.array(X_transformed, dtype=object)

In [98]:
# Try the word-count transformer on the first three training emails.
X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

array([Counter({'chuck': 1, 'murcko': 1, 'wrote': 1, 'stuff': 1, 'yawn': 1, 'r': 1}),
       Counter({'the': 11, 'of': 9, 'and': 8, 'all': 3, 'christian': 3, 'to': 3, 'by': 3, 'jefferson': 2, 'i': 2, 'have': 2, 'superstit': 2, 'one': 2, 'on': 2, 'been': 2, 'ha': 2, 'half': 2, 'rogueri': 2, 'teach': 2, 'jesu': 2, 'some': 1, 'interest': 1, 'quot': 1, 'http': 1, 'www': 1, 'postfun': 1, 'com': 1, 'pfp': 1, 'worboi': 1, 'html': 1, 'thoma': 1, 'examin': 1, 'known': 1, 'word': 1, 'do': 1, 'not': 1, 'find': 1, 'in': 1, 'our': 1, 'particular': 1, 'redeem': 1, 'featur': 1, 'they': 1, 'are': 1, 'alik': 1, 'found': 1, 'fabl': 1, 'mytholog': 1, 'million': 1, 'innoc': 1, 'men': 1, 'women': 1, 'children': 1, 'sinc': 1, 'introduct': 1, 'burnt': 1, 'tortur': 1, 'fine': 1, 'imprison': 1, 'what': 1, 'effect': 1, 'thi': 1, 'coercion': 1, 'make': 1, 'world': 1, 'fool': 1, 'other': 1, 'hypocrit': 1, 'support': 1, 'error': 1, 'over': 1, 'earth': 1, 'six': 1, 'histor': 1, 'american': 1, 'john': 1, 'e': 1, 're

In [101]:
#define WordCounter To Vector Transformer class.
from scipy.sparse import csr_matrix
class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    '''Convert word-count mappings into a sparse count matrix.

    fit() builds a vocabulary of the ``vocabulary_size`` most common words,
    numbered from 1. transform() produces one row per input with
    ``vocabulary_size + 1`` columns; column 0 accumulates the counts of all
    words that are not in the vocabulary.
    '''
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    def fit(self, X, y=None):
        '''Learn the vocabulary from an iterable of word-count mappings.'''
        capped_totals = Counter()
        for counts in X:
            # A single input contributes at most 10 occurrences of any word.
            capped_totals.update({word: min(count, 10) for word, count in counts.items()})
        top_words = capped_totals.most_common()[:self.vocabulary_size]
        self.most_common_ = top_words
        self.vocabulary_ = {word: rank for rank, (word, _) in enumerate(top_words, start=1)}
        return self
    def transform(self, X, y=None):
        '''Return a CSR matrix of word counts indexed by the learned vocabulary.'''
        entries = [
            (row_index, self.vocabulary_.get(word, 0), count)
            for row_index, counts in enumerate(X)
            for word, count in counts.items()
        ]
        rows = [entry[0] for entry in entries]
        cols = [entry[1] for entry in entries]
        data = [entry[2] for entry in entries]
        # Duplicate (row, 0) entries for unknown words are summed by csr_matrix.
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))

In [102]:
# Vectorize the three sample word counts with a small 10-word vocabulary.
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors

<3x11 sparse matrix of type '<class 'numpy.intc'>'
	with 20 stored elements in Compressed Sparse Row format>

In [103]:
# Dense view: column 0 holds counts of words outside the vocabulary,
# columns 1-10 follow the vocabulary index assigned in fit().
X_few_vectors.toarray()

array([[  6,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [112,  11,   9,   8,   3,   1,   0,   1,   3,   0,   1],
       [ 92,   0,   1,   2,   3,   4,   5,   3,   1,   4,   2]],
      dtype=int32)

In [106]:
#Create a Pipeline to Transform the Entire Dataset
# Chains the two transformers: emails -> word counts -> sparse count vectors.
preprocess_pipeline = Pipeline([
    ("email_to_wordcount",EmailToWordCounterTransformer()),
    ("wordcount_to_vector",WordCounterToVectorTransformer()),
])

In [113]:
# Fit the preprocessing (vocabulary) on the training set only, then apply it to the test set.
X_train_transformed = preprocess_pipeline.fit_transform(X_train)
X_test_transformed = preprocess_pipeline.transform(X_test)

# 3-fold cross-validation of logistic regression on the training set; show the mean fold score.
log_clf = LogisticRegression(random_state=42)
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3,n_jobs=-1,verbose=3)
score.mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.5s finished


0.9862500000000001

### Step 5 - Build & Evaluate the Model

In [114]:
#fitting the model.
# Train on the full training set, then predict labels for the held-out test set.
log_clf = LogisticRegression(random_state=42)
log_clf.fit(X_train_transformed, y_train)
y_pred = log_clf.predict(X_test_transformed)

In [117]:
# Report test-set accuracy, precision and recall as percentages with two decimals.
print("Accuracy: {:.2f}%".format(100 * accuracy_score(y_test, y_pred)))
print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

Accuracy: 99.33%
Precision: 97.89%
Recall: 97.89%


## Result:
The spam classifier built with Logistic Regression reaches 97.89% precision (with 97.89% recall and 99.33% accuracy) on the held-out test set, which is a good result.