# My Spam Classifier

In [1]:
from platform import python_version
print(python_version())

3.8.3


In [2]:
import os 
import tarfile
from six.moves import urllib

In [3]:
URL_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = URL_ROOT + "20030228_easy_ham_2.tar.bz2"
SPAM_URL = URL_ROOT + "20030228_spam.tar.bz2"

DIR_ROOT = "Datasets/spamnham"

In [4]:
def fetch_file(url_root=URL_ROOT, dir_root=DIR_ROOT):
    if not os.path.isdir(DIR_ROOT): 
        os.makedirs(DIR_ROOT)
    if not os.path.isdir(os.path.join(DIR_ROOT, "extracted")):
        os.makedirs(os.path.join(DIR_ROOT, "extracted"))
    for url, filename in ((SPAM_URL, "spam"), (HAM_URL, "ham")):
        path = os.path.join(dir_root, filename)
        if not os.path.isdir(path):
            urllib.request.urlretrieve(url, path)
        tar = tarfile.open(path)
        tar.extractall(path="Datasets/spamnham/extracted")
        tar.close()
    


In [5]:
fetch_file()

In [6]:
import email
import email.policy

def get_email(dir_root=os.path.join(DIR_ROOT, "extracted")):
    pre_ham_files = os.listdir(os.path.join(dir_root, "easy_ham_2"))
    pre_spam_files = os.listdir(os.path.join(dir_root, "spam"))
    ham_files = []
    spam_files = []
    for ham in pre_ham_files: 
        if len(ham)>20:
            ham_files.append(ham)
    for spam in pre_spam_files: 
        if len(spam)>20:
            spam_files.append(spam)
            
    ham_emails=[]
    spam_emails=[]
    
    for file in ham_files:
        filepath = os.path.join(dir_root, "easy_ham_2", file)
        with open(filepath, 'rb') as f: 
            ham_emails.append(email.parser.BytesParser(policy=email.policy.default).parse(f))
    
    for file in spam_files:
        filepath = os.path.join(dir_root, "spam", file)
        with open(filepath, 'rb') as f: 
            spam_emails.append(email.parser.BytesParser(policy=email.policy.default).parse(f))
    
    return ham_emails, spam_emails
        

In [7]:
ham_emails, spam_emails = get_email()

In [8]:
type(ham_emails[0])

email.message.EmailMessage

In [9]:
print(ham_emails[0].get_content())

On Sun, 21 Jul 2002 10:51:51 -0500
Brian Fahrlander <kilroy@kamakiriad.com> wrote:

> [branching the thread, here]
> 
>     I found another Ximian repository- I don't know if it works yet...
> 
> rpm     http://gstreamer.net/releases/redhat/ redhat-73-i386 deps
> rpm-src http://gstreamer.net/releases/redhat/ redhat-73-i386 deps
> 
> rpm     http://gstreamer.net/releases/redhat/ redhat-73-i386 deps ximian
> rpm-src http://gstreamer.net/releases/redhat/ redhat-73-i386 deps ximian
> 
> rpm     http://gstreamer.net/releases/redhat/ redhat-73-i386 deps gnomehide
> rpm-src http://gstreamer.net/releases/redhat/ redhat-73-i386 deps gnomehide
> 
>     These guys are EXTREMELY apt-friendly.  Unlike most multimedia projects, they seem to prefer RPM/Apt over the older methods.  Isn't that cool?
> 
>     Hey- how would I have known to "apt-get install gnome-session" to kick all this off?
> 
> ------------------------------------------------------------------------
> Brian Fahrländer              Li

In [10]:
def get_structure(email):
    if isinstance(email, str): 
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return "multipart({})".format(",".join([get_structure(sub_email) for sub_email in payload]))
    else: 
        return email.get_content_type()
    

In [11]:
from collections import Counter

def count_structures(emails):
    count = Counter()
    for email in emails:
        structure = get_structure(email)
        count[structure] += 1
    return count

In [12]:
out = count_structures(spam_emails)
out

Counter({'text/plain': 218,
         'multipart(text/plain,text/html)': 45,
         'multipart(text/plain)': 19,
         'text/html': 183,
         'multipart(text/html)': 20,
         'multipart(multipart(text/plain,text/html),image/gif)': 1,
         'multipart(multipart(text/html))': 5,
         'multipart/alternative': 1,
         'multipart(text/plain,image/jpeg)': 3,
         'multipart(text/html,application/octet-stream)': 2,
         'multipart(text/plain,application/octet-stream)': 1,
         'multipart(multipart(text/html),application/octet-stream,image/jpeg)': 1,
         'multipart(text/html,text/plain)': 1})

In [13]:
from sklearn.model_selection import train_test_split
import numpy as np

X = np.array(ham_emails + spam_emails)
y = np.array([0]*len(ham_emails) + [1]*len(spam_emails))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 15)

  X = np.array(ham_emails + spam_emails)


In [14]:
from bs4 import BeautifulSoup

def html_to_text(html):
    soup = BeautifulSoup(html, 'html.parser')
    return soup.text

In [15]:
def email_to_text(email):
    if get_structure(email) == 'text/plain':
        return email.get_content()
    if get_structure(email) == 'text/html':
        soup = html_to_text(email.get_content())
        return soup
    else: 
        return str(email.get_content())


In [16]:
out = [get_structure(email) for email in spam_emails]

In [17]:
html_emails = []
for email in spam_emails: 
    if get_structure(email) == 'text/html':
        html_emails.append(email)

In [18]:
out = email_to_text(html_emails[10])
print(out)



Toy



ABC's Good Morning America ranks it the #1 Christmas Toy of the season! 
"The new 3-inch mini remote control cars are out of stock everywhere! 
    Parents are searching frantically but having no luck. There are millions of 
    kids expecting these for the Holiday season, lets hope somebody gets them 
    in or Santa may be in trouble!" Dianne Sawyer, Nov 2002
Sold Out in all stores accross the country. Retail price is $59.99. We have 
    limited stock and Free shipping for only $29.95!
Check out this Years Hottest Toy!
 
unsubscribe 
    forever 






# Creating Pipeline

In [53]:
from sklearn.base import BaseEstimator, TransformerMixin
import nltk

class EmailToCountTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, remove_header=True, remove_punctuation=True, remove_url=True, stemmer=True): 
        self.remove_header = remove_header
        self.remove_punctuation = remove_puncuation
        self.remove_url = remove_url
        self.stemmer = stemmer
        stemmer = nltk.PorterStemmer()
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        for email in X: 
            if remove_header:
                fields = email.keys()
                for field in fields: 
                    email.__delitem__(field)
            text = email_to_text(email)
            if remove_url:

In [51]:
some_email = ham_emails[2]
print(some_email)

Delivered-To:  
Received:  
Received: from phobos [127.0.0.1]
	by localhost with IMAP (fetchmail-5.9.0)
	for jm@localhost (single-drop); Wed, 14 Aug 2002 10:52:12 +0100 (IST)
Received: from xent.com ([64.161.22.236]) by dogma.slashnull.org
    (8.11.6/8.11.6) with ESMTP id g7DJL0415153 for <jm@jmason.org>;
    Tue, 13 Aug 2002 20:21:01 +0100
Received: from lair.xent.com (localhost [127.0.0.1]) by xent.com (Postfix)
    with ESMTP id 6F2E32940A6; Tue, 13 Aug 2002 12:19:06 -0700 (PDT)
Delivered-To: fork@spamassassin.taint.org
Received: from Boron.MeepZor.Com (i.meepzor.com [204.146.167.214]) by
    xent.com (Postfix) with ESMTP id 263E72940A5 for <FoRK@Xent.Com>;
    Tue, 13 Aug 2002 12:18:17 -0700 (PDT)
Received: from Golux.Com (dmz-firewall [206.199.198.4]) by
    Boron.MeepZor.Com (8.11.6/8.11.6) with ESMTP id g7DJJKR20997;
    Tue, 13 Aug 2002 15:19:20 -0400
Message-Id:  
From:  
Organization:  
X-Mailer:  
X-Accept-Language:  
MIME-Version:  
To:  
Subject:  
Content-Type:  
Content

In [37]:
some_email.keys()

['Delivered-To',
 'Received',
 'Received',
 'Received',
 'Received',
 'Delivered-To',
 'Received',
 'Received',
 'Message-Id',
 'From',
 'Organization',
 'X-Mailer',
 'X-Accept-Language',
 'MIME-Version',
 'To',
 'Subject',
 'Content-Type',
 'Content-Transfer-Encoding',
 'Sender',
 'Errors-To',
 'X-Beenthere',
 'X-Mailman-Version',
 'Precedence',
 'List-Help',
 'List-Post',
 'List-Subscribe',
 'List-Id',
 'List-Unsubscribe',
 'List-Archive',
 'Date']

In [52]:
for part in some_email.keys():
    some_email.__delitem__(part)
print(some_email)


I'm not up to forking the text, but for your entertainment:

http://www.kanga.nu/~claw/bug_count.html
-- 
#ken	P-)}

Ken Coar, Sanagendamgagwedweinini  http://Golux.Com/coar/
Author, developer, opinionist      http://Apache-Server.Com/

"Millennium hand and shrimp!"
http://xent.com/mailman/listinfo/fork



