In [1]:
import numpy as np
import pandas as pd
import os
import tarfile
import urllib
# Location of the SpamAssassin public corpus archives and where they are stored locally.
DOWNLOAD_ROOT = 'https://spamassassin.apache.org/old/publiccorpus/'
SPAM_URL = DOWNLOAD_ROOT + '20030228_spam.tar.bz2'
HAM_URL = DOWNLOAD_ROOT + '20030228_easy_ham.tar.bz2'
SPAM_PATH = os.path.join('datasets', 'spam')


def _download_and_extract(url, archive_path, extract_dir):
    """Download ``url`` to ``archive_path`` (unless it is already there) and unpack it.

    Args:
        url: Location of a tar archive (any scheme ``urlretrieve`` understands).
        archive_path: Local file name for the downloaded archive.
        extract_dir: Directory the archive members are extracted into.
    """
    # `import urllib` alone does not load the `request` submodule, so import it
    # explicitly instead of relying on another library having done so.
    import urllib.request

    if not os.path.isfile(archive_path):
        # Download under a temporary name so an interrupted transfer is never
        # mistaken for a complete archive on the next run.
        partial_path = archive_path + '.part'
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, archive_path)

    with tarfile.open(archive_path) as archive:
        if hasattr(tarfile, 'data_filter'):
            # Reject absolute paths / links escaping extract_dir where the
            # running Python supports extraction filters.
            archive.extractall(path = extract_dir, filter = 'data')
        else:
            archive.extractall(path = extract_dir)


def fetch_spam_data(spam_url = SPAM_URL, ham_url = HAM_URL, spam_path = SPAM_PATH):
    """Fetch the ham and spam archives and extract both into ``spam_path``.

    The archives are stored as ``ham.tar.bz2`` and ``spam.tar.bz2`` inside
    ``spam_path``. An archive that is already present is not downloaded again,
    so re-running the notebook does not hit the network; delete the file to
    force a fresh download.

    Args:
        spam_url: URL of the spam archive.
        ham_url: URL of the ham archive.
        spam_path: Directory used for both the archives and their contents;
            created if missing.
    """
    os.makedirs(spam_path, exist_ok = True)
    # Ham first, then spam, matching the original download order.
    for archive_name, url in (('ham.tar.bz2', ham_url), ('spam.tar.bz2', spam_url)):
        _download_and_extract(url, os.path.join(spam_path, archive_name), spam_path)

In [2]:
# Fetch and unpack both corpora using the default URLs and target directory.
fetch_spam_data()

In [3]:
# Directories under SPAM_PATH that are listed below to enumerate ham and spam files.
HAM_DIR = os.path.join(SPAM_PATH, 'easy_ham')
SPAM_DIR = os.path.join(SPAM_PATH, 'spam')

In [4]:
# os.listdir returns entries in arbitrary order; sorting keeps the dataset order
# (and therefore any seeded split) identical from run to run.
# Only names longer than 20 characters are kept -- presumably to skip
# non-message entries in the corpus directories (TODO confirm).
ham_files = sorted(filename for filename in os.listdir(HAM_DIR) if len(filename) > 20)
spam_files = sorted(filename for filename in os.listdir(SPAM_DIR) if len(filename) > 20)

In [5]:
import email
import email.policy

def load_email(is_spam, filename, path = SPAM_PATH):
    """Parse one message file from the corpus into an ``EmailMessage``.

    Args:
        is_spam: True to read from the ``spam`` sub-directory, False for ``easy_ham``.
        filename: Name of the message file inside that sub-directory.
        path: Root directory containing the ``spam`` and ``easy_ham`` folders.

    Returns:
        The message parsed with ``email.policy.default``.
    """
    # Import locally under private names: `email.parser` is never imported
    # explicitly at module level, and this notebook reuses the global name
    # `email` as a loop variable further down, which would otherwise break
    # this function when it is called again.
    from email import policy as email_policy
    from email.parser import BytesParser

    subdirectory = 'spam' if is_spam else 'easy_ham'
    with open(os.path.join(path, subdirectory, filename), 'rb') as f:
        return BytesParser(policy = email_policy.default).parse(f)

In [6]:
# Parse every listed ham file into a message object.
ham_emails = [load_email(False, ham_name) for ham_name in ham_files]

In [7]:
# Parse every listed spam file into a message object.
spam_emails = [load_email(True, spam_name) for spam_name in spam_files]

In [8]:
def get_email_structure(email):
    """Describe the MIME layout of a message as a string.

    A plain string is returned unchanged. A message whose payload is a list of
    sub-messages yields ``multipart(<child>, <child>, ...)`` built recursively;
    any other message yields its content type.
    """
    if isinstance(email, str):
        return email

    parts = email.get_payload()
    if not isinstance(parts, list):
        # Leaf part: its content type is the whole description.
        return email.get_content_type()

    described_parts = ", ".join(get_email_structure(part) for part in parts)
    return "multipart({})".format(described_parts)

In [9]:
from collections import Counter
def structure_counter(emails):
    """Tally how many of ``emails`` share each structure string.

    Returns a ``Counter`` keyed by the value of ``get_email_structure`` for
    each message, in order of first appearance.
    """
    return Counter(get_email_structure(message) for message in emails)

In [10]:
# Structures found among the ham messages, most frequent first.
structure_counter(ham_emails).most_common()

[('text/plain', 2408),
 ('multipart(text/plain, application/pgp-signature)', 66),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [11]:
# Number of ham messages loaded.
len(ham_emails)

2500

In [12]:
from sklearn.model_selection import train_test_split
# NOTE: despite their names these hold the FULL dataset, not a train/test split:
# `train` is every message (ham first, then spam) and `test` is the matching
# label vector (0 = ham, 1 = spam). The actual split happens in a later cell.
train = np.array(ham_emails + spam_emails, dtype = 'object')
test = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

In [13]:
# Peek at the first element of the combined message array.
train[0]

<email.message.EmailMessage at 0x21643462d10>

In [14]:
from sklearn.model_selection import StratifiedShuffleSplit
# A single, seeded stratified split. The default StratifiedShuffleSplit()
# generates 10 unseeded splits, of which the original loop kept only the last,
# so the train/test partition changed on every run. test_size=0.1 is the value
# the defaults resolve to, so the split proportions are unchanged.
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.1, random_state = 42)
train_index, test_index = next(split.split(train, test))
X_train = train[train_index]
X_test = train[test_index]
y_train = test[train_index]
y_test = test[test_index]

In [15]:
# Size of the held-out label vector.
y_test.shape

(300,)

In [16]:
import html
import re
from html import unescape
def html_to_text(html):
    """Reduce an HTML document to rough plain text.

    Steps, in order: drop the ``<head>`` section, replace every opening
    ``<a ...>`` tag with the token `` HYPERLINK ``, strip all remaining tags,
    collapse runs of blank/whitespace-only lines into a single newline, and
    finally decode HTML entities.

    Args:
        html: The HTML source as a string.

    Returns:
        The extracted text.
    """
    # Raw strings throughout: '\s' in a normal string literal is an invalid
    # escape sequence and triggers a warning on current Python versions.
    text = re.sub(r'<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub(r'<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub(r'<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

In [17]:
# Training spam messages whose whole structure is a single text/html part.
# A comprehension keeps the loop variable local: a top-level `for email in ...`
# would rebind the global name `email`, which load_email relies on being the
# standard-library module.
html_spam_emails = [
    message
    for message in X_train[y_train == 1]
    if get_email_structure(message) == 'text/html'
]

In [18]:
# Pick one HTML spam message (a single message, despite the plural name) to inspect.
some_html_spams = html_spam_emails[2]

In [19]:
# Show the selected message object.
some_html_spams

<email.message.EmailMessage at 0x21655ba5270>

In [20]:
# Plain-text rendering of the selected HTML spam message.
print(html_to_text(some_html_spams.get_content()))


Ou Wei Lighting, Nights Will Be Lightening!!!
  ��������������������
  ����������
����������������������������������������������������������������������������������������������������������
����������
������������ ������������������������������ ��������������
��������������������������
������������������������������������������������
  ���������������������������������������������������� HYPERLINK http://www.ouweilighting.com
  ����������������������������������
����:0760-2312136
����:0760-2317010
Email:ouwei@ouweilighting.com
����:����������������������������
������
�� ��������
OU WEI LIGHTING, NIGHTS WILL BE LIGHTENED��
Brief Introduction
Since founded in 1996, Ou Wei lighting Co.,ltd. has succeed in manufacturing
  the Projection Lighting, Ceiling&Hanging Lighting, Inlaid Lighting. The
  products have stepped to professionalizing and seriesing development , applying
  to the lighting of the markets, hotels, office buildings, house rooms��engineerings
  and projects.
We are expecting

In [21]:
def email_to_text(email):
    """Extract the text of a message, preferring plain text over HTML.

    Every part of the message is visited. The first ``text/plain`` part found
    is returned as-is. If the message has no plain-text part, the first
    non-empty ``text/html`` part is converted with ``html_to_text``.

    Args:
        email: A parsed message exposing ``walk()``.

    Returns:
        The text as a string, or ``None`` when the message has neither a
        ``text/plain`` nor a non-empty ``text/html`` part.
    """
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if ctype not in ('text/plain', 'text/html'):
            continue
        try:
            content = part.get_content()
        except Exception:
            # Decoding failed (e.g. unknown charset): fall back to the raw payload.
            content = part.get_payload()
        if ctype == 'text/plain':
            return content
        if not html:
            html = content
    # Decide on the HTML fallback only after ALL parts were inspected; doing it
    # inside the loop returned the HTML even when a plain-text part followed.
    if html:
        return html_to_text(html)
    return None

In [47]:
import urlextract

In [48]:
import nltk
# Module-level Porter stemmer; EmailToWordCountTransformer.transform reads this global.
stemmer = nltk.PorterStemmer()
# Quick sanity check of the stemmer's output.
stemmer.stem('computer')

'comput'

In [49]:
from sklearn.base import BaseEstimator, TransformerMixin
class EmailToWordCountTransformer (BaseEstimator, TransformerMixin):
    """Turn each email into a ``Counter`` of word occurrences.

    Each message is reduced to text with ``email_to_text`` and then, depending
    on the flags, lower-cased, has URLs replaced by the token ``URLs``, digit
    runs replaced by ``NUMBER``, non-word characters collapsed to spaces, and
    its words stemmed with the module-level ``stemmer``.

    Parameters:
        strip_header: Stored for configuration; not consulted by ``transform``.
        lowercase: Lower-case the text first.
        replace_number: Replace each run of digits with ``NUMBER``.
        replace_urls: Replace every detected URL with ``URLs``.
        strip_puncuation: Replace runs of non-word characters with one space.
        stemming: Merge counts of words that share a stem.
    """

    def __init__(self, strip_header = True, lowercase = True, replace_number = True,
                replace_urls = True, strip_puncuation = True, stemming = True):
        self.strip_header = strip_header
        self.lowercase = lowercase
        self.replace_number = replace_number
        self.replace_urls = replace_urls
        self.strip_puncuation = strip_puncuation
        self.stemming = stemming

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y = None):
        """Return an array holding one word-count ``Counter`` per message in ``X``."""
        # Build the extractor once per call instead of once per email; its
        # construction is loop-invariant.
        url_extractor = urlextract.URLExtract() if self.replace_urls else None

        X_transformed = []
        for message in X:
            text = email_to_text(message) or ''

            if self.lowercase:
                text = text.lower()

            if url_extractor is not None:
                # Longest first, so a short URL that is a prefix/substring of a
                # longer one cannot mangle it; the secondary key makes the
                # order (and thus the output) deterministic.
                urls = sorted(set(url_extractor.find_urls(text)),
                              key = lambda url: (-len(url), url))
                for url in urls:
                    text = text.replace(url, "URLs")

            if self.replace_number:
                text = re.sub(r'\d+', 'NUMBER', text, flags = re.M)

            if self.strip_puncuation:
                text = re.sub(r'\W+', ' ', text, flags = re.M)

            word_counts = Counter(text.split())
            if self.stemming:
                stemming_word_counts = Counter()
                for word, count in word_counts.items():
                    stemming_word_counts[stemmer.stem(word)] += count
                word_counts = stemming_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)
        

In [50]:
from scipy.sparse import csr_matrix
class WordCountToVectorTransformer(BaseEstimator, TransformerMixin):
    """Convert word-count mappings into a sparse count matrix.

    ``fit`` learns a vocabulary of at most ``vocabulary_limit`` words, numbered
    from 1 in order of decreasing capped frequency. ``transform`` emits one row
    per input mapping with ``vocabulary_limit + 1`` columns; column 0 collects
    the counts of every word outside the vocabulary.
    """

    def __init__(self, vocabulary_limit = 1000):
        self.vocabulary_limit = vocabulary_limit

    def fit(self, X, y = None):
        """Learn ``vocabulary_`` from the word counts in ``X``; returns ``self``."""
        capped_totals = Counter()
        for counts in X:
            # A word contributes at most 10 per document to its total.
            capped_totals.update({word: min(count, 10) for word, count in counts.items()})
        top_words = capped_totals.most_common()[:self.vocabulary_limit]
        self.vocabulary_ = {word: rank for rank, (word, _) in enumerate(top_words, start = 1)}
        return self

    def transform(self, X, y = None):
        """Return a CSR matrix of shape ``(len(X), vocabulary_limit + 1)``."""
        row_indices, col_indices, values = [], [], []
        for row_number, counts in enumerate(X):
            for word, count in counts.items():
                row_indices.append(row_number)
                # Unknown words all land in column 0.
                col_indices.append(self.vocabulary_.get(word, 0))
                values.append(count)
        matrix_shape = (len(X), self.vocabulary_limit + 1)
        return csr_matrix((values, (row_indices, col_indices)), shape = matrix_shape)

In [51]:
# Try the word-count transformer on the first three training messages.
X_few = X_train[:3]
X_few_wordcounts = EmailToWordCountTransformer().fit_transform(X_few)
len(X_few_wordcounts)

3

In [52]:
# Vectorise the three sample word counts with a 100-word vocabulary.
vocab_transformer = WordCountToVectorTransformer(vocabulary_limit=100)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors

<3x101 sparse matrix of type '<class 'numpy.intc'>'
	with 143 stored elements in Compressed Sparse Row format>

In [53]:
# Column 0 holds, per message, the total count of words outside the vocabulary.
X_few_vectors.toarray()[:,0]

array([187,  30,  28], dtype=int32)

In [57]:
from sklearn.pipeline import Pipeline
# Chain both transformers: messages -> word counts -> sparse vectors (1000-word vocabulary).
preprocessing_pipeline = Pipeline([('email_to_wordcount_transformer',
                                    EmailToWordCountTransformer()),
                                  ('wordcount_to_vector_transformer', 
                                   WordCountToVectorTransformer(vocabulary_limit = 1000))])

In [None]:
# Fit the preprocessing steps on the training messages and vectorise them.
X_train_transformed = preprocessing_pipeline.fit_transform(X_train)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# 10-fold cross-validated accuracy of a logistic regression on the training matrix.
logistic_reg = LogisticRegression(solver = 'lbfgs', max_iter = 100, random_state = 43)
accuracy = cross_val_score(logistic_reg, X_train_transformed,
                           y_train,cv = 10,  scoring = 'accuracy')

In [None]:
# Mean accuracy over the cross-validation folds.
accuracy.mean()

In [None]:

# End-to-end pipeline: raw messages in, class predictions out.
# It wraps the SAME preprocessing_pipeline object, so fitting it refits that object too.
full_pipeline = Pipeline([('preprocessing_pipeline', preprocessing_pipeline),
                         ('logistic_regression', LogisticRegression(random_state = 43))])

full_pipeline.fit(X_train, y_train)

# Predicted labels for the held-out messages.
full_pipeline.predict(X_test)

In [None]:
from sklearn.metrics import precision_score, recall_score

In [None]:
# Vectorise the held-out messages with the already-fitted preprocessing (transform only).
X_test_transformed = preprocessing_pipeline.transform(X_test)

In [None]:
# Fit the classifier on the full training matrix and predict the held-out labels.
logistic_reg.fit(X_train_transformed, y_train)
y_pred = logistic_reg.predict(X_test_transformed)

In [None]:
# Precision and recall of the held-out predictions (label typo "PRECISON" corrected).
print(f"PRECISION : {precision_score(y_test, y_pred)} \nRECALL : {recall_score(y_test, y_pred)}")

In [None]:
# Number of held-out messages predicted as spam (label 1).
y_pred.sum()

In [None]:
# Number of held-out messages that actually are spam (label 1).
y_test.sum()

In [None]:
# Accuracy of the classifier fitted on the training set, measured on the
# held-out set. cross_val_score would instead clone the estimator and refit it
# on folds of the TEST data, which does not evaluate the trained model at all.
# Wrapped in an array so that `accuracy_test.mean()` keeps working.
accuracy_test = np.asarray(logistic_reg.score(X_test_transformed, y_test))

In [None]:
import pickle
# Persist the fitted end-to-end pipeline; the context manager guarantees the
# file is flushed and closed even if pickling fails.
with open('email_spam.pickle', 'wb') as pickle_file:
    pickle.dump(full_pipeline, pickle_file)

In [None]:
# Mean of the held-out accuracy value(s).
accuracy_test.mean()

In [None]:
import pickle
# `pickle.dump()` without arguments raises TypeError. Completed to match the
# only other dump in this notebook: save the fitted pipeline to disk.
# NOTE(review): confirm this is the object/file that was meant here.
with open('email_spam.pickle', 'wb') as pickle_file:
    pickle.dump(full_pipeline, pickle_file)