In [3]:
from os import makedirs, path, remove, rename, rmdir
from tarfile import open as open_tar
from urllib import request, parse


def download_corpus(dataset_dir: str = 'data'):
    base_url = 'https://spamassassin.apache.org'
    corpus_path = 'old/publiccorpus'
    files = {
        '20021010_easy_ham.tar.bz2': 'ham',
        '20021010_hard_ham.tar.bz2': 'ham',
        '20021010_spam.tar.bz2': 'spam',
        '20030228_easy_ham.tar.bz2': 'ham',
        '20030228_easy_ham_2.tar.bz2': 'ham',
        '20030228_hard_ham.tar.bz2': 'ham',
        '20030228_spam.tar.bz2': 'spam',
        '20030228_spam_2.tar.bz2': 'spam',
        '20050311_spam_2.tar.bz2': 'spam' 
    }
    
    downloads_dir = path.join(dataset_dir, 'downloads')
    ham_dir = path.join(dataset_dir, 'ham')
    spam_dir = path.join(dataset_dir, 'spam')


    makedirs(downloads_dir, exist_ok=True)
    makedirs(ham_dir, exist_ok=True)
    makedirs(spam_dir, exist_ok=True)
    
    for file, spam_or_ham in files.items():
        # download file
        url = parse.urljoin(base_url, f'{corpus_path}/{file}')
        tar_filename = path.join(downloads_dir, file)
        request.urlretrieve(url, tar_filename)
        
        # list e-mails in compressed file
        emails = []
        with open_tar(tar_filename) as tar:
            tar.extractall(path=downloads_dir)
            for tarinfo in tar:
                if len(tarinfo.name.split('/')) > 1:
                    emails.append(tarinfo.name)
        
        # move e-mails to ham or spam dir
        for email in emails:
            directory, filename = email.split('/')
            directory = path.join(downloads_dir, directory)
            try:
                rename(path.join(directory, filename), path.join(dataset_dir, spam_or_ham, filename))
            except:
                pass
        
        #rmdir(directory)


download_corpus()

In [5]:
from glob import glob
from os import path
ham_dir = path.join('data', 'ham')
spam_dir = path.join('data', 'spam')
print('hams:', len(glob(f'{ham_dir}/*')))  # hams: 6952
print('spams:', len(glob(f'{spam_dir}/*')))  # spams: 2399

hams: 6951
spams: 2398


In [6]:
from re import sub
class SimpleEmail:
    def __init__(self, subject: str, body: str):
        self.subject = subject
        self.body = body
    
    @property
    def clean(self):
        sanitizer = '[^A-Za-z]+'
        clean = sub(sanitizer, ' ', f'{self.subject} {self.body}')
        clean = clean.lower()
        return sub('\s+', ' ', clean)
    
    def __str__(self):
        subject = f'subject: {self.subject}'
        body_first_line = self.body.split('\n')[0]
        body = f'body: {body_first_line}...'
        return f'{subject}\n{body}'
    def __repr__(self):
        return self.__str__()

In [7]:
from email import message_from_file
from glob import glob
class EmailIterator:
    def __init__(self, directory: str):
        self._files = glob(f'{directory}/*')
        self._pos = 0
    
    def __iter__(self):
        self._pos = -1
        return self
    
    def __next__(self):
        if self._pos < len(self._files) - 1:
            self._pos += 1
            return self.parse_email(self._files[self._pos])
        raise StopIteration()
    
    @staticmethod
    def parse_email(filename: str) -> SimpleEmail:
        with open(filename,
                  encoding='utf-8',
                  errors='replace') as fp:
            message = message_from_file(fp)
        
        subject = None
        for item in message.raw_items():
            if item[0] == 'Subject':
                subject = item[1]
        
        if message.is_multipart():
            body = []
            for b in message.get_payload():
                body.append(str(b))
            body = '\n'.join(body)
        else:
            body = message.get_payload()
        
        return SimpleEmail(subject, body)

In [8]:
import numpy as np
ham_emails = EmailIterator('data/ham')
spam_emails = EmailIterator('data/spam')
hams = np.array([email.clean for email in ham_emails])
spams = np.array([email.clean for email in spam_emails])

In [23]:
import pandas as pd
hams_df = pd.DataFrame(hams)
hams_df['Body'] = hams_df[0]
hams_df['Label'] = 0
hams_df = hams_df.drop([0], axis=1)
hams_df

Unnamed: 0,Body,Label
0,re new sequences window date tue aug from chri...,0
1,re new sequences window date wed aug from chri...,0
2,personal finance resolutions you can keep the ...,0
3,re new sequences window content type text plai...,0
4,zzzzteana re alexander martin a posted tassos...,0
...,...,...
6946,gene technique reveals human evolution url htt...,0
6947,go ahead for new style hospitals url http www ...,0
6948,malicious code hidden in email software url ht...,0
6949,flexible retirement gains ground url http www...,0


In [24]:
import pandas as pd
spams_df = pd.DataFrame(spams)
spams_df['Body'] = spams_df[0]
spams_df['Label'] = 1
spams_df = spams_df.drop([0], axis=1)
spams_df

Unnamed: 0,Body,Label
0,none mv bfc d d b ff cca d b mv f fb c eb efc ...,1
1,ilug stop the mlm insanity greetings you are ...,1
2,life insurance why pay more doctype html publi...,1
3,real protection stun guns free shipping time p...,1
4,ilug guaranteed to lose lbs in days fight the...,1
...,...,...
2393,see your company sales sky rocket there is no ...,1
2394,hit the road with cna content type text plain ...,1
2395,a hour for watching e mmercials no joke html ...,1
2396,make a fortune on ebay html body tr valign d t...,1


In [29]:
emails_df = pd.concat([spams_df, hams_df], ignore_index=True)
emails_df

Unnamed: 0,Body,Label
0,none mv bfc d d b ff cca d b mv f fb c eb efc ...,1
1,ilug stop the mlm insanity greetings you are ...,1
2,life insurance why pay more doctype html publi...,1
3,real protection stun guns free shipping time p...,1
4,ilug guaranteed to lose lbs in days fight the...,1
...,...,...
9344,gene technique reveals human evolution url htt...,0
9345,go ahead for new style hospitals url http www ...,0
9346,malicious code hidden in email software url ht...,0
9347,flexible retirement gains ground url http www...,0


In [30]:
emails_df.to_csv('spam_assassin.csv', index=False)