In [335]:
"""
Shows SpamAssassin Python Wrapper at work.

https://pypi.org/project/spamassassin-client/


- SETUP: -
1) Have spamassassin installed
    > sudo apt-get install spamassassin
2) > pip install spamassassin_client
3) Start spamassassin server
    > sudo spamd
4) Use spamassassin_client

- NOTE: -
REMEMBER TO SHUT DOWN SPAMD SERVER WHEN DONE

"""
import os
from spamassassin_client import SpamAssassin
import pandas as pd
import numpy as np
import util
import re

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Forming any text into a valid email, including headers and signatures
This is so that we can ignore SpamAssassin's pesky header and signature rules.

In [336]:
# Read both corpora from disk.
lingData = pd.read_csv('data/csvs/lingSpam.csv')
enronData = pd.read_csv('data/csvs/enronSpam.csv')

# The smaller corpus decides how many rows are kept from each one.
minRows = min(len(lingData), len(enronData))
print(minRows)
print(len(lingData), len(enronData))

# Truncate both frames to the same number of leading rows.
enronData = enronData.iloc[:minRows]
print(enronData.shape)
lingData = lingData.iloc[:minRows]
print(lingData.shape)

2605
2605 10000
(2605, 4)
(2605, 3)


In [337]:
# Combine both corpora, keep only the text and label columns under lowercase
# names, then shuffle the rows.
# A fixed random_state makes the shuffle reproducible across kernel restarts;
# without it every run produces a different row order.
combinedData = (
    pd.concat([lingData, enronData], ignore_index=True)
    .loc[:, ['Body', 'Label']]
    .rename(columns={'Body': 'body', 'Label': 'label'})
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
combinedData


Unnamed: 0,body,label
0,Subject: the fastest way to get a loan\n we ca...,1
1,Subject: back to happy and healthy life . . .\...,1
2,"Subject: you would , would you ?\n computer ho...",1
3,Subject: free live sex ! ! !\n \n * * * * * * ...,1
4,Subject: toshiba refurbished notebooks exports...,1
...,...,...
5205,Subject: secrets of the noveau rich\n \n you h...,1
5206,"Subject: generic pharmacy 8\n dear reader ,\n ...",1
5207,Subject: keep your home safe\n u . s . homeown...,1
5208,"Subject: perfect logo charset = koi 8 - r "" >\...",1


In [338]:
# get equal number of spam and ham
classSize = 1000
# Take the first `classSize` rows of each label (1 = spam, 0 = ham).
spam = combinedData[combinedData.label == 1].head(classSize)
ham = combinedData[combinedData.label == 0].head(classSize)
print(spam.shape, ham.shape)
# Interleave the two classes; the fixed random_state keeps the resulting
# order identical from run to run.
combinedData = pd.concat([spam, ham]).sample(frac=1, random_state=42).reset_index(drop=True)
combinedData.shape

(1000, 2) (1000, 2)


(2000, 2)

In [339]:
# Pull the message text and the labels out as two separate Series.
msgs = combinedData['body']
labels = combinedData['label']
msgs, labels

(0       Subject: re : here is our opportunity\n nostal...
 1       Subject: book review ( cheng ) : on the typolo...
 2       Subject: sle 98 st . andrews\n \n invitation t...
 3       Subject: favour\n attn . : russian investment ...
 4       Subject: causal connectives have presuppositio...
                               ...                        
 1995    Subject: [ n + v ] verbal compounding\n \n con...
 1996    Subject: the bible is a proven hoax\n click he...
 1997    Subject: homophones in english dialects\n \n i...
 1998    Subject: don ' t get ripped off ! things to wa...
 1999    Subject: leading in affordable healthcare . . ...
 Name: body, Length: 2000, dtype: object,
 0       1
 1       0
 2       0
 3       1
 4       0
        ..
 1995    0
 1996    1
 1997    0
 1998    1
 1999    1
 Name: label, Length: 2000, dtype: int64)

In [340]:
# extract subject from body
def extractSubjectBody(text):
    """Split a raw message into its subject and its body.

    Only the first line of ``text`` is inspected. If it starts with
    ``subject:`` (matched case-insensitively), the remainder of that line is
    the subject and every following line is the body.

    Parameters
    ----------
    text : str
        Raw message text, optionally starting with a ``Subject: ...`` line.

    Returns
    -------
    tuple
        ``(subject, body)``, both whitespace-stripped, when the first line
        carries a subject tag; ``(None, text.strip())`` otherwise.
    """
    # Only the first line matters for the subject, so split it off once
    # instead of splitting and re-joining the whole message.
    first_line, _, remainder = text.partition('\n')
    subject_match = re.match(r'^subject:(.*)', first_line, re.IGNORECASE)
    if subject_match is None:
        return None, text.strip()
    return subject_match.group(1).strip(), remainder.strip()

In [341]:
# Fixed header block placed in front of every message. Only the subject
# varies: the ``{subject}`` placeholder is filled in with str.format.
CLEAN_HEADER = "\n".join((
    "Subject: {subject}",
    "Message-ID: <GTUBE1.1010101@example.net>",
    "Date: Wed, 23 Jul 2003 23:30:00 +0200",
    "From: Sender <sender@example.net>",
    "To: Recipient <recipient@example.net>",
    "MIME-Version: 1.0",
    "Content-Type: text/plain; charset=us-ascii",
    "Content-Transfer-Encoding: 7bit",
))

def add_clean_header(text):
    """Return ``text`` rebuilt as a message with the fixed header block.

    The subject is taken from the message itself via ``extractSubjectBody``;
    when none is found, "Placeholder Subject" is used. The header block and
    the body are separated by one blank line.
    """
    found_subject, body = extractSubjectBody(text)
    subject_line = "Placeholder Subject" if found_subject is None else found_subject
    return '\n\n'.join((CLEAN_HEADER.format(subject=subject_line), body))

In [349]:
# we could just take emails we classify correctly here and use them to run our experiments
from sklearn import metrics

def run_spam_assassin(msgs, labels, lvl=3):
    """Classify each message with SpamAssassin and pair it with its label.

    Every message is wrapped with ``add_clean_header``, encoded as UTF-8 and
    handed to ``SpamAssassin``; the verdict of ``is_spam(level=lvl)`` becomes
    the prediction.

    Parameters
    ----------
    msgs : iterable of str
        Raw message texts.
    labels : iterable
        True labels, consumed in lockstep with ``msgs`` (iteration stops at
        the shorter of the two).
    lvl : int, optional
        Forwarded as ``level`` to ``is_spam`` -- presumably the score
        threshold; confirm against the spamassassin_client documentation.

    Returns
    -------
    tuple of (list, list)
        ``predictions`` (1 when flagged as spam, else 0) and ``actual``
        (the labels that were paired with a message).
    """
    predictions, actual = [], []
    for raw_text, true_label in zip(msgs, labels):
        email_bytes = add_clean_header(raw_text).encode('utf-8')
        flagged = SpamAssassin(email_bytes).is_spam(level=lvl)
        predictions.append(1 if flagged else 0)
        actual.append(true_label)
    return predictions, actual