In [1]:
"""
Shows SpamAssassin Python Wrapper at work.

https://pypi.org/project/spamassassin-client/


- SETUP: -
1) Have spamassassin installed
    > sudo apt-get install spamassassin
2) > pip install spamassassin_client
3) Start spamassassin server
    > sudo spamd
4) Use spamassassin_client

- NOTE: -
REMEMBER TO SHUT DOWN SPAMD SERVER WHEN DONE

"""
import os
from spamassassin_client import SpamAssassin
import pandas as pd
import numpy as np
import util
import re

%load_ext autoreload
%autoreload 2

### Forming any text into a valid email, including headers and signatures
This is so that SpamAssassin's pesky header and signature rules can be ignored.

In [2]:
# Read both corpora from disk.
lingData = pd.read_csv('data/csvs/lingSpam.csv')
enronData = pd.read_csv('data/csvs/enronSpam.csv')

# The smaller corpus determines how many rows are kept from each.
lingRows, enronRows = len(lingData), len(enronData)
minRows = min(lingRows, enronRows)
print(minRows)
print(lingRows, enronRows)

# Truncate both frames to the same number of leading rows.
enronData = enronData.head(minRows)
print(enronData.shape)
lingData = lingData.head(minRows)
print(lingData.shape)

2605
2605 10000
(2605, 4)
(2605, 3)


In [3]:
# Stack both corpora, keep only the text and label columns, normalise the
# column names to lower case, then shuffle the rows.
# A fixed random_state makes the shuffle deterministic, so the subset picked
# by later cells (and every metric computed from it) is reproducible after a
# kernel restart.
combinedData = (
    pd.concat([lingData, enronData], ignore_index=True)
    .loc[:, ['Body', 'Label']]
    .rename(columns={'Body': 'body', 'Label': 'label'})
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
combinedData


Unnamed: 0,body,label
0,"Subject: get up to $ 353 , 327\n are you ready...",1
1,"Subject: canadian english\n \n hello , i would...",0
2,"Subject: avail for review : syntax , chomsky\n...",0
3,Subject: \n html\n body\n pfont face = arialbu...,1
4,Subject: fortune award winning final notificat...,1
...,...,...
5205,"Subject: q : "" english only ""\n \n content - l...",0
5206,Subject: sociolinguistics symposium\n \n remin...,0
5207,Subject: here is how to send your own bulk ema...,1
5208,Subject: language resources & evaluation works...,0


In [4]:
# Build a balanced evaluation set: the first `classSize` spam rows (label 1)
# and the first `classSize` ham rows (label 0) of the combined frame.
classSize = 1000
spam = combinedData[combinedData.label == 1][:classSize]
ham = combinedData[combinedData.label == 0][:classSize]
print(spam.shape, ham.shape)

# Slicing never raises when a class has fewer than `classSize` rows, so check
# explicitly; otherwise the "balanced" set would be silently skewed.
assert len(spam) == classSize and len(ham) == classSize, \
    "each class must contain at least classSize rows"

# Interleave the two classes; a fixed random_state keeps the order reproducible.
combinedData = (
    pd.concat([spam, ham])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
combinedData.shape

(1000, 2) (1000, 2)


(2000, 2)

In [5]:
# Pull the message texts and their 0/1 labels out as two aligned Series.
msgs = combinedData['body']
labels = combinedData['label']
msgs, labels

(0       Subject: strengthen your marriage or relations...
 1       Subject: re :\n for\n immediate release\n cal ...
 2       Subject: 3 . 405 languages , citation\n \n let...
 3       Subject: language and legislation conference\n...
 4         Subject: women , something to rock your world\n
                               ...                        
 1995    Subject: re : 6 . 959 , disc : he / she\n \n m...
 1996    Subject: defend yourself against criminals\n \...
 1997    Subject: limited time reservation .\n \n the m...
 1998    Subject: the reason to shop online\n the inter...
 1999    Subject: here ' s a hot piay in motion\n " sto...
 Name: body, Length: 2000, dtype: object,
 0       1
 1       1
 2       0
 3       0
 4       1
        ..
 1995    0
 1996    1
 1997    1
 1998    1
 1999    1
 Name: label, Length: 2000, dtype: int64)

In [6]:
def extractSubjectBody(text):
    """Split a raw message into its subject and body.

    The first line is checked (case-insensitively) for a leading
    ``subject:`` prefix.

    Returns:
        (subject, body) with both parts stripped of surrounding whitespace
        when the first line carries a subject; otherwise (None, text.strip()).
    """
    first_line, _, remainder = text.partition('\n')
    found = re.match('^subject:(.*)', first_line, re.IGNORECASE)
    if not found:
        # No subject line: the whole text is treated as the body.
        return None, text.strip()
    return found.group(1).strip(), remainder.strip()

In [7]:
# Fixed set of well-formed mail headers, one per line and without a trailing
# newline. The `{subject}` placeholder is filled in through str.format.
CLEAN_HEADER = '\n'.join([
    "Subject: {subject}",
    "Message-ID: <GTUBE1.1010101@example.net>",
    "Date: Wed, 23 Jul 2003 23:30:00 +0200",
    "From: Sender <sender@example.net>",
    "To: Recipient <recipient@example.net>",
    "MIME-Version: 1.0",
    "Content-Type: text/plain; charset=us-ascii",
    "Content-Transfer-Encoding: 7bit",
])

def add_clean_header(text):
    """Wrap a raw message in the fixed CLEAN_HEADER block.

    The subject found by extractSubjectBody is placed in the Subject header;
    when no subject line is found, "Placeholder Subject" is used instead.

    Returns:
        The formatted header, a blank line, then the message body.
    """
    subj, body = extractSubjectBody(text)
    subject_text = "Placeholder Subject" if subj is None else subj
    return CLEAN_HEADER.format(subject=subject_text) + '\n\n' + body

In [8]:
# we could just take emails we classify correctly here and use them to run our experiments

def run_spam_assassin(msgs, labels):
    """Classify every message with SpamAssassin.

    Each message is wrapped with add_clean_header, encoded as UTF-8 and
    handed to the SpamAssassin client; its is_spam() verdict is recorded as
    1 (spam) or 0 (not spam).

    Args:
        msgs: iterable of raw message texts.
        labels: iterable of true labels, paired with `msgs` by position.

    Returns:
        (predictions, actual): two lists of equal length holding the
        predicted 0/1 values and the corresponding true labels.
    """
    predictions, actual = [], []
    for raw_text, true_label in zip(msgs, labels):
        email_bytes = add_clean_header(raw_text).encode('utf-8')
        flagged = SpamAssassin(email_bytes).is_spam()
        predictions.append(1 if flagged else 0)
        actual.append(true_label)
    return predictions, actual

# Score the whole balanced set (per the setup notes, spamd must be running).
predictions, actual = run_spam_assassin(msgs, labels)

In [9]:
from sklearn import metrics

# Per-class precision / recall / F1 of the SpamAssassin verdicts against the
# true labels.
report = metrics.classification_report(actual, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.52      1.00      0.68      1000
           1       0.97      0.07      0.13      1000

    accuracy                           0.53      2000
   macro avg       0.74      0.53      0.41      2000
weighted avg       0.74      0.53      0.41      2000

