In [137]:
"""
Shows SpamAssassin Python Wrapper at work.

https://pypi.org/project/spamassassin-client/


- SETUP: -
1) Have SpamAssassin installed
    > sudo apt-get install spamassassin
2) > pip install spamassassin_client
3) Start spamassassin server
    > sudo spamd
4) Use spamassassin_client

- NOTE: -
REMEMBER TO SHUT DOWN SPAMD SERVER WHEN DONE

"""
from spamassassin_client import SpamAssassin
import os
import pandas as pd
import re
import os

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Target number of samples to work with for each dataset, split evenly between
# the two classes: the loading cells keep the first numSamples // 2 rows with
# label 1 and the first numSamples // 2 rows with label 0. A dataset that has
# fewer rows than that in one class therefore yields fewer than numSamples
# samples in total.
numSamples = 1000

In [138]:
def run_sa(inputs, sa_lvl=3):
    """Classify every (text, label) pair in ``inputs`` with SpamAssassin.

    Each text is UTF-8 encoded and handed to a ``SpamAssassin`` client object.
    The prediction is 1 when ``is_spam(level=sa_lvl)`` is truthy and 0
    otherwise. After each message a running "i/total" counter is printed,
    space-separated on a single line.

    Args:
        inputs: Sized sequence of ``(text, label)`` pairs (``len()`` is used
            for the progress counter). ``text`` must be a ``str``.
        sa_lvl: Value forwarded as ``level`` to ``SpamAssassin.is_spam``.
            Defaults to 3 because, per the original author's note, the
            problem model doesn't consider headers.

    Returns:
        tuple[list, list]: ``(predictions, actual)`` - the 0/1 predictions and
        the labels taken from ``inputs``, both in input order.
    """
    predictions, actual = [], []

    for position, (text, label) in enumerate(inputs, start=1):
        payload = bytes(text, 'utf-8')
        flagged = SpamAssassin(payload).is_spam(level=sa_lvl)

        predictions.append(int(bool(flagged)))
        actual.append(label)

        # Progress counter; end=' ' keeps all counters on one output line.
        print(f"{position}/{len(inputs)}", end=' ')

    return predictions, actual

In [139]:
from sklearn import metrics
def evaluate(inputs, sa_lvl=3):
    """Run SpamAssassin over ``inputs`` and print scikit-learn metrics.

    Delegates classification to ``run_sa`` and then prints, in this order,
    ``metrics.classification_report`` and ``metrics.confusion_matrix``, both
    called with the true labels first and the predictions second.

    Args:
        inputs: ``(text, label)`` pairs, passed straight through to ``run_sa``.
        sa_lvl: Threshold passed straight through to ``run_sa``.

    Returns:
        None. The results are only printed.
    """
    predicted, expected = run_sa(inputs, sa_lvl)

    report = metrics.classification_report(expected, predicted)
    print(report)

    matrix = metrics.confusion_matrix(expected, predicted)
    print(matrix)

**Enron**

In [141]:
# Load the pre-formatted Enron corpus.
enron_formatted = pd.read_csv('data/formattedData/enronFormatted.csv')

# Keep the first numSamples // 2 rows of each class (label 1 -> spam, label 0 -> ham).
enron_formatted_spam = enron_formatted.loc[enron_formatted['label'] == 1].head(numSamples // 2)
enron_formatted_ham = enron_formatted.loc[enron_formatted['label'] == 0].head(numSamples // 2)

# Flatten to plain tuples (one per row, spam rows first) - the input shape run_sa iterates over.
enron_formatted = [
    row
    for row in pd.concat([enron_formatted_spam, enron_formatted_ham]).itertuples(index=False, name=None)
]

In [142]:
# Classify the Enron sample with SpamAssassin and print the classification
# report and confusion matrix (evaluate prints; it returns nothing).
print("--- ENRON FORMATTED ---")
evaluate(enron_formatted)

--- ENRON FORMATTED ---
1/1000 2/1000 3/1000 4/1000 5/1000 6/1000 7/1000 8/1000 9/1000 10/1000 11/1000 12/1000 13/1000 14/1000 15/1000 16/1000 17/1000 18/1000 19/1000 20/1000 21/1000 22/1000 23/1000 24/1000 25/1000 26/1000 27/1000 28/1000 29/1000 30/1000 31/1000 32/1000 33/1000 34/1000 35/1000 36/1000 37/1000 38/1000 39/1000 40/1000 41/1000 42/1000 43/1000 44/1000 45/1000 46/1000 47/1000 48/1000 49/1000 50/1000 51/1000 52/1000 53/1000 54/1000 55/1000 56/1000 57/1000 58/1000 59/1000 60/1000 61/1000 62/1000 63/1000 64/1000 65/1000 66/1000 67/1000 68/1000 69/1000 70/1000 71/1000 72/1000 73/1000 74/1000 75/1000 76/1000 77/1000 78/1000 79/1000 80/1000 81/1000 82/1000 83/1000 84/1000 85/1000 86/1000 87/1000 88/1000 89/1000 90/1000 91/1000 92/1000 93/1000 94/1000 95/1000 96/1000 97/1000 98/1000 99/1000 100/1000 101/1000 102/1000 103/1000 104/1000 105/1000 106/1000 107/1000 108/1000 109/1000 110/1000 111/1000 112/1000 113/1000 114/1000 115/1000 116/1000 117/1000 118/1000 119/1000 120/1000 121/

**LING**

In [143]:
# Load the pre-formatted Ling corpus.
ling_formatted = pd.read_csv('data/formattedData/lingFormatted.csv')

# Keep the first numSamples // 2 rows of each class (label 1 -> spam, label 0 -> ham).
ling_formatted_spam = ling_formatted.loc[ling_formatted['label'] == 1].head(numSamples // 2)
ling_formatted_ham = ling_formatted.loc[ling_formatted['label'] == 0].head(numSamples // 2)

# Flatten to plain tuples (one per row, spam rows first) - the input shape run_sa iterates over.
ling_formatted = [
    row
    for row in pd.concat([ling_formatted_spam, ling_formatted_ham]).itertuples(index=False, name=None)
]

In [145]:
# Classify the Ling sample with SpamAssassin and print the classification
# report and confusion matrix (evaluate prints; it returns nothing).
print("--- LING FORMATTED ---")
evaluate(ling_formatted)

--- LING FORMATTED ---
1/933 2/933 3/933 4/933 5/933 6/933 7/933 8/933 9/933 10/933 11/933 12/933 13/933 14/933 15/933 16/933 17/933 18/933 19/933 20/933 21/933 22/933 23/933 24/933 25/933 26/933 27/933 28/933 29/933 30/933 31/933 32/933 33/933 34/933 35/933 36/933 37/933 38/933 39/933 40/933 41/933 42/933 43/933 44/933 45/933 46/933 47/933 48/933 49/933 50/933 51/933 52/933 53/933 54/933 55/933 56/933 57/933 58/933 59/933 60/933 61/933 62/933 63/933 64/933 65/933 66/933 67/933 68/933 69/933 70/933 71/933 72/933 73/933 74/933 75/933 76/933 77/933 78/933 79/933 80/933 81/933 82/933 83/933 84/933 85/933 86/933 87/933 88/933 89/933 90/933 91/933 92/933 93/933 94/933 95/933 96/933 97/933 98/933 99/933 100/933 101/933 102/933 103/933 104/933 105/933 106/933 107/933 108/933 109/933 110/933 111/933 112/933 113/933 114/933 115/933 116/933 117/933 118/933 119/933 120/933 121/933 122/933 123/933 124/933 125/933 126/933 127/933 128/933 129/933 130/933 131/933 132/933 133/933 134/933 135/933 136/9

**SPAM ASSASSIN CORPUS**

Next, we build a dataset of spam emails that SpamAssassin already catches reliably. We will then apply our attacks to this dataset and compare its accuracy before and after the attack.

# sample equal number of emails from each