In [89]:
"""
Shows SpamAssassin Python Wrapper at work.

https://pypi.org/project/spamassassin-client/


- SETUP: -
1) Have spamasassin installed
    > sudo apt get spamassassin
2) > pip install spamassassin_client
3) Start spamassassin server
    > sudo spamd
4) Use spamassassin_client

- NOTE: -
REMEMBER TO SHUT DOWN SPAMD SERVER WHEN DONE

"""
import os
from spamassassin_client import SpamAssassin
import pandas as pd
import numpy as np
import util
import re
import os

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [90]:
# only get plain text emails from corpus
corpus_spam_path = "data/spam_assassin_corpus/spam_2"
corpus_ham_path = "data/spam_assassin_corpus/easy_ham"

In [91]:
def is_plaintext(email):
    header = email.split("\n\n", 1)[0].strip()
    if "content-type: text/plain" in header.lower():
        return True
    return False

In [92]:
# iterate over files in spam folder
plaintext_spam = []
for filename in os.listdir(corpus_spam_path):
    with open(os.path.join(corpus_spam_path, filename), "r") as f:
        try:
            email = f.read()
            if is_plaintext(email):
                plaintext_spam.append((filename, email))
        except:
            continue
len(plaintext_spam)

347

In [93]:
# iterate over files in ham folder
plaintext_ham = []
for filename in os.listdir(corpus_ham_path):
    with open(os.path.join(corpus_ham_path, filename), "r") as f:
        try:
            email = f.read()
            if is_plaintext(email):
                plaintext_ham.append((filename, email))
        except:
            continue
len(plaintext_ham)

1891

In [94]:
CLEAN_HEADER = """Subject: {subject}
Message-ID: <GTUBE1.1010101@example.net>
Date: Wed, 23 Jul 2003 23:30:00 +0200
From: Sender <sender@example.net>
To: Recipient <recipient@example.net>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit"""

def replace_header(email):
    spl = email.split("\n\n", 1)
    header = spl[0].strip()
    body = spl[1].strip()

    subj = re.match(r'(.|\n)*subject:(.*)', header, flags = re.IGNORECASE).group(2).strip()
    return CLEAN_HEADER.format(subject=subj) + "\n\n" + body

In [98]:
cleaned_spam = [replace_header(email) for fn, email in plaintext_spam]
cleaned_ham = [replace_header(email) for fn, email in plaintext_ham]

minLen = min(min(len(cleaned_spam), len(cleaned_ham)), 100)
cleaned_spam = cleaned_spam[:minLen]
cleaned_ham = cleaned_ham[:minLen]
print(len(cleaned_spam))
print(len(cleaned_ham))

100
100


In [112]:
spam_labels = [1 for _ in cleaned_spam]
ham_labels = [0 for _ in cleaned_ham]

emails = cleaned_spam + cleaned_ham
labels = spam_labels + ham_labels

input = zip(emails, labels)
# randomly shuffle
input = np.array(list(input))
np.random.shuffle(input)
len(input)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [116]:
preds = []
acts = []
for i, (t, act) in enumerate(input):
    sa = SpamAssassin(bytes(t, encoding="utf-8"))
    pred = 1 if sa.is_spam() else 0 

    preds.append(pred)
    acts.append(int(act))

    print(f'{i+1}/{len(input)}')

1/200
2/200
3/200
4/200
5/200
6/200
7/200
8/200
9/200
10/200
11/200
12/200
13/200
14/200
15/200
16/200
17/200
18/200
19/200
20/200
21/200
22/200
23/200
24/200
25/200
26/200
27/200
28/200
29/200
30/200
31/200
32/200
33/200
34/200
35/200
36/200
37/200
38/200
39/200
40/200
41/200
42/200
43/200
44/200
45/200
46/200
47/200
48/200
49/200
50/200
51/200
52/200
53/200
54/200
55/200
56/200
57/200
58/200
59/200
60/200
61/200
62/200
63/200
64/200
65/200
66/200
67/200
68/200
69/200
70/200
71/200
72/200
73/200
74/200
75/200
76/200
77/200
78/200
79/200
80/200
81/200
82/200
83/200
84/200
85/200
86/200
87/200
88/200
89/200
90/200
91/200
92/200
93/200
94/200
95/200
96/200
97/200
98/200
99/200
100/200
101/200
102/200
103/200
104/200
105/200
106/200
107/200
108/200
109/200
110/200
111/200
112/200
113/200
114/200
115/200
116/200
117/200
118/200
119/200
120/200
121/200
122/200
123/200
124/200
125/200
126/200
127/200
128/200
129/200
130/200
131/200
132/200
133/200
134/200
135/200
136/200
137/200
138/200
139/

In [117]:
from sklearn import metrics
print(metrics.classification_report(acts, preds))

              precision    recall  f1-score   support

           0       0.56      1.00      0.72       100
           1       1.00      0.22      0.36       100

    accuracy                           0.61       200
   macro avg       0.78      0.61      0.54       200
weighted avg       0.78      0.61      0.54       200

