In [1]:
import os
import pickle
import email_read_util

## Download 2007 TREC Public Spam Corpus
1. Read the "Agreement for use"
   https://plg.uwaterloo.ca/~gvcormac/treccorpus07/

2. Download 255 MB Corpus (trec07p.tgz) and untar into the 'chapter1/datasets' directory

3. Check that the below paths for 'DATA_DIR' and 'LABELS_FILE' exist

In [2]:
DATA_DIR = 'datasets/trec07p/data/'
LABELS_FILE = 'datasets/trec07p/full/index'
TRAINING_SET_RATIO = 0.7

In [3]:
labels = {}
spam_words = set()
ham_words = set()

In [4]:
# Read the labels
with open(LABELS_FILE) as f:
    for line in f:
        line = line.strip()
        label, key = line.split()
        labels[key.split('/')[-1]] = 1 if label.lower() == 'ham' else 0

In [5]:
# Split corpus into train and test sets
filelist = os.listdir(DATA_DIR)
X_train = filelist[:int(len(filelist)*TRAINING_SET_RATIO)]
X_test = filelist[int(len(filelist)*TRAINING_SET_RATIO):]

In [6]:
if not os.path.exists('blacklist.pkl'):
    for filename in X_train:
        path = os.path.join(DATA_DIR, filename)
        if filename in labels:
            label = labels[filename]
            stems = email_read_util.load(path)
            if not stems:
                continue
            if label == 1:
                ham_words.update(stems)
            elif label == 0:
                spam_words.update(stems)
            else:
                continue
    blacklist = spam_words - ham_words
    pickle.dump(blacklist, open('blacklist.pkl', 'wb'))
else:
    blacklist = pickle.load(open('blacklist.pkl', 'rb') )

print('Blacklist of {} tokens successfully built/loaded'.format(len(blacklist)))

Blacklist of 97939 tokens successfully built/loaded


In [7]:
from nltk.corpus import words
word_set = set(words.words())
word_set.intersection(blacklist)

{'pectora',
 'sleet',
 'soma',
 'sorb',
 'raglan',
 'pluma',
 'thrower',
 'ducal',
 'vatman',
 'biaxial',
 'choral',
 'muzz',
 'merk',
 'degum',
 'lino',
 'punctual',
 'zig',
 'whopper',
 'saunter',
 'commot',
 'pian',
 'enchant',
 'starlit',
 'handmaid',
 'matchbook',
 'chil',
 'prote',
 'cush',
 'feme',
 'cerulean',
 'flamenco',
 'fie',
 'adroit',
 'calor',
 'electrician',
 'batik',
 'depositor',
 'ammonia',
 'dor',
 'humph',
 'throb',
 'osteosarcoma',
 'phylum',
 'staphylococci',
 'redub',
 'romaunt',
 'castrum',
 'billyboy',
 'snug',
 'phosphor',
 'caroli',
 'sweatband',
 'sequent',
 'flutter',
 'abac',
 'subjunct',
 'matriarch',
 'trisect',
 'hazelwood',
 'crept',
 'sienna',
 'camelopard',
 'austral',
 'amaranth',
 'embank',
 'briar',
 'fishtail',
 'shou',
 'chessboard',
 'reman',
 'ponent',
 'coxa',
 'ornithologist',
 'colter',
 'ammono',
 'bajada',
 'heterotopia',
 'cand',
 'rainless',
 'nar',
 'polka',
 'burrow',
 'augend',
 'irrupt',
 'quarrel',
 'nothing',
 'goldfish',
 'fibr

In [8]:
fp = 0
tp = 0
fn = 0
tn = 0

for filename in X_test:
    path = os.path.join(DATA_DIR, filename)
    if filename in labels:
        label = labels[filename]
        stems = email_read_util.load(path)
        if not stems:
            continue
        stems_set = set(stems)
        if stems_set & blacklist:
            if label == 1:
                fp = fp + 1
            else:
                tp = tp + 1
        else:
            if label == 1:
                tn = tn + 1
            else:
                fn = fn + 1

In [9]:
from IPython.display import HTML, display
conf_matrix = [[tn, fp],
               [fn, tp]]
display(HTML('<table><tr>{}</tr></table>'.format(
    '</tr><tr>'.join('<td>{}</td>'.format(
        '</td><td>'.join(str(_) for _ in row)) 
                     for row in conf_matrix))))

0,1
6772,714
5835,7543


In [10]:
count = tn + tp + fn + fp
percent_matrix = [["{:.1%}".format(tn/count), "{:.1%}".format(fp/count)],
                  ["{:.1%}".format(fn/count), "{:.1%}".format(tp/count)]]
display(HTML('<table><tr>{}</tr></table>'.format(
    '</tr><tr>'.join('<td>{}</td>'.format(
        '</td><td>'.join(str(_) for _ in row)) 
                     for row in percent_matrix))))

0,1
32.5%,3.4%
28.0%,36.2%


In [11]:
print("Classification accuracy: {}".format("{:.1%}".format((tp+tn)/count)))

Classification accuracy: 68.6%
