# Spam Filtering Using [Euron's Dataset][1]
[1]: [http://www.aueb.gr/users/ion/data/enron-spam/]

In [1]:
from pymldb import Connection
mldb = Connection('http://localhost/')

Let's start by defining a function to parse, merge and import the Enron data

In [2]:
import sys, tarfile, gzip
import requests
from random import randrange, seed
from StringIO import StringIO
from pymldb import Connection

# remove control chars that mldb doesn't like at the moment (see MLDB-1630)
# http://stackoverflow.com/a/93029/1067132
import unicodedata, re
all_chars = (unichr(i) for i in xrange(0x110000))
control_chars = ''.join(map(unichr, range(0,32) + range(127,160)))
control_char_re = re.compile('[%s]' % re.escape(control_chars))

def remove_control_chars(s):
    return control_char_re.sub('', s)

seed(1234)

enron_base_url = 'http://www.aueb.gr/users/ion/data/enron-spam/preprocessed/'
enron_data_url = enron_base_url + 'enron{}.tar.gz'

def add_enron_file_to_dataset(mldb, dataset, no, max_msg=None):
    req = requests.get(enron_data_url.format(no))
    if req.status_code != 200:
        raise RuntimeError('enron files not found')
    content = StringIO(req.content)
    gz = gzip.GzipFile(fileobj=content)
    file = tarfile.TarFile(fileobj=gz)

    files = file.getnames()
    ham = sorted([f for f in files if f.endswith('.ham.txt')])
    spam = sorted([f for f in files if f.endswith('.spam.txt')])
    # We insert the spam randomly in the ham, but keeping the ordering. It
    # follows the logic from the article pointed out here:
    # http://www.aueb.gr/users/ion/data/enron-spam/readme.txt
    where_to_insert = \
        sorted([randrange(len(ham) + 1) for i in xrange(len(spam))])
    # Simply taking into account the fact that the list with get bigger every
    # time we add a new item
    where_to_insert = [x + i for i,x in enumerate(where_to_insert)]

    ham_spam = ham
    for w,s in zip(where_to_insert, spam):
        ham_spam.insert(w, s)

    for i, name in enumerate(ham_spam):
        msg = file.extractfile(name).read()
        # mldb doesn't like some funny characters, which are present in some
        # mails, so let's get rid of them
        msg = msg.decode('utf-8', 'ignore')
        msg = remove_control_chars(msg)
        
        msg = msg.replace('\r\n', '\n')
        mldb.post(dataset + '/rows', {
            'rowName': 'enron_{}_mail_{}'.format(no,i),
            'columns': [
                ['label', 'spam' if 'spam' in name else 'ham', 0],
                ['index', i, 0],
                ['msg', msg, 0],
                ['dataset', no, 0],
                ['file', name, 0]]})

        if max_msg is not None and i >= max_msg - 1:
            break

Now let's load the 1st of Enron's datasets (there are 6) into MDLB, using the function we've just defined.

In [3]:
mldb.put('/v1/datasets/enron_data', {'type': 'sparse.mutable'})
add_enron_file_to_dataset(mldb, '/v1/datasets/enron_data', 1)
mldb.post('/v1/datasets/enron_data/commit')

This is what the dataset looks like.

*index*: order in which the emails arrived in the user's inbox  
*msg*: actual content of the email  
*label*: was the email legitimate (*ham*) or not (*spam*)  

In [4]:
mldb.query('select index, msg, label from enron_data order by index limit 10')

Unnamed: 0_level_0,index,label,msg
_rowName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
enron_1_mail_0,0,spam,Subject: dobmeos with hgh my energy level has ...
enron_1_mail_1,1,spam,Subject: your prescription is ready . . oxwq s...
enron_1_mail_2,2,ham,Subject: christmas tree farm pictures
enron_1_mail_3,3,ham,"Subject: vastar resources , inc .gary , produc..."
enron_1_mail_4,4,ham,Subject: calpine daily gas nomination- calpine...
enron_1_mail_5,5,ham,Subject: re : issuefyi - see note below - alre...
enron_1_mail_6,6,ham,Subject: meter 7268 nov allocationfyi .- - - -...
enron_1_mail_7,7,spam,Subject: get that new car 8434people nowthe we...
enron_1_mail_8,8,ham,"Subject: mcmullen gas for 11 / 99jackie ,since..."
enron_1_mail_9,9,spam,"Subject: await your responsedear partner ,we a..."


Let's create a *sql.expression* that will simply tokenize the emails into a bag of words. Those will be our features on which we will train a classifier.

In [5]:
print mldb.put('/v1/functions/bow', {
    'type': 'sql.expression',
    'params': {
        'expression': """
            tokenize(msg, {splitchars: ' \n', quotechar: ''}) as bow
            """
    }
})

<Response [201]>


Then we can generate the features for the whole dataset, and write them into a new dataset, using the *transform* procedure.

In [6]:
print mldb.put('/v1/procedures/generate_feats', {
    'type': 'transform',
    'params': {
        'inputData': """
            select bow({msg}) as features, label = 'spam' as label
            from enron_data
            """,
        'outputDataset': 'enron_features',
        'runOnCreation': True
    }
})

<Response [201]>


Here is a snapshot of the sparse feature matrix:

In [7]:
mldb.query('select * from enron_features limit 10')

Unnamed: 0_level_0,"features.bow.""""""""",features.bow.(,features.bow.),"features.bow.,kathy","features.bow.,please",features.bow.-,"features.bow."".i""","features.bow."".thanks""","features.bow."".""",features.bow./,...,features.bow.rr,features.bow.signet,features.bow.some,features.bow.strings,features.bow.tell,features.bow.they,features.bow.tolerable,features.bow.tonightnomore,features.bow.umpire,features.bow.wives
_rowName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
enron_1_mail_3055,2.0,1.0,2.0,1.0,1.0,3.0,1.0,1.0,11.0,2.0,...,,,,,,,,,,
enron_1_mail_3946,,,,,1.0,61.0,,1.0,10.0,36.0,...,,,,,,,,,,
enron_1_mail_4333,,,,,,,,,,,...,,,,,,,,,,
enron_1_mail_2698,,3.0,3.0,,,8.0,,,9.0,,...,,,,,,,,,,
enron_1_mail_3003,,1.0,2.0,,,5.0,,,7.0,,...,,,,,,,,,,
enron_1_mail_1872,1.0,,,,,1.0,,,4.0,1.0,...,,,,,,,,,,
enron_1_mail_2515,,,,,,49.0,,,7.0,6.0,...,,,,,,,,,,
enron_1_mail_2136,,2.0,2.0,,,2.0,2.0,,10.0,1.0,...,,,,,,,,,,
enron_1_mail_3483,,,,,,,,,3.0,,...,,,,,,,,,,
enron_1_mail_4482,,,,,,,,,6.0,3.0,...,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0


Finally, let's train a very simple classifier, by training on the first half of the messages, and testing on the second half. This classifier will give a score to every email, and we can then choose a threshold where everything above the threshold is classified as spam, and every thing below as ham.

In [8]:
n = mldb.get('/v1/query', q='select count(*) as n from enron_features',
             format='aos').json()[0]['n']

res = mldb.put('/v1/procedures/experiment', {
    'type': 'classifier.experiment',
    'params': {
        'experimentName': 'enron_experiment1',
        'trainingData': 'select {features.*} as features, label from enron_features',
        # for now 50/50 split in time, but we might do something more
        # fancy in a later version!
        'datasetFolds': [{
            'training_limit': n // 2,
            'testing_offset': n // 2,
            'orderBy': 'index',
        }],
        'modelFileUrlPattern': 'file://enron_model_$runid.cls',
        'algorithm': 'bbdt',
        'runOnCreation': True
    }
})
print res

<Response [201]>


In [9]:
print 'AUC =', res.json()['status']['firstRun']['status']['aggregatedTest']['auc']['mean']

AUC = 0.998176833696


Not a bad AUC for a model that simple. But [the AUC score of a classifier is only a very generic measure of performance][1]. When having a specific problem like spam filtering, we're better off using a performance metric that truly matches our intuition about what a good spam filter ought to be. Namely, a good spam filtering algorithm should almost never flag as spam a legitime email, while keeping your inbox as spam-free as possible. This is what should be used to choose the threshold for the classifier, and then to measure its performance.

So instead of the AUC (that doesn't pick a specific threshold but uses all of them), let's use as our performance metric the best [$F_{0.05}$ score][2], which gives 20 times more importance to precision than recall. In other words, this metric represents the fact that classifying as spam **only** what is really spam is 20 times more important than finding all the spam.

Let's see how we are doing with that metric.
[1]: http://mldb.ai/blog/posts/2016/01/ml-meets-economics/
[2]: https://en.wikipedia.org/wiki/F1_score

In [10]:
print mldb.put('/v1/functions/enron_score', {
    'type': 'sql.expression',
    'params': {
        'expression': """
            (1 + pow(ratio, 2)) * (precision * recall) / (precision * pow(ratio, 2) + recall) as enron_score
            """
    }
})

<Response [201]>


In [11]:
mldb.query("""
    select "truePositives", "trueNegatives", "falsePositives", "falseNegatives", precision, recall, score,
           enron_score({precision, recall, ratio:0.05}) as *
    from enron_experiment1_results_0
    order by enron_score desc
""")

Unnamed: 0_level_0,enron_score,falseNegatives,falsePositives,precision,recall,score,trueNegatives,truePositives
_rowName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
enron_1_mail_1267,0.997378,575,1,0.998920,0.616667,2.095650,3671,925
enron_1_mail_3231,0.997373,576,1,0.998919,0.616000,2.097375,3671,924
enron_1_mail_4686,0.997367,577,1,0.998918,0.615333,2.099251,3671,923
enron_1_mail_2213,0.997362,578,1,0.998917,0.614667,2.100413,3671,922
enron_1_mail_150,0.997356,579,1,0.998915,0.614000,2.100464,3671,921
enron_1_mail_807,0.997351,580,1,0.998914,0.613333,2.109033,3671,920
enron_1_mail_943,0.997345,581,1,0.998913,0.612667,2.110097,3671,919
enron_1_mail_3208,0.997339,582,1,0.998912,0.612000,2.116447,3671,918
enron_1_mail_3225,0.997334,583,1,0.998911,0.611333,2.118578,3671,917
enron_1_mail_4875,0.997328,584,1,0.998909,0.610667,2.120375,3671,916


    As you can see, the best threshold is one where in case of doubt, almost everything is classified as "ham". This leads to 575 spam messages in the inbox, and only 1 ham wrongly filtered as spam. Now how can we improve this?

# To Be Continued...