Email spam classification


Data source: https://www.kaggle.com/veleon/ham-and-spam-dataset


Use the words in subject to distinguish spam vs. ham emails. 


Inspirations:
- What kind of words do spam emails use?
- Are all links in emails bad?
- Can you classify spam by reading only the subjects of an email?

In [45]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import email
from email.message import EmailMessage
import itertools
from collections import Counter
from sklearn.utils import shuffle
from sklearn.svm import SVC

%load_ext autoreload
%autoreload 2

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [46]:
fnames_spam = os.listdir("./hamnspam_dataset/spam")
fnames_ham = os.listdir("./hamnspam_dataset/ham")

wdir_spam = os.path.abspath("./hamnspam_dataset/spam")
wdir_ham = os.path.abspath("./hamnspam_dataset/ham")

In [47]:
print("The number of spam files: ", len(fnames_spam))
print("The number of ham files: ", len(fnames_ham))

The number of spam files:  502
The number of ham files:  2551


In [48]:
# def a function to clean up one subject line
def cleanup (line):

    line2 = line.replace(',', ' xcomma ').replace('-', ' xhyphen ' )\
                .replace('_', ' xunderscore ').replace('!', ' xclaim ')\
                .replace('{', ' xlbrace ').replace('}' , ' xrbrace ')\
                .replace('[', ' xlsquareb ').replace(']', ' xrsquareb ')\
                .replace('(', ' xlbracket ').replace(')', ' xrbracket ')\
                .replace('.', ' xdot ').replace('&', ' xampersand ')\
                .replace('#', ' xhash ').replace('?', ' xquestionm ')\
                .replace('+', ' xplus ').replace('/', ' xslash ')\
                .replace('|', ' xpipe ').replace('"', ' xquote ')\
                .replace('=', ' xequal ').replace('*', ' xasterisk ')\
                .replace(':', ' xcomma ').replace("'ve ", " have ")\
                .replace("'t ", " not ").replace("'re ", " are ")
    
    elems = line2.split()
    del elems[:2]
    
    output_elems = []
    
    for elem in elems:
        if elem.endswith("'s"):
            elem2 = elem[:-2]
        else:
            elem2 = elem
        
        # other cleaning
        elem_lc = elem2.lower()
        if elem_lc == '':
            elem_cln = 'xsign'
        elif "$" in elem_lc:
            elem_cln = 'xdollar'
        elif (".com" in elem_lc) or ("http:/" in elem_lc) or (".org" in elem_lc):
            elem_cln = 'xdotcom'
        elif "%" in elem_lc:
            elem_cln = 'xpercent'
        elif "lb " in elem_lc:
            elem_cln = 'xweight '
        elif elem_lc.replace('.','',1).isdigit():
            elem_cln = 'xdigit'
        elif elem_lc.isalpha():
            elem_cln = elem_lc 
        else:
            elem_cln = "xexotic"
        
        output_elems.append(elem_cln)
        
#         if elem_cln.isalpha() == False:
#             print(elem, elem_cln)
    
    return output_elems


In [52]:
# file parsing
data_spam = []

for i, fname in enumerate(fnames_spam): 
    textfile = os.path.join(wdir_spam, fname)

    with open(textfile, 'r', errors='ignore') as f:
        for line in f.readlines():
            if line.startswith('Subject:'):
                elem_cl = cleanup(line)
                data_spam.append([elem_cl])
                
                
# convert data_spam into pandas DataFrame
df_spam = pd.DataFrame(data=data_spam, columns=['subject'])
df_spam['spam'] = 1
df_spam.to_csv("df_spam.csv")
df_spam.head(3)

Unnamed: 0,subject,spam
0,"[friend, xcomma, copy, any, dvd, or, playstati...",1
1,"[xpercent, guaranteed, for, eight, years]",1
2,"[congratulations, xclaim, you, get, a, free, h...",1


In [50]:
spam_words = list(itertools.chain(*df_spam['subject']))
Counter(spam_words).most_common(10)

[('xdot', 297),
 ('xdigit', 211),
 ('xclaim', 182),
 ('xcomma', 167),
 ('xhyphen', 130),
 ('xexotic', 130),
 ('xequal', 106),
 ('xquestionm', 100),
 ('your', 74),
 ('xlsquareb', 59)]

In [53]:
data_ham = []

for i, fname in enumerate(fnames_ham): 
    textfile = os.path.join(wdir_ham, fname)

    with open(textfile, 'r', errors='ignore') as f:
        for line in f.readlines():
            if line.startswith('Subject:'):
                elem_cl = cleanup(line)
                data_ham.append([elem_cl])
                
                
# convert data_spam into pandas DataFrame
df_ham = pd.DataFrame(data=data_ham, columns=['subject'])
df_ham['spam'] = 0
df_ham.to_csv("df_ham.csv")
df_ham.head(5)

Unnamed: 0,subject,spam
0,"[priceless, rubens, works, stolen, in, raid, o...",0
1,"[making, a, mesh, on, the, move]",0
2,"[re, xcomma, sorting]",0
3,"[re, xcomma, java, is, for, kiddies]",0
4,"[skateboarder, drives, xhyphen, through, subwa...",0


In [54]:
ham_words = list(itertools.chain(*df_ham['subject']))
Counter(ham_words).most_common(10)

[('xcomma', 1924),
 ('re', 1455),
 ('xdot', 1180),
 ('xlsquareb', 877),
 ('xrsquareb', 877),
 ('xdigit', 753),
 ('xhyphen', 570),
 ('the', 389),
 ('for', 297),
 ('xslash', 264)]

In [33]:
# use the top 300 most common words in spam emails to create a vector

numofwords = 100

spam_words_tuple = Counter(spam_words).most_common(numofwords)
spam_words_list = [x[0] for x in spam_words_tuple]
spam_words_list[0:5]

['xdot', 'xdigit', 'xclaim', 'xcomma', 'xhyphen']

In [34]:
ham_words_tuple = Counter(ham_words).most_common(numofwords)
ham_words_list = [x[0] for x in ham_words_tuple]
ham_words_list[0:5]

['xcomma', 're', 'xdot', 'xlsquareb', 'xrsquareb']

In [56]:
# check the intersecting words in spam and ham emails
words_intersect = (set(spam_words_list)).intersection(set(ham_words_list))
words_intersect

{'a',
 'and',
 'are',
 'at',
 'for',
 'from',
 'get',
 'ilug',
 'in',
 'is',
 'new',
 'no',
 'not',
 'of',
 'on',
 're',
 'the',
 'to',
 'with',
 'xampersand',
 'xclaim',
 'xcomma',
 'xdigit',
 'xdot',
 'xexotic',
 'xhyphen',
 'xlbracket',
 'xlsquareb',
 'xquestionm',
 'xquote',
 'xrbracket',
 'xrsquareb',
 'xslash',
 'xunderscore',
 'you'}

In [35]:
# create a dictionary using the spam words as features

words_dict = {}
for i, w in enumerate(spam_words_list):
    words_dict[w] = i

In [36]:
# convert subject words in one email into a vector

def words_to_vector( input_list ):
    vec = np.zeros((1, numofwords))
    for word in input_list:
        if word in words_dict.keys():
            wordid = words_dict[word]
            vec[0, wordid] = 1
    
    return vec

In [37]:
# create a column "code" to dataTables
df_spam['code'] = df_spam['subject'].apply(words_to_vector)

df_spam['code'][0]

array([[1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]])

In [38]:
df_ham['code'] = df_ham['subject'].apply(words_to_vector)

df_ham['code'][0]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]])

In [39]:
# divide dataset into train and test randomly

frames = [df_spam, df_ham]
df_all = pd.concat(frames)

df_all_rnd = shuffle(df_all, random_state=9)  
print(df_all_rnd.shape)
df_all_rnd.head(3)

(3154, 3)


Unnamed: 0,subject,spam,code
1534,"[xlsquareb, zzzzteana, xrsquareb, betamax, fin...",0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2357,"[xlsquareb, satalk, xrsquareb, xlsquareb, ot, ...",0,"[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1994,"[forged, documents, xcomma, public, drinking, ...",0,"[[1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [40]:
numtrain = int(df_all_rnd.shape[0]*0.7)

X = np.array(df_all_rnd['code'][:numtrain])
y = np.array(df_all_rnd['spam'][:numtrain])
X2 = np.stack(X)
X3 = np.squeeze(X2, axis=1)

Xtest = df_all_rnd['code'][numtrain:]
Xtest2 = np.stack(Xtest)
Xtest3 = np.squeeze(Xtest2, axis=1)
ytest = df_all_rnd['spam'][numtrain:]

In [41]:
X3

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [42]:
# build svm model using sklearn

clf = SVC(gamma='auto')
clf.fit(X3, y) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [43]:
clf.score(X3, y)

0.8799275033982782

In [44]:
clf.score(Xtest3, ytest)

0.8796198521647307