In [1]:
import numpy as np
import matplotlib.pyplot as plt 
from scipy.io import loadmat
from sklearn import svm
import re
from stemming.porter2 import stem
import nltk, nltk.stem.porter
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Preprocessing Emails

In [3]:
!cat ./data/emailSample1.txt

> Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors you're expecting.
This can be anywhere from less than 10 bucks a month to a couple of $100. 
You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 
if youre running something big..

To unsubscribe yourself from this mailing list, send an email to:
groupname-unsubscribe@egroups.com



In [4]:
def preProcess(email):
    """
    Function to do some pre processing (simplification of e-mails).
    Comments throughout implementation describe what it does.
    Input = raw e-mail
    Output = processed (simplified) email
    """
    # Make the entire e-mail lower case
    email = email.lower()
    
    # Strip html tags (strings that look like <blah> where 'blah' does not
    # contain '<' or '>')... replace with a space
    email = re.sub('<[^<>]+>', ' ', email);
    
    #Any numbers get replaced with the string 'number'
    email = re.sub('[0-9]+', 'number', email)
    
    #Anything starting with http or https:// replaced with 'httpaddr'
    email = re.sub('(http|https)://[^\s]*', 'httpaddr', email)
    
    #Strings with "@" in the middle are considered emails --> 'emailaddr'
    email = re.sub('[^\s]+@[^\s]+', 'emailaddr', email);
    
    #The '$' sign gets replaced with 'dollar'
    email = re.sub('[$]+', 'dollar', email);
    
    return email

In [5]:
email_contents = open('./data/emailSample1.txt').read()
preProcess(email_contents)                      

"> anyone knows how much it costs to host a web portal ?\n>\nwell, it depends on how many visitors you're expecting.\nthis can be anywhere from less than number bucks a month to a couple of dollarnumber. \nyou should checkout httpaddr or perhaps amazon ecnumber \nif youre running something big..\n\nto unsubscribe yourself from this mailing list, send an email to:\nemailaddr\n\n"

In [6]:
def email2TokenList( raw_email ):
    """
    Function that takes in preprocessed (simplified) email, tokenizes it,
    stems each word, and returns an (ordered) list of tokens in the e-mail
    提取出预处理后的email中的每个单词，并加入到list中
    """
    
    # I'll use the NLTK stemmer because it more accurately duplicates the
    # performance of the OCTAVE implementation in the assignment
    stemmer = nltk.stem.porter.PorterStemmer()
    
    email = preProcess( raw_email )

    #Split the e-mail into individual words (tokens) (split by the delimiter ' ')
    #but also split by delimiters '@', '$', '/', etc etc
    #Splitting by many delimiters is easiest with re.split()
    tokens = re.split('[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\}\,\'\"\>\_\<\;\%]', email)
    
    #Loop over each word (token) and use a stemmer to shorten it,
    #then check if the word is in the vocab_list... if it is,
    #store what index in the vocab_list the word is
    tokenlist = []
    for token in tokens:
        
        #Remove any non alphanumeric characters
        token = re.sub('[^a-zA-Z0-9]', '', token);

        #Use the Porter stemmer to stem the word
        stemmed = stemmer.stem( token )
        
        #Throw out empty tokens
        if not len(token): continue
            
        #Store a list of all unique stemmed words
        tokenlist.append(stemmed)
            
    return tokenlist

In [7]:
def getVocabDict(reverse=False):
    """
    Function to read in the supplied vocab list text file into a dictionary.
    I'll use this for now, but since I'm using a slightly different stemmer,
    I'd like to generate this list myself from some sort of data set...
    Dictionary key is the stemmed word, value is the index in the text file
    If "reverse", the keys and values are switched.
    读取vocab list并将其转换成dictionary
    """
    vocab_dict = {}
    with open("./data/vocab.txt") as f:
        for line in f:
            (val, key) = line.split()
            if not reverse:
                vocab_dict[key] = int(val)
            else:
                vocab_dict[int(val)] = key
                
    return vocab_dict

In [8]:
def email2VocabIndices( raw_email, vocab_dict ):
    """
    Function that takes in a raw email and returns a list of indices corresponding
    to the location in vocab_dict for each stemmed word in the email.
    如果email中的单词出现在vocab_dict中，则将该单词对应的索引加入list，否则跳过
    """
    tokenlist = email2TokenList( raw_email )
    index_list = [ vocab_dict[token] for token in tokenlist if token in vocab_dict ]
    return index_list

In [9]:
# Extracting Features from Emails

In [10]:
def email2FeatureVector( raw_email, vocab_dict ):
    """
    Function that takes as input a raw email, and returns a vector of shape
    (n,1) where n is the size of the vocab_dict.
    The first element in this vector is 1 if the vocab word with index == 1
    is in the raw_email, 0 otherwise.
    将特征向量化，向量的长度为vocab_dict的大小。
    如果email中的单词出现在vocab_dict中，则将向量中对应位置的分量置为1，否则置为0
    """
    n = len(vocab_dict)
    result = np.zeros((n,1))
    vocab_indices = email2VocabIndices( email_contents, vocab_dict ) # 返回index
    for idx in vocab_indices:
        result[idx] = 1
    return result

In [11]:
vocab_dict = getVocabDict()
email_contents = open('./data/emailSample1.txt').read()
feature_v = email2FeatureVector(email_contents, vocab_dict)
len(feature_v), np.sum(feature_v)                 

(1899, 45.0)

In [12]:
path = './data/spamTrain.mat'
path2 = './data/spamTest.mat'

train_data = loadmat(path)
X = train_data['X']
y = train_data['y']

test_data = loadmat(path2)
Xtest = test_data['Xtest']
ytest = test_data['ytest']

X.shape, y.shape, Xtest.shape, ytest.shape

((4000, 1899), (4000, 1), (1000, 1899), (1000, 1))

In [13]:
X_pos = np.array([X[i] for i in range(X.shape[0]) if y[i] == 1])
X_neg = np.array([X[i] for i in range(X.shape[0]) if y[i] == 0])
X_pos.shape, X_neg.shape 

((1277, 1899), (2723, 1899))

In [14]:
C = 0.1
linear_svm = svm.SVC(C=C, kernel='linear')
linear_svm.fit(X, y)

SVC(C=0.1, kernel='linear')

In [15]:
# 训练集准确率
train_pred = linear_svm.score(X, y)

# 测试集准确率
test_pred = linear_svm.score(Xtest, ytest)

print(f'Train acuuracy: {train_pred * 100}%')
print(f'Test accuracy: {test_pred * 100}%')

Train acuuracy: 99.825%
Test accuracy: 98.9%


In [16]:
# 对前面处理过的邮件的预测
linear_svm.predict(feature_v.T)

array([0], dtype=uint8)

In [17]:
# Determine the words most likely to indicate an e-mail is a spam
# From the trained SVM we can get a list of the weight coefficients for each
# word (technically, each word index)

vocab_dict_flipped = getVocabDict(reverse=True)

# Sort indicies from most important to least-important (high to low weight)
# The absolute size of the coefficient relative to the other ones gives an indication of how important the feature was for the separation. 
sorted_indices = np.argsort( linear_svm.coef_, axis=None )[::-1]
print ("The 15 most important words to classify a spam e-mail are:")
print ([ vocab_dict_flipped[x] for x in sorted_indices[:15] ])

The 15 most important words to classify a spam e-mail are:
['otherwis', 'clearli', 'remot', 'gt', 'visa', 'base', 'doesn', 'wife', 'previous', 'player', 'mortgag', 'natur', 'll', 'futur', 'hot']
