In [99]:
# Machine Learning Online Class
#  Exercise 6 | Spam Classification with SVMs
#
#  Instructions
#  ------------
# 
#  This file contains code that helps you get started on the
#  exercise. You will need to complete the following functions:
#
#     gaussianKernel.m
#     dataset3Params.m
#     processEmail.m
#     emailFeatures.m
#
#  For this exercise, you will not need to change any code in this file,
#  or any other files other than those mentioned above.
import numpy as np
import scipy.io as scio
from sklearn.svm import SVC

# 1. Email处理

In [8]:
def readFile(filename):
    #READFILE reads a file and returns its entire contents 
    #   file_contents = READFILE(filename) reads a file and returns its entire
    #   contents in file_contents
    #

    # Load File
    '''fobj = open(filename)
    try:
        file_contents = fobj.read()
    except Exception as e:
        print('Unable to open %s\n'%filename);'''
    
    with open(filename) as fobj:
        file_contents = fobj.read()
    
    return file_contents

In [6]:
def getVocabList():
    #GETVOCABLIST reads the fixed vocabulary list in vocab.txt and returns a
    #cell array of the words
    #   vocabList = GETVOCABLIST() reads the fixed vocabulary list in vocab.txt 
    #   and returns a cell array of the words in vocabList.

    # Read the fixed vocabulary list
    #fobj = fopen('vocab.txt');

    # Store all dictionary words in cell array vocab{}
    #n = 1899;  # Total number of words in the dictionary

    # For ease of implementation, we use a struct to map the strings => integers
    # In practice, you'll want to use some form of hashmap
    #vocabList = cell(n, 1)
    vocabList = []
    
    with open('vocab.txt') as fobj:
        for line in fobj.readlines():
            vocabList.append(line.split('\t')[1].strip('\n'))

    return vocabList

In [88]:
import re
from nltk.stem.porter import PorterStemmer 
def processEmail(email_contents):
    #PROCESSEMAIL preprocesses a the body of an email and
    #returns a list of word_indices 
    #   word_indices = PROCESSEMAIL(email_contents) preprocesses 
    #   the body of an email and returns a list of indices of the 
    #   words contained in the email. 
    #

    # Load Vocabulary
    vocabList = getVocabList()

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers

    # hdrstart = strfind(email_contents, ([char(10) char(10)]));
    # email_contents = email_contents(hdrstart(1):end);

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with > and replace
    # and does not have any < or > in the tag it with a space
    email_contents = re.sub('<[^<>]+>',' ',email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.sub('[0-9]+','number',email_contents)

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.sub('(http|https)://[^\s]*','httpaddr',email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.sub('[^\s]+@[^\s]+','emailaddr',email_contents)

    # Handle $ sign
    email_contents = re.sub('[$]+','dollar',email_contents)


    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n==== Processed Email ====\n\n');

    # Process file
    l = 0;

    while email_contents !='':

        # Tokenize and also get rid of any punctuation
        split_list = re.split(r'[ @$/#.-:&*+=\[\]?!(){},\'">_<;% \n\r]',email_contents, 1) 
        #split_list = re.split(r'[ @$/#-:&*+=[]!(){},>_<;%]',email_contents, 1) 
        #split_list = re.split('[>@]',email_contents, 1) 
           #re.split([' @$/#.-:&*+=[]?!(){},''">_<;%' char(10) char(13)],email_contents, 1); 
            # char(10)换行键 char(13)回车键
        #print (split_list)
        str, email_contents = split_list[0],split_list[1]

        # Remove any non alphanumeric characters
        str = re.sub('[^a-zA-Z0-9]', '', str);

        # Stem the word 
        # (the porterStemmer sometimes has issues, so we use a try catch block)
        '''try str = porterStemmer(strtrim(str)); 
        catch str = ''; continue;
        end;'''
        
        try:
            str = PorterStemmer().stem(str.strip())
        except Exception as e:
            print('Exception')
        

        # Skip the word if it is too short
        if len(str) <= 1:
           continue
        

        # Look up the word in the dictionary and add to word_indices if
        # found
        # ====================== YOUR CODE HERE ======================
        # Instructions: Fill in this function to add the index of str to
        #               word_indices if it is in the vocabulary. At this point
        #               of the code, you have a stemmed word from the email in
        #               the variable str. You should look up str in the
        #               vocabulary list (vocabList). If a match exists, you
        #               should add the index of the word to the word_indices
        #               vector. Concretely, if str = 'action', then you should
        #               look up the vocabulary list to find where in vocabList
        #               'action' appears. For example, if vocabList{18} =
        #               'action', then, you should add 18 to the word_indices 
        #               vector (e.g., word_indices = [word_indices ; 18]; ).
        # 
        # Note: vocabList{idx} returns a the word with index idx in the
        #       vocabulary list.
        # 
        # Note: You can use strcmp(str1, str2) to compare two strings (str1 and
        #       str2). It will return 1 only if the two strings are equivalent.
        #
        try:
            indice = vocabList.index(str)
            word_indices.append(indice)
        except Exception as e:
            print("%s not in vocabList"%str)


        # =============================================================

        # Print to screen, ensuring that the output lines are not too long
        if (l + len(str) + 1) > 78:
            print('\n');
            l = 0;

        print('%s '%str);
        l = l + len(str) + 1;

    # Print footer
    print('\n\n=========================\n');
    return word_indices

In [89]:
# ==================== Part 1: Email Preprocessing ====================
#  To use an SVM to classify emails into Spam v.s. Non-Spam, you first need
#  to convert each email into a vector of features. In this part, you will
#  implement the preprocessing steps for each email. You should
#  complete the code in processEmail.m to produce a word indices vector
#  for a given email.

print('\nPreprocessing sample email (emailSample1.txt)\n');

# Extract Features
file_contents = readFile('emailSample1.txt');
word_indices  = processEmail(file_contents);

# Print Stats
print('Word Indices: \n');
print(word_indices)
print('\n\n');


Preprocessing sample email (emailSample1.txt)


==== Processed Email ====


anyon 
know 
how 
much 
it 
cost 
to 
host 
web 
portal not in vocabList
portal 
well 
it 
depend 
on 
how 
mani 
visitor not in vocabList


visitor 
you 
re 
expect 
thi 
can 
be 
anywher 
from 
less 
than 
number 
buck not in vocabList
buck 
month 
to 


coupl 
of 
dollarnumb 
you 
should 
checkout not in vocabList
checkout 
httpaddr 
or 
perhap 
amazon not in vocabList
amazon 
ecnumb not in vocabList
ecnumb 
if 


your 
run 
someth 
big 
to 
unsubscrib 
yourself 
from 
thi 
mail 
list 
send 
an 
email 


to 
emailaddr 



Word Indices: 

[85, 915, 793, 1076, 882, 369, 1698, 789, 1821, 1830, 882, 430, 1170, 793, 1001, 1892, 1363, 591, 1675, 237, 161, 88, 687, 944, 1662, 1119, 1061, 1698, 374, 1161, 478, 1892, 1509, 798, 1181, 1236, 809, 1894, 1439, 1546, 180, 1698, 1757, 1895, 687, 1675, 991, 960, 1476, 70, 529, 1698, 530]





# 2. 特征提取

In [233]:
def emailFeatures(word_indices):
    #EMAILFEATURES takes in a word_indices vector and produces a feature vector
    #from the word indices
    #   x = EMAILFEATURES(word_indices) takes in a word_indices vector and 
    #   produces a feature vector from the word indices. 

    # Total number of words in the dictionary
    #n = 1899; # hard code?
    # Load Vocabulary
    n = len(getVocabList())

    # You need to return the following variables correctly.
    x = np.zeros((n, 1))

    # ====================== YOUR CODE HERE ======================
    # Instructions: Fill in this function to return a feature vector for the
    #               given email (word_indices). To help make it easier to 
    #               process the emails, we have have already pre-processed each
    #               email and converted each word in the email into an index in
    #               a fixed dictionary (of 1899 words). The variable
    #               word_indices contains the list of indices of the words
    #               which occur in one email.
    # 
    #               Concretely, if an email has the text:
    #
    #                  The quick brown fox jumped over the lazy dog.
    #
    #               Then, the word_indices vector for this text might look 
    #               like:
    #               
    #                   60  100   33   44   10     53  60  58   5
    #
    #               where, we have mapped each word onto a number, for example:
    #
    #                   the   -- 60
    #                   quick -- 100
    #                   ...
    #
    #              (note: the above numbers are just an example and are not the
    #               actual mappings).
    #
    #              Your task is take one such word_indices vector and construct
    #              a binary feature vector that indicates whether a particular
    #              word occurs in the email. That is, x(i) = 1 when word i
    #              is present in the email. Concretely, if the word 'the' (say,
    #              index 60) appears in the email, then x(60) = 1. The feature
    #              vector should look like:
    #
    #              x = [ 0 0 0 0 1 0 0 0 ... 0 0 0 0 1 ... 0 0 0 1 0 ..];
    #
    #
    x = np.array([1 if i in word_indices else 0 for i in range(n)]).reshape(1,-1)

    # =========================================================================
    return x

In [234]:
# ==================== Part 2: Feature Extraction ====================
#  Now, you will convert each email into a vector of features in R^n. 
#  You should complete the code in emailFeatures.m to produce a feature
#  vector for a given email.

print('\nExtracting features from sample email (emailSample1.txt)\n');

# Extract Features
file_contents = readFile('emailSample1.txt');
word_indices  = processEmail(file_contents);
features      = emailFeatures(word_indices);

# Print Stats
print('Length of feature vector: %d\n'%len(features))
print('Number of non-zero entries: %d\n'%np.sum(features > 0));


Extracting features from sample email (emailSample1.txt)


==== Processed Email ====


anyon 
know 
how 
much 
it 
cost 
to 
host 
web 
portal not in vocabList
portal 
well 
it 
depend 
on 
how 
mani 
visitor not in vocabList


visitor 
you 
re 
expect 
thi 
can 
be 
anywher 
from 
less 
than 
number 
buck not in vocabList
buck 
month 
to 


coupl 
of 
dollarnumb 
you 
should 
checkout not in vocabList
checkout 
httpaddr 
or 
perhap 
amazon not in vocabList
amazon 
ecnumb not in vocabList
ecnumb 
if 


your 
run 
someth 
big 
to 
unsubscrib 
yourself 
from 
thi 
mail 
list 
send 
an 
email 


to 
emailaddr 



Length of feature vector: 1

Number of non-zero entries: 45



因为有重复词，所以非0元素个数为45

# 3. 用线性SVM训练垃圾邮件分类器

In [None]:
# =========== Part 3: Train Linear SVM for Spam Classification ========
#  In this section, you will train a linear classifier to determine if an
#  email is Spam or Not-Spam.

# Load the Spam Email dataset
# You will have X, y in your environment
data = scio.loadmat('spamTrain.mat');
X = data['X']
y = data['y']

In [96]:
print(X)
print(y)

[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
[[1]
 [1]
 [0]
 ..., 
 [1]
 [0]
 [0]]


In [97]:
len(X)

4000

用LinearSVC训练

In [215]:
from sklearn.svm import LinearSVC
print('\nTraining Linear SVM (Spam Classification)\n')
print('(this may take 1 to 2 minutes) ...\n')

C = 0.1;
#model = svmTrain(X, y, C, @linearKernel);
model = LinearSVC(C=C)
model.fit(X,y.ravel())

p = model.predict(X)

print('Training Accuracy: %f\n'%(np.mean(np.double(p.reshape(-1,1) == y)) * 100))


Training Linear SVM (Spam Classification)

(this may take 1 to 2 minutes) ...

Training Accuracy: 99.975000



运行非常快（1秒 vs SVC的十几秒）
运行多次结果也相同（未加random_state参数）

# 4. 测试垃圾邮件分类器

In [216]:
# =================== Part 4: Test Spam Classification ================
#  After training the classifier, we can evaluate it on a test set. We have
#  included a test set in spamTest.mat

# Load the test dataset
# You will have Xtest, ytest in your environment
spamtest = scio.loadmat('spamTest.mat');

Xtest = spamtest['Xtest']
ytest = spamtest['ytest']

print('\nEvaluating the trained Linear SVM on a test set ...\n')

p = model.predict(Xtest)

print('Test Accuracy: %f\n'%(np.mean(np.double(p.reshape(-1,1) == ytest)) * 100));


Evaluating the trained Linear SVM on a test set ...

Test Accuracy: 99.200000



# 5. 对垃圾邮件预测度最高的词

In [217]:
# ================= Part 5: Top Predictors of Spam ====================
#  Since the model we are training is a linear SVM, we can inspect the
#  weights learned by the model to understand better how it is determining
#  whether an email is spam or not. The following code finds the words with
#  the highest weights in the classifier. Informally, the classifier
#  'thinks' that these words are the most likely indicators of spam.
#

# Sort the weights and obtin the vocabulary list
#weight, idx = sort(model.coef_, 'descend');
idx = np.argsort(-model.coef_,) # descend sorting
#idx由model的权重参数排序位置组成
#print(model.coef_)
vocabList = getVocabList();

print('\nTop predictors of spam: \n');
for i in range(15): # 列出权重值最高的15个词
    #print(' %-15s (%f) \n'%(vocabList[idx[i]], weight[i]))
    print(vocabList[idx[0,i]])
    print(model.coef_[0,idx[0,i]])


Top predictors of spam: 

our
0.421665082918
remov
0.387173146899
click
0.387059738578
basenumb
0.346617244183
guarante
0.341685556413
visit
0.303027776237
bodi
0.263523552183
will
0.244393910157
numberb
0.238794974038
price
0.234199073925
dollar
0.232314845744
nbsp
0.227080751854
below
0.22319898392
lo
0.219993756217
most
0.214548559064


和作业MATLAB程序的结果有一定差异，使用线性核函数的SVC分类器试试

In [219]:
from sklearn.svm import SVC
print('\nTraining Linear SVM (Spam Classification)\n using SVC of linear kernel')
print('(this may take 1 to 2 minutes) ...\n')

C = 0.1;
#model = svmTrain(X, y, C, @linearKernel);
model = SVC(C=C,kernel = 'linear')
model.fit(X,y.ravel())

p = model.predict(X)

print('Training Accuracy: %f\n'%(np.mean(np.double(p.reshape(-1,1) == y)) * 100))


Training Linear SVM (Spam Classification)
 using SVC of linear kernel
(this may take 1 to 2 minutes) ...

Training Accuracy: 99.825000



In [220]:
# Load the test dataset
# You will have Xtest, ytest in your environment
spamtest = scio.loadmat('spamTest.mat');

Xtest = spamtest['Xtest']
ytest = spamtest['ytest']

print('\nEvaluating the trained Linear SVM on a test set ...\n')

p = model.predict(Xtest)

print('Test Accuracy: %f\n'%(np.mean(np.double(p.reshape(-1,1) == ytest)) * 100));


Evaluating the trained Linear SVM on a test set ...

Test Accuracy: 98.900000



In [221]:
# Sort the weights and obtin the vocabulary list
#weight, idx = sort(model.coef_, 'descend');
idx = np.argsort(-model.coef_,) # descend sorting
#idx由model的权重参数排序位置组成
#print(model.coef_)
vocabList = getVocabList();

print('\nTop predictors of spam: \n');
for i in range(15): # 列出权重值最高的15个词
    #print(' %-15s (%f) \n'%(vocabList[idx[i]], weight[i]))
    print(vocabList[idx[0,i]])
    print(model.coef_[0,idx[0,i]])


Top predictors of spam: 

our
0.500613736175
click
0.465916390689
remov
0.422869117061
guarante
0.383621601794
visit
0.367710398246
basenumb
0.345064097946
dollar
0.323632035796
will
0.269724106037
price
0.267297714618
pleas
0.2611688867
most
0.257298197952
nbsp
0.25394145516
lo
0.253466524314
ga
0.248296990456
hour
0.246404357832


结果与MATLAB程序结果更接近

以下部分为optional

# 6. 尝试自己的Email

In [235]:
# =================== Part 6: Try Your Own Emails =====================
#  Now that you've trained the spam classifier, you can use it on your own
#  emails! In the starter code, we have included spamSample1.txt,
#  spamSample2.txt, emailSample1.txt and emailSample2.txt as examples. 
#  The following code reads in one of these emails and then uses your 
#  learned SVM classifier to determine whether the email is Spam or 
#  Not Spam

# Set the file to be read in (change this to spamSample2.txt,
# emailSample1.txt or emailSample2.txt to see different predictions on
# different emails types). Try your own emails as well!
filename = 'spamSample1.txt';

# Read and predict
file_contents = readFile(filename);
word_indices  = processEmail(file_contents);
#print(word_indices)
x             = emailFeatures(word_indices);
#print(x)
p = model.predict(x)

print('\nProcessed %s\n\nSpam Classification: %d\n'%(filename, p))
print('(1 indicates spam, 0 indicates not spam)\n\n')


==== Processed Email ====


do 
you 
want 
to 
make 
dollarnumb 
or 
more 
per 
week 
if 
you 
are not in vocabList
are 
motiv 
and 
qualifi 


individu 
will 
person 
demonstr not in vocabList
demonstr 
to 
you 
system 
that 
will 
make 
you 
dollarnumb 


number 
per 
week 
or 
more 
thi 
is 
not 
mlm not in vocabList
mlm 
call 
our 
number 
hour 
prerecord not in vocabList
prerecord 
number 


to 
get 
the 
detail 
numbernumbernumb not in vocabList
numbernumbernumb 
need 
peopl 
who 
want 
to 
make 
seriou 
money not in vocabList
money 


make 
the 
call 
and 
get 
the 
fact 
invest 
number 
minut 
in 
yourself 
now 
numbernumbernumb not in vocabList


numbernumbernumb 
look 
forward 
to 
your 
call 
and 
will 
introduc 
you 
to 
peopl 


like 
yourself 
who 
are not in vocabList
are 
current 
make 
dollarnumb 
number 
plu 
per 
week 
numbernumbernumb not in vocabList


numbernumbernumb 
numberljgvnumbernumberleannumberlrmsnumbernumberwxhonumberqiytnumbernumberrjuvnumberhqcfnumbern

In [226]:
X.shape

(4000, 1899)

In [232]:
x.shape

(1899,)