In [1]:
import re
from nltk import PorterStemmer
import numpy as np
from scipy.io import loadmat
from sklearn import svm

In [2]:
def readFile(file_name):
    with open(file_name) as f:
        return f.read()

def emailFeatures(word_indices):
    
    # Total number of words in the dictionary
    n = 1899;

    # You need to return the following variables correctly.
    x = np.zeros((n, 1))
    x[word_indices, 0] = 1
    return x

def getVocabList():
    #GETVOCABLIST reads the fixed vocabulary list in vocab.txt and returns a
    #cell array of the words
    #   vocabList = GETVOCABLIST() reads the fixed vocabulary list in vocab.txt 
    #   and returns a cell array of the words in vocabList.


    ## Read the fixed vocabulary list
    with open('vocab.txt', 'r') as vocabFile:

        # Store all dictionary words in dictionary vocabList
        vocabList = {}
        for line in vocabFile.readlines():
            i, word = line.split()
            vocabList[word] = int(i)

    return vocabList

def processEmail(email_contents):
    #PROCESSEMAIL preprocesses a the body of an email and
    #returns a list of word_indices 
    #   word_indices = PROCESSEMAIL(email_contents) preprocesses 
    #   the body of an email and returns a list of indices of the 
    #   words contained in the email. 
    #

    # Load Vocabulary
    vocabList = getVocabList()

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers

    # hdrstart = email_contents.find("\n\n")
    # if hdrstart:
    #     email_contents = email_contents[hdrstart:]

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with > and replace
    # and does not have any < or > in the tag it with a space
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr', email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # Handle $ sign
    email_contents = re.sub('[$]+', 'dollar', email_contents)


    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n==== Processed Email ====\n\n')

    # Process file
    l = 0

    # Slightly different order from matlab version

    # Split and also get rid of any punctuation
    # regex may need further debugging...
    email_contents = re.split(r'[@$/#.-:&\*\+=\[\]?!(){},\'\'\">_<;%\s\n\r\t]+', email_contents)

    for token in email_contents:

        # Remove any non alphanumeric characters
        token = re.sub('[^a-zA-Z0-9]', '', token)

        # Stem the word 
        token = PorterStemmer().stem(token.strip())

        # Skip the word if it is too short
        if len(token) < 1:
           continue

        # Look up the word in the dictionary and add to word_indices if
        # found
        # ====================== YOUR CODE HERE ======================
        # Instructions: Fill in this function to add the index of str to
        #               word_indices if it is in the vocabulary. At this point
        #               of the code, you have a stemmed word from the email in
        #               the variable str. You should look up str in the
        #               vocabulary list (vocabList). If a match exists, you
        #               should add the index of the word to the word_indices
        #               vector. Concretely, if str = 'action', then you should
        #               look up the vocabulary list to find where in vocabList
        #               'action' appears. For example, if vocabList{18} =
        #               'action', then, you should add 18 to the word_indices 
        #               vector (e.g., word_indices = [word_indices ; 18]; ).
        # 
        # Note: vocabList{idx} returns a the word with index idx in the
        #       vocabulary list.
        # 
        # Note: You can use strcmp(str1, str2) to compare two strings (str1 and
        #       str2). It will return 1 only if the two strings are equivalent.
        #

        idx = vocabList[token] if token in vocabList else 0

        # only add entries which are in vocabList
        #   i.e. those with ind ~= 0, 
        #        given that ind is assigned 0 if str is not found in vocabList
        if idx > 0:
            word_indices.append(idx)

        # =============================================================


        # Print to screen, ensuring that the output lines are not too long
        if l + len(token) + 1 > 78:
            print("")
            l = 0
        print('{:s}'.format(token)),
        l = l + len(token) + 1

    # Print footer
    print('\n\n=========================\n')

    return word_indices

Preprocessing sample email (emailSample1.txt)

In [3]:
file_contents = readFile("emailSample1.txt")
print(file_contents)
word_indices = processEmail(file_contents)
print("Word indices: ", word_indices)

> Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors you're expecting.
This can be anywhere from less than 10 bucks a month to a couple of $100. 
You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 
if youre running something big..

To unsubscribe yourself from this mailing list, send an email to:
groupname-unsubscribe@egroups.com



==== Processed Email ====


anyon
know
how
much
it
cost
to
host
a
web
portal
well
it
depend
on
how
mani

visitor
you
re
expect
thi
can
be
anywher
from
less
than
number
buck
a
month

to
a
coupl
of
dollarnumb
you
should
checkout
httpaddr
or
perhap
amazon
ecnumb

if
your
run
someth
big
to
unsubscrib
yourself
from
thi
mail
list
send
an

email
to
emailaddr



Word indices:  [86, 916, 794, 1077, 883, 370, 1699, 790, 1822, 1831, 883, 431, 1171, 794, 1002, 1893, 1364, 592, 1676, 238, 162, 89, 688, 945, 1663, 1120, 1062, 1699, 375, 1162, 479, 1893, 1510, 799, 1182, 1237, 810, 1895, 1440, 1547, 181, 1699, 17

In [4]:
features = emailFeatures(word_indices)
print("Length of feature vector ", features.size)
print("Number of non-zero entries ", np.sum(features>0))

Length of feature vector  1899
Number of non-zero entries  45


Load the Spam Email dataset

In [5]:
data = loadmat("spamTrain.mat")
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [6]:
X = data['X']
y = data['y']

In [7]:
model = svm.SVC(C=1.0, kernel="linear")
model.fit(X, y.ravel())
pred = model.predict(X)

In [8]:
pred = np.reshape(pred, (pred.size, 1))
print("Accurancy: ", np.mean(pred == y) * 100)

Accurancy:  99.975


Load test data

In [9]:
data_test = loadmat("spamTest.mat")
data_test.keys()

dict_keys(['__header__', '__version__', '__globals__', 'Xtest', 'ytest'])

In [10]:
Xtest = data_test["Xtest"]
ytest = data_test["ytest"]

In [11]:
test_pred = model.predict(Xtest)
test_pred = np.reshape(test_pred, ytest.shape)
print("Accurancy: ", np.mean(test_pred == ytest) * 100)

Accurancy:  97.8


Test with my email

In [13]:
my_email = readFile("spamSample2.txt")
print(my_email)

Best Buy Viagra Generic Online

Viagra 100mg x 60 Pills $125, Free Pills & Reorder Discount, Top Selling 100% Quality & Satisfaction guaranteed!

We accept VISA, Master & E-Check Payments, 90000+ Satisfied Customers!
http://medphysitcstech.ru





In [14]:
wi = processEmail(my_email)
my_email_features = emailFeatures(wi)


==== Processed Email ====


best
buy
viagra
gener
onlin
viagra
numbermg
x
number
pill
dollarnumb
free

pill
reorder
discount
top
sell
number
qualiti
satisfact
guarante
we
accept

visa
master
echeck
payment
number
satisfi
custom
httpaddr





In [15]:
result = model.predict(my_email_features.T)
print(result)

[1]
