In [7]:
import numpy as np
import pandas as pd
import matplotlib as plt
%matplotlib inline

In [8]:
import nltk
import re
import string
porter = nltk.PorterStemmer()

# Build linear SVM model

In [9]:
from scipy.io import loadmat
train_set = loadmat('spamTrain.mat')

In [10]:
X = train_set['X']
y = train_set['y'].ravel()

In [11]:
from sklearn.svm import LinearSVC
clf = LinearSVC(C=0.1)
clf.fit(X,y)

LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [12]:
y_predicted = clf.predict(X)

### Evaluation

In [13]:
from sklearn.metrics import accuracy_score
accuracy_score(y, y_predicted)

0.99975

In [14]:
test_set = loadmat('spamTest.mat')

In [15]:
X_test = test_set['Xtest']
y_test = test_set['ytest']

In [16]:
y_test_predicted = clf.predict(X_test)

In [17]:
accuracy_score(y_test, y_test_predicted)

0.992

# Testing the model 

### Cleaning the email
1. replace numbers with "number"
2. replace '$' with "dollar"
3. replace urls with "httpaddr"
4. replace emails with "email"
5. remove any punctuation 
6. replace whitespaces (tabs, newline, multiple space) with a single space
7. apply porter stemmer on words

In [18]:
def clean_email(email):
    
    email = email.lower()
    email = re.sub('[0-9]+','number', email)
    email = re.sub('[$]+','dollar', email)  
    email = re.sub('(http|https)://[^\s]*', 'httpaddr', email)
    email = re.sub('[^\s]+@[^\s]+', 'emailaddr', email)
    email = re.sub('[\n]+',' ', email)
    email = re.sub('[\t]+',' ', email)
    email = re.sub('  ',' ', email)
    email = [char for char in email if char not in string.punctuation]
    email = ''.join(email)
    email = email.strip(' ')
    
    return [porter.stem(t) for t in email.split()]

In [19]:
f = open('vocab.txt')
vocablist = f.read()

vocab_map = dict()
for i in open('vocab.txt'):
    word = i.split()
    vocab_map[word[1]] = int(word[0])

In [20]:
def create_feature_vector(email):
    word_indices = list(map(vocab_map.get, email))
    word_indices = [i for i in word_indices if i != None]

    email_features = np.zeros((1899,1))
    email_features[word_indices] = 1
    
    return email_features

## Email Samples

In [21]:
f = open('emailSample1.txt')
email1 = f.read()

In [22]:
email1

"> Anyone knows how much it costs to host a web portal ?\n>\nWell, it depends on how many visitors you're expecting.\nThis can be anywhere from less than 10 bucks a month to a couple of $100. \nYou should checkout http://www.rackspace.com/ or perhaps Amazon EC2 \nif youre running something big..\n\nTo unsubscribe yourself from this mailing list, send an email to:\ngroupname-unsubscribe@egroups.com\n\n"

In [23]:
cleaned_email1 = clean_email(email1)

In [24]:
cleaned_email1

['anyon',
 'know',
 'how',
 'much',
 'it',
 'cost',
 'to',
 'host',
 'a',
 'web',
 'portal',
 'well',
 'it',
 'depend',
 'on',
 'how',
 'mani',
 'visitor',
 'your',
 'expect',
 'thi',
 'can',
 'be',
 'anywher',
 'from',
 'less',
 'than',
 'number',
 'buck',
 'a',
 'month',
 'to',
 'a',
 'coupl',
 'of',
 'dollarnumb',
 'you',
 'should',
 'checkout',
 'httpaddr',
 'or',
 'perhap',
 'amazon',
 'ecnumb',
 'if',
 'your',
 'run',
 'someth',
 'big',
 'to',
 'unsubscrib',
 'yourself',
 'from',
 'thi',
 'mail',
 'list',
 'send',
 'an',
 'email',
 'to',
 'emailaddr']

In [25]:
features_email1 = create_feature_vector(cleaned_email1)

In [26]:
clf.predict(features_email1.reshape((1,1899)))

array([0], dtype=uint8)

In [27]:
f = open('emailSample2.txt')
email2 = f.read()

In [28]:
cleaned_email2 = clean_email(email2)

In [29]:
features_email2 = create_feature_vector(cleaned_email2)

In [30]:
clf.predict(features_email2.reshape((1,1899)))

array([0], dtype=uint8)

## Spam Samples

In [31]:
f = open('spamSample1.txt')
spam1 = f.read()

In [32]:
cleaned_spam1 = clean_email(spam1)

In [33]:
features_spam1 = create_feature_vector(cleaned_spam1)

In [34]:
clf.predict(features_spam1.reshape((1,1899)))

array([1], dtype=uint8)

In [35]:
f = open('spamSample2.txt')
spam2 = f.read()

In [36]:
cleaned_spam2 = clean_email(spam2)

In [37]:
features_spam2 = create_feature_vector(cleaned_spam2)

In [38]:
clf.predict(features_spam2.reshape((1,1899)))

array([1], dtype=uint8)