# SVM

In [34]:
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np 
import os

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [36]:
# Load the iris dataset directly as a (features, labels) pair.
X_iris, y_iris = load_iris(return_X_y=True)

In [37]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the split reproducible.
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    X_iris,
    y_iris,
    test_size=0.2,
    random_state=0
)

In [38]:
# Fit the scaler on the training split only and reuse the fitted scaler for the
# test split, so the test data does not influence the scaling parameters.
scaler = StandardScaler()
X_train_iris_scaled = scaler.fit_transform(X_train_iris)
X_test_iris_scaled = scaler.transform(X_test_iris)

# SVM linear kernel

In [39]:
from sklearn.svm import LinearSVC

In [40]:
# Linear SVM trained on the scaled training split; accuracy is printed for both splits.
lin_svm = LinearSVC(C=1, max_iter=10000).fit(X_train_iris_scaled, y_train_iris)
print(f'train accuracy = {lin_svm.score(X_train_iris_scaled, y_train_iris):.3%}')
print(f'test accuracy = {lin_svm.score(X_test_iris_scaled, y_test_iris):.3%}')

train accuracy = 94.167%
test accuracy = 96.667%


# SVC RBF Kernel

In [41]:
from sklearn.svm import SVC

In [42]:
# SVM with an RBF kernel (C=10, gamma=0.001) on the same scaled splits.
# NOTE(review): the recorded test accuracy for this cell is clearly below the
# linear model's - presumably C/gamma were not tuned; confirm before reusing these values.
rbf_svm = SVC(C=10, kernel='rbf', gamma=0.001).fit(X_train_iris_scaled, y_train_iris)
print(f'train accuracy = {rbf_svm.score(X_train_iris_scaled, y_train_iris):.3%}')
print(f'test accuracy = {rbf_svm.score(X_test_iris_scaled, y_test_iris):.3%}')

train accuracy = 91.667%
test accuracy = 83.333%


# SVC Polynomial Kernel

In [43]:
# SVM with a degree-3 polynomial kernel and a large C (5000) on the same scaled splits.
poly_svm = SVC(C=5000, kernel='poly', degree=3).fit(X_train_iris_scaled, y_train_iris)
print(f'train accuracy = {poly_svm.score(X_train_iris_scaled, y_train_iris):.3%}')
print(f'test accuracy = {poly_svm.score(X_test_iris_scaled, y_test_iris):.3%}')

train accuracy = 100.000%
test accuracy = 96.667%


# Spam/Non-spam classification 

# Review sample mail

In [44]:
def get_sample(fn):
    '''
    Read a whole text file.

    fn: str - path to the file
    return: str - complete content of the file
    '''
    # text mode is the default for open(); the file is closed when the block exits
    with open(fn) as handle:
        return handle.read()

# The sample files are looked up in a 'data' folder inside the current working directory.
cwd = os.getcwd() # current working directory
path = os.path.join(cwd, 'data')
fn = os.path.join(path , 'emailSample1.txt')
content = get_sample(fn)

In [45]:
# Display the raw text of the sample mail.
content

"> Anyone knows how much it costs to host a web portal ?\n>\nWell, it depends on how many visitors you're expecting.\nThis can be anywhere from less than 10 bucks a month to a couple of $100. \nYou should checkout http://www.rackspace.com/ or perhaps Amazon EC2 \nif youre running something big..\n\nTo unsubscribe yourself from this mailing list, send an email to:\ngroupname-unsubscribe@egroups.com\n\n"

# Word tokenization

In [46]:
import re

In [47]:
def word_tokenize(content):
    '''
    Split the body of a mail into raw tokens.

    content: str - body of mail
    return: ndarray of tokens (str) e.g. ['>', 'Anyone', 'knows', 'how', 'much', 'it', 'costs', 'to', 'host', 'a']

    Every single whitespace character (space, newline, tab, carriage return, ...)
    and the caret character act as separators. Consecutive separators yield
    empty-string tokens; they are returned as-is and have to be filtered out by
    the caller.
    '''
    # Split on any whitespace character, not only on ' ' and newline: otherwise
    # words separated by a tab or by CRLF stay glued together in one token.
    # The '^' inside the class is a literal caret (it is not in first position).
    tokens = np.array(re.split(r'[\s^]', content))

    return tokens

In [48]:
# Tokenize the sample mail and display the tokens.
tokens = word_tokenize(content)
tokens


array(['>', 'Anyone', 'knows', 'how', 'much', 'it', 'costs', 'to', 'host',
       'a', 'web', 'portal', '?', '>', 'Well,', 'it', 'depends', 'on',
       'how', 'many', 'visitors', "you're", 'expecting.', 'This', 'can',
       'be', 'anywhere', 'from', 'less', 'than', '10', 'bucks', 'a',
       'month', 'to', 'a', 'couple', 'of', '$100.', '', 'You', 'should',
       'checkout', 'http://www.rackspace.com/', 'or', 'perhaps', 'Amazon',
       'EC2', '', 'if', 'youre', 'running', 'something', 'big..', '',
       'To', 'unsubscribe', 'yourself', 'from', 'this', 'mailing',
       'list,', 'send', 'an', 'email', 'to:',
       'groupname-unsubscribe@egroups.com', '', ''], dtype='<U33')

# Lower case

In [49]:
def lower_case(tokens):
    '''
    Convert all tokens to lower case.

    tokens: ndarray of str
    return: ndarray of the tokens in lower case (str)
    '''
    # np.char.lower lower-cases the array element-wise
    return np.char.lower(tokens)

In [50]:
# Lower-case the tokens of the sample mail and display them.
tokens = lower_case(tokens)
tokens

array(['>', 'anyone', 'knows', 'how', 'much', 'it', 'costs', 'to', 'host',
       'a', 'web', 'portal', '?', '>', 'well,', 'it', 'depends', 'on',
       'how', 'many', 'visitors', "you're", 'expecting.', 'this', 'can',
       'be', 'anywhere', 'from', 'less', 'than', '10', 'bucks', 'a',
       'month', 'to', 'a', 'couple', 'of', '$100.', '', 'you', 'should',
       'checkout', 'http://www.rackspace.com/', 'or', 'perhaps', 'amazon',
       'ec2', '', 'if', 'youre', 'running', 'something', 'big..', '',
       'to', 'unsubscribe', 'yourself', 'from', 'this', 'mailing',
       'list,', 'send', 'an', 'email', 'to:',
       'groupname-unsubscribe@egroups.com', '', ''], dtype='<U33')

# Normalize 

In [51]:
def normalize_tokens (tokens):
    '''
    Replace the variable parts of tokens with unified placeholder words.

    tokens: ndarray of str (the e-mail pattern only matches lower-case
            letters, so tokens are expected to be lower-cased already)
    return: ndarray of str with one entry per input token, where
            - html-like tags are removed,
            - every run of digits becomes "number",
            - a token starting with http:// or https:// becomes "httpaddr",
            - an e-mail address becomes "emailaddr",
            - "$" becomes "dollar",
            - any remaining punctuation is removed.
            Tokens that consist only of removed characters become ''.
    '''
    # Raw strings are used on purpose: the regex escapes (backslash-dot,
    # backslash-w, backslash-s) are invalid escape sequences in ordinary string
    # literals and make Python emit a warning.
    # The rules are applied in this order to every token; the order matters,
    # e.g. punctuation must be stripped last so urls and e-mails are still intact.
    rules = [
        (re.compile(r'<[^<]+?>'), ''),  # remove html and other tags
        (re.compile(r'[0-9]{1,100}'), 'number'),  # mark all numbers as "number"
        (re.compile(r'^https?://(.*)'), 'httpaddr'),  # mark all urls as "httpaddr"
        (re.compile(r'[a-z0-9+._-]+@[a-z0-9._-]+\.[a-z0-9_-]+'), 'emailaddr'),  # mark all emails as "emailaddr"
        (re.compile(r'[$]'), 'dollar'),  # replace $ with "dollar"
        (re.compile(r'[^\w\s]'), ''),  # drop every character that is neither word character nor whitespace
    ]

    def normalize(token):
        # run a single token through all rules
        for regex, replacement in rules:
            token = regex.sub(replacement, token)
        return token

    # dtype=str keeps a string array even when there are no tokens at all
    # (an empty list would otherwise become a float64 array)
    return np.array([normalize(token) for token in tokens], dtype=str)

In [52]:
# Normalize the tokens of the sample mail and display them.
tokens = normalize_tokens(tokens)
tokens

array(['', 'anyone', 'knows', 'how', 'much', 'it', 'costs', 'to', 'host',
       'a', 'web', 'portal', '', '', 'well', 'it', 'depends', 'on', 'how',
       'many', 'visitors', 'youre', 'expecting', 'this', 'can', 'be',
       'anywhere', 'from', 'less', 'than', 'number', 'bucks', 'a',
       'month', 'to', 'a', 'couple', 'of', 'dollarnumber', '', 'you',
       'should', 'checkout', 'httpaddr', 'or', 'perhaps', 'amazon',
       'ecnumber', '', 'if', 'youre', 'running', 'something', 'big', '',
       'to', 'unsubscribe', 'yourself', 'from', 'this', 'mailing', 'list',
       'send', 'an', 'email', 'to', 'emailaddr', '', ''], dtype='<U12')

# Remove zero length tokens

In [53]:
def filter_short_tokens (tokens):
    '''
    Drop the empty tokens and report the token count before and after.

    tokens: ndarray of str
    return: ndarray of the tokens whose length is > 0 (str)
    '''
    original_tokens_len = len(tokens)

    # positions of all tokens that are not the empty string
    non_empty = np.nonzero(tokens != '')
    tokens = tokens[non_empty]

    print (f'Original len = {original_tokens_len}\nRemaining len = {len(tokens)}')

    return tokens

In [54]:
# Remove the empty tokens of the sample mail and display what is left.
tokens = filter_short_tokens(tokens)
tokens

Original len = 69
Remaining len = 61


array(['anyone', 'knows', 'how', 'much', 'it', 'costs', 'to', 'host', 'a',
       'web', 'portal', 'well', 'it', 'depends', 'on', 'how', 'many',
       'visitors', 'youre', 'expecting', 'this', 'can', 'be', 'anywhere',
       'from', 'less', 'than', 'number', 'bucks', 'a', 'month', 'to', 'a',
       'couple', 'of', 'dollarnumber', 'you', 'should', 'checkout',
       'httpaddr', 'or', 'perhaps', 'amazon', 'ecnumber', 'if', 'youre',
       'running', 'something', 'big', 'to', 'unsubscribe', 'yourself',
       'from', 'this', 'mailing', 'list', 'send', 'an', 'email', 'to',
       'emailaddr'], dtype='<U12')

# Stem tokens

In [55]:
from nltk.stem import PorterStemmer

In [56]:
def stem_tokens(tokens):
    '''
    Replace every token by its stemmed form using PorterStemmer.stem().

    tokens: ndarray of str
    return: ndarray of stemmed tokens e.g. array(['anyon', 'know', 'how', 'much', 'it', 'cost', 'to', 'host', 'a',
       'web', 'portal', 'well', 'it', 'depend', 'on', 'how', 'mani']...
    '''
    stemmer = PorterStemmer()
    # stem token by token, then pack the results into a new array
    stemmed = list(map(stemmer.stem, tokens))
    return np.array(stemmed)

In [57]:
# Stem the tokens of the sample mail and display them.
tokens = stem_tokens(tokens)
tokens

array(['anyon', 'know', 'how', 'much', 'it', 'cost', 'to', 'host', 'a',
       'web', 'portal', 'well', 'it', 'depend', 'on', 'how', 'mani',
       'visitor', 'your', 'expect', 'thi', 'can', 'be', 'anywher', 'from',
       'less', 'than', 'number', 'buck', 'a', 'month', 'to', 'a', 'coupl',
       'of', 'dollarnumb', 'you', 'should', 'checkout', 'httpaddr', 'or',
       'perhap', 'amazon', 'ecnumb', 'if', 'your', 'run', 'someth', 'big',
       'to', 'unsubscrib', 'yourself', 'from', 'thi', 'mail', 'list',
       'send', 'an', 'email', 'to', 'emailaddr'], dtype='<U10')

# Vocabulary

In [58]:
def get_vocabulary(fn):
    '''
    Read the vocabulary file and return its words.

    fn: str - full path to file; the file is read with pd.read_table without a
        header row, the first column is an index and the second column the word
    return: ndarray of str e.g. array(['aa', 'ab', 'abil', ..., 'zdnet', 'zero', 'zip'], dtype=object)

    Also prints the number of words read.
    '''
    # keep_default_na=False: by default pandas treats words such as 'null',
    # 'nan' or 'na' as missing values and returns float NaN instead of the
    # text, which would silently corrupt those vocabulary entries.
    vocab_list = pd.read_table(fn, header=None, keep_default_na=False)
    vocab = np.array(vocab_list)[:,1] # first column is index, select only words column
    print(f'len(vocab) = {len(vocab):,}')
    return vocab

# Load the vocabulary from the data folder and display it.
fn = os.path.join(path , 'vocab.txt')
vocab = get_vocabulary(fn)
vocab

len(vocab) = 1,899


array(['aa', 'ab', 'abil', ..., 'zdnet', 'zero', 'zip'], dtype=object)

# Feature representation

In [59]:
def represent_features(tokens, vocab):
    '''
    Encode a mail as a binary word-presence vector over the vocabulary.

    tokens: ndarray of str - preprocessed tokens of the mail
    vocab: ndarray of str - list of considered words
    return: ndarray of binary values, one per vocabulary word:
            1 if word from vocabulary is in mail, 0 otherwise

    Also prints how many vocabulary words were found.
    '''
    # Collect the distinct tokens once: a set answers membership in constant
    # time, whereas `word in ndarray` rescans all tokens for every vocabulary word.
    present = set(np.asarray(tokens).ravel().tolist())

    # astype('str') makes sure every vocabulary entry is compared as text
    words = np.asarray(vocab).astype('str').tolist()
    tokens_represented = np.array([1 if word in present else 0 for word in words])

    print (f'{np.sum(tokens_represented)} word(s) from vocab are in the tokens.')

    return tokens_represented

In [60]:
# Encode the tokens of the sample mail against the vocabulary and display the vector.
tokens_represented = represent_features(tokens, vocab)
tokens_represented

44 word(s) from vocab are in the tokens.


array([0, 0, 0, ..., 0, 0, 0])

# Composing all steps of preprocessing

In [61]:
def preprocess (content, vocab):
    '''
    Run the complete preprocessing chain on the raw body of a mail.

    content: str - body of mail 
    vocab: ndarray of str - list of considered words 
    return: ndarray of binary values as produced by represent_features
    '''
    # Token-level stages, applied in exactly this order:
    # tokenize -> lower case -> normalize -> remove empty tokens -> stem
    stages = (
        word_tokenize,
        lower_case,
        normalize_tokens,
        filter_short_tokens,
        stem_tokens,
    )

    tokens = content
    for stage in stages:
        tokens = stage(tokens)

    # convert to binary array of features
    return represent_features(tokens, vocab)

In [62]:
# Run the whole preprocessing chain on the sample mail and display the feature vector.
preprocess(content, vocab)

Original len = 69
Remaining len = 61
44 word(s) from vocab are in the tokens.


array([0, 0, 0, ..., 0, 0, 0])

# Training and test sets

In [63]:
from scipy.io import loadmat

In [64]:
# Training set: features are stored under key 'X', labels under key 'y'.
fn = os.path.join(path , 'spamTrain.mat')

mat = loadmat(fn)
X_train = mat['X']
# ravel() flattens the labels to a 1-D array
y_train = mat['y'].ravel()

print(f'X_train.shape = {X_train.shape}')
print(f'y_train.shape = {y_train.shape}')

# Test set: same layout, but under the keys 'Xtest' and 'ytest'.
fn = os.path.join(path, 'spamTest.mat')
mat = loadmat(fn)
X_test = mat['Xtest']
y_test = mat['ytest'].ravel() 

print (f'X_test.shape = {X_test.shape}')
print (f'y_test.shape = {y_test.shape}')
# Show one training sample as a sanity check of the feature layout.
index = 0 
print (f'Sample with index = {index}: \n{X_train[index]}')

X_train.shape = (4000, 1899)
y_train.shape = (4000,)
X_test.shape = (1000, 1899)
y_test.shape = (1000,)
Sample with index = 0: 
[0 0 0 ... 0 0 0]


# Training the model

In [65]:
# Linear SVM on the spam features; C is passed to LinearSVC as its regularization parameter.
# NOTE(review): no random_state is passed to LinearSVC, so the scores may vary
# slightly between runs - confirm whether a fixed seed is wanted here.
C = .1
clf = LinearSVC(C=C)
clf.fit(X_train, y_train)
print(f'Score train = {clf.score(X_train,y_train)}')
print(f'Score test = {clf.score(X_test, y_test)}')

Score train = 0.99975
Score test = 0.992


# Determining most spam contributors

In [66]:
# Label the first row of the model coefficients with the vocabulary words,
# then take the 20 words with the largest coefficients.
# presumably a large positive coefficient pushes towards class 1 (spam) - verify against the label encoding
ser_coef = pd.Series(clf.coef_[0], index=vocab, name='Coef')
top_spam_contributors = ser_coef.sort_values(ascending=False).index[:20].tolist()

In [67]:
# Print the 20 words with the largest coefficients.
print(top_spam_contributors)

['our', 'remov', 'click', 'basenumb', 'guarante', 'visit', 'bodi', 'will', 'numberb', 'price', 'dollar', 'nbsp', 'below', 'lo', 'most', 'send', 'dollarnumb', 'credit', 'wi', 'hour']


# Use model for prediction

In [68]:
# Classify each of the four sample mails with the trained model.
for sfn in [ 'emailSample1.txt', 'emailSample2.txt', 'spamSample1.txt', 'spamSample2.txt']:
    fn =  os.path.join(path,sfn)    
    content = get_sample(fn)
    
    # YOUR_CODE.  Preprocess the sample and get prediction 0 or 1 (1 is spam)
    # START_CODE
    # the single feature vector is wrapped in a list so predict receives one row
    prediction = clf.predict([preprocess(content, vocab)])
    # END_CODE    
    
    # prediction[0] (0 or 1) selects the label from the tuple
    print ('{} is {}\n'.format(sfn, ('Not Spam','Spam')[prediction[0]]))

# After the loop `content` still holds the last sample read; print it between separator lines.
print ('Latter sample:\n{1}\n{0}\n{1}'.format(content, '='*50))

Original len = 69
Remaining len = 61
44 word(s) from vocab are in the tokens.
emailSample1.txt is Not Spam

Original len = 247
Remaining len = 222
122 word(s) from vocab are in the tokens.
emailSample2.txt is Not Spam

Original len = 141
Remaining len = 97
46 word(s) from vocab are in the tokens.
spamSample1.txt is Spam

Original len = 39
Remaining len = 31
18 word(s) from vocab are in the tokens.
spamSample2.txt is Spam

Latter sample:
Best Buy Viagra Generic Online

Viagra 100mg x 60 Pills $125, Free Pills & Reorder Discount, Top Selling 100% Quality & Satisfaction guaranteed!

We accept VISA, Master & E-Check Payments, 90000+ Satisfied Customers!
http://medphysitcstech.ru



