In [2]:
import os
import numpy as np
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix


In [3]:
#Dictionary can be seen by the command print dictionary. You may find some absurd word counts to be high but it’s just a dictionary and you always have the scope of  improving it later.
def make_Dictionary(train_dir):
    emails = [os.path.join(train_dir,f) for f in os.listdir(train_dir)]    
    all_words = []       
    for mail in emails:    
        with open(mail) as m:
            for i,line in enumerate(m):
                if i == 2:
                    words = line.split()
                    all_words += words
    
    dictionary = Counter(all_words)
    
    list_to_remove = list(dictionary.keys())
    for item in list_to_remove:
        if item.isalpha() == False: 
            del dictionary[item]
        elif len(item) == 1:
            del dictionary[item]
    dictionary = dictionary.most_common(3000)
    return dictionary
    

In [4]:
#Each word count vector contains the frequency of words in the training file. 
def extract_features(mail_dir): 
    files = [os.path.join(mail_dir,fi) for fi in os.listdir(mail_dir)]
    features_matrix = np.zeros((len(files),3000))
    docID = 0;
    for fil in files:
      with open(fil) as fi:
        for i,line in enumerate(fi):
          if i == 2:
            words = line.split()
            for word in words:
              wordID = 0
              for i,d in enumerate(dictionary):
                if d[0] == word:
                  wordID = i
                  features_matrix[docID,wordID] = words.count(word)
        docID = docID + 1     
    return features_matrix

In [5]:
# Create a dictionary of words with its frequency
# directory 
train_dir = 'E:\\data\\ling-spam\\train-mails'
dictionary = make_Dictionary(train_dir)

# Prepare feature vectors per training mail and its labels



In [6]:
train_labels = np.zeros(702)
train_labels[351:701] = 1
train_matrix = extract_features(train_dir)

In [7]:
train_matrix


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [8]:
train_labels

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

Naive Bayes classifier is a conventional and very popular method for document classification problem. It is a supervised probabilistic classifier based on Bayes theorem assuming independence between every pair of features. 

SVMs are supervised binary classifiers which are very effective when you have higher number of features. The goal of SVM is to separate some subset of training data from rest called the support vectors (boundary of separating hyper-plane). The decision function of SVM model that predicts the class of the test data is based on support vectors and makes use of a kernel trick.

In [9]:

# Training SVM and Naive bayes classifier and its variants

model1 = LinearSVC()

model1.fit(train_matrix,train_labels)

# Test the unseen mails for Spam





LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [10]:
model2 = MultinomialNB()
model2.fit(train_matrix,train_labels)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Once the classifiers are trained, we can check the performance of the models on test-set. We extract word count vector for each mail in test-set and predict its class(ham or spam) with the trained NB classifier and SVM model.

In [13]:
test_dir = 'E:\\data\\ling-spam\\test'
test_matrix = extract_features(test_dir)
test_labels = np.zeros(360)
test_labels[230:360] = 1

result1 = model1.predict(test_matrix)
result2 = model2.predict(test_matrix)


In [14]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(train_matrix,train_labels)
# Output of the score is the accuracy of the prediction
# Accuracy: 0.995
x3=clf.score(train_matrix,train_labels)
print('Accuracy:',x3)


Accuracy: 0.9814814814814815


In [15]:
x1=model1.score(train_matrix,train_labels)
x2=model2.score(train_matrix,train_labels)
x3=clf.score(train_matrix,train_labels)
print('Linear SVC model- ',x1)
print('Multinomial Naive Bayes- ',x2)
print('Gaussian Naive Bayes- ',x3)

Linear SVC model-  1.0
Multinomial Naive Bayes-  0.9886039886039886
Gaussian Naive Bayes-  0.9814814814814815


In [16]:
x1=model1.score(test_matrix,test_labels)
x2=model2.score(test_matrix,test_labels)
x3=clf.score(test_matrix,test_labels)
print('Accuracy of Models')
print('Linear SVC model- ',x1)
print('Multinomial Naive Bayes- ',x2)
print('Gaussian Naive Bayes- ',x3)

Accuracy of Models
Linear SVC model-  0.9722222222222222
Multinomial Naive Bayes-  0.9722222222222222
Gaussian Naive Bayes-  0.9666666666666667


The diagonal elements represents the correctly identified(a.k.a. true identification) mails where as non-diagonal elements represents wrong classification (false identification) of mails.

In [17]:
result3 =clf.predict(test_matrix)
print('Confusion Matrix of Models')
print('Linear SVC model- \n',confusion_matrix(test_labels,result1))
print('Multinomial Naive Bayes- \n',confusion_matrix(test_labels,result2))
print('Gaussian Naive Bayes- \n',confusion_matrix(test_labels,result3))

Confusion Matrix of Models
Linear SVC model- 
 [[226   4]
 [  6 124]]
Multinomial Naive Bayes- 
 [[229   1]
 [  9 121]]
Gaussian Naive Bayes- 
 [[228   2]
 [ 10 120]]


In [18]:
from sklearn.metrics import average_precision_score
score1 = model1.decision_function(test_matrix)
average_precision1 = average_precision_score(test_labels, score1)


print('Average precision-recall score')
print('Linear SVC model : {0:0.2f}%'.format(
      average_precision1))


Average precision-recall score
Linear SVC model : 0.99%


In [19]:
import nltk.metrics

In [20]:
cfl=confusion_matrix(test_labels,result1)
cfm=confusion_matrix(test_labels,result2)
cfg=confusion_matrix(test_labels,result3)

In [21]:
def prec(x):
    return x[1][1]/(x[1][1]+x[0][1])
def rec(x):
    return x[1][1]/(x[1][1]+x[1][0])

In [22]:
print('Precision-Recall score')
print('Linear SVC model : Precision- {0:0.2f}%'.format(
      prec(cfl)*100),' Recall- {0:0.2f}%'.format(
      rec(cfl)*100))
print('Multinomial Naive Bayes model : Precision- {0:0.2f}%'.format(
      prec(cfm)*100),' Recall- {0:0.2f}%'.format(
      rec(cfm)*100))
print('Gaussian Naive Bayes model : Precision- {0:0.2f}%'.format(
      prec(cfg)*100),' Recall- {0:0.2f}%'.format(
      rec(cfg)*100))

Precision-Recall score
Linear SVC model : Precision- 96.88%  Recall- 95.38%
Multinomial Naive Bayes model : Precision- 99.18%  Recall- 93.08%
Gaussian Naive Bayes model : Precision- 98.36%  Recall- 92.31%


Now comparison

In [23]:
#Evaluate the model on the training data set
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
print('Linear SVC model')
print(classification_report(test_labels,result1 ))
print('Multinomial Naive Bayes')
print(classification_report(test_labels,result2 ))
print('Gaussian Naive Bayes')
print(classification_report(test_labels,result3 ))


Linear SVC model
              precision    recall  f1-score   support

         0.0       0.97      0.98      0.98       230
         1.0       0.97      0.95      0.96       130

   micro avg       0.97      0.97      0.97       360
   macro avg       0.97      0.97      0.97       360
weighted avg       0.97      0.97      0.97       360

Multinomial Naive Bayes
              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98       230
         1.0       0.99      0.93      0.96       130

   micro avg       0.97      0.97      0.97       360
   macro avg       0.98      0.96      0.97       360
weighted avg       0.97      0.97      0.97       360

Gaussian Naive Bayes
              precision    recall  f1-score   support

         0.0       0.96      0.99      0.97       230
         1.0       0.98      0.92      0.95       130

   micro avg       0.97      0.97      0.97       360
   macro avg       0.97      0.96      0.96       360
weighted avg