In [1]:
import os
from collections import Counter

import numpy as np
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

In [2]:
def make_Dictionary(root_dir, vocab_size=3000):
    """Build the vocabulary of the most frequent words found under ``root_dir``.

    Every entry of ``root_dir`` is opened as a text file and each line is
    split on whitespace.  Tokens that are not purely alphabetic
    (``str.isalpha``) or that are a single character long are discarded.

    Parameters
    ----------
    root_dir : str
        Directory whose entries are all read as text files.
    vocab_size : int, optional
        Maximum number of words to keep.  Defaults to 3000, the value that
        was previously hard-coded.

    Returns
    -------
    list of (str, int)
        ``(word, count)`` pairs ordered from most to least frequent, at most
        ``vocab_size`` entries long.
    """
    word_counts = Counter()
    # os.listdir returns entries in arbitrary order; sorting makes the
    # first-seen order of words (which breaks ties in most_common) the same
    # on every run and every machine.
    for name in sorted(os.listdir(root_dir)):
        with open(os.path.join(root_dir, name)) as mail:
            for line in mail:
                # Count line by line instead of first collecting every token
                # of the corpus in one big list.
                word_counts.update(line.split())

    # Building a new Counter avoids deleting keys while walking a snapshot
    # and keeps the first-seen order of the surviving words.
    kept = Counter(
        {word: n for word, n in word_counts.items() if word.isalpha() and len(word) > 1}
    )
    return kept.most_common(vocab_size)

In [3]:
def extract_features(mail_dir, vocabulary=None):
    """Turn every mail in ``mail_dir`` into a row of word counts plus a label.

    Only the third line of each file (line index 2) is tokenised; every
    other line is ignored, and files with fewer than three lines yield an
    all-zero row.
    NOTE(review): presumably the corpus stores the message body on that
    line -- confirm against the data set.

    Parameters
    ----------
    mail_dir : str
        Directory whose entries are all opened as text files.
    vocabulary : sequence of (word, count) pairs, optional
        Vocabulary in the format returned by ``make_Dictionary``; the
        position of a pair is the matrix column of its word.  When omitted,
        the module-level ``dictionary`` is used, as before.

    Returns
    -------
    features_matrix : numpy.ndarray of int, shape (n_files, n_columns)
        ``features_matrix[r, c]`` is how often vocabulary word ``c`` occurs
        on the third line of file ``r``.  ``n_columns`` is 3000, or
        ``len(vocabulary)`` if that is larger.
    train_labels : numpy.ndarray of float, shape (n_files,)
        1 for files whose name contains ``"spmsg"``, otherwise 0.
    """
    if vocabulary is None:
        # Backward-compatible fallback to the notebook-level global.
        vocabulary = dictionary

    # One hash lookup per word instead of scanning the whole vocabulary for
    # every token.  Words are unique when the vocabulary comes from a Counter.
    word_to_column = {entry[0]: column for column, entry in enumerate(vocabulary)}

    # Sorted so the row order does not depend on the arbitrary order in
    # which os.listdir happens to return the directory entries.
    files = [os.path.join(mail_dir, name) for name in sorted(os.listdir(mail_dir))]

    # Keep the historical width of 3000 columns, but never index past the
    # matrix when a larger vocabulary is supplied.
    n_columns = max(3000, len(vocabulary))
    features_matrix = np.zeros((len(files), n_columns), dtype=int)
    train_labels = np.zeros(len(files))

    for doc_id, path in enumerate(files):
        with open(path) as handle:
            for line_no, line in enumerate(handle):
                if line_no == 2:
                    for word, occurrences in Counter(line.split()).items():
                        column = word_to_column.get(word)
                        if column is not None:
                            features_matrix[doc_id, column] = occurrences
                    break  # nothing after the third line is used
        # basename handles whatever separator os.path.join produced, so a
        # directory name can never be mistaken for the file name.
        if "spmsg" in os.path.basename(path):
            train_labels[doc_id] = 1
    return features_matrix, train_labels

In [4]:
# Dataset locations, relative to the notebook's working directory.
TRAIN_DIR, TEST_DIR = "../data/raw/train-mails", "../data/raw/test-mails"

# The vocabulary is built from the training mails only; extract_features
# reads this module-level `dictionary` when it counts words.
dictionary = make_Dictionary(TRAIN_DIR)

In [5]:
print("reading and processing emails from file.")
# Same extraction for the training split first, then the test split.
(features_matrix, labels), (test_feature_matrix, test_labels) = [
    extract_features(mail_dir) for mail_dir in (TRAIN_DIR, TEST_DIR)
]

reading and processing emails from file.


**scikit-learn provides three Naive Bayes variants for model training:**

### Gaussian

**It is used in classification and it assumes that features follow a normal distribution.**

In [6]:
print("Training model.")

# The fit call stays last so that its return value (the fitted estimator)
# remains the cell's displayed result.
model = GaussianNB()
model.fit(features_matrix, labels)

Training model.


GaussianNB(priors=None)

In [62]:
# Classify the test mails with the most recently fitted `model`.
predicted_labels = model.predict(test_feature_matrix)

# One print call, newline-separated, emits the same two output lines.
print(
    "FINISHED classifying. accuracy score : ",
    accuracy_score(test_labels, predicted_labels),
    sep="\n",
)

FINISHED classifying. accuracy score : 
0.9615384615384616


In [8]:
# Inspect the training labels returned by extract_features: 1 marks a file
# whose name contains "spmsg", 0 anything else.
# NOTE(review): this echoes the whole array; a slice or per-class counts
# would be a more compact sanity check.
labels

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

### Multinomial: 

It is used for discrete counts. In text classification, for example, it goes one step beyond the Bernoulli model: instead of recording only whether a word occurs in a document, each feature records how often the word occurs in the document. You can think of a feature as “the number of times outcome $x_i$ is observed over the $n$ trials”.

In [63]:
print("Training model.")

# Rebinds `model`; the fit call stays last so the fitted estimator is the
# cell's displayed result.
model = MultinomialNB()
model.fit(features_matrix, labels)

Training model.


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [64]:
# Classify the test mails with the most recently fitted `model`.
predicted_labels = model.predict(test_feature_matrix)

# One print call, newline-separated, emits the same two output lines.
print(
    "FINISHED classifying. accuracy score : ",
    accuracy_score(test_labels, predicted_labels),
    sep="\n",
)

FINISHED classifying. accuracy score : 
0.9615384615384616


### Bernoulli:

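The Bernoulli model is useful if your feature vectors are binary (i.e. zeros and ones). One application is text classification with a ‘bag of words’ model, where the 1s and 0s mean “word occurs in the document” and “word does not occur in the document” respectively. The feature matrix used here holds word counts rather than 0/1 values; with the default `binarize=0.0` shown in the output below, any count greater than zero is treated as 1.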

In [65]:
print("Training model.")

# Rebinds `model`; the fit call stays last so the fitted estimator is the
# cell's displayed result.
model = BernoulliNB()
model.fit(features_matrix, labels)

Training model.


BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [66]:
# Classify the test mails with the most recently fitted `model`.
predicted_labels = model.predict(test_feature_matrix)

# One print call, newline-separated, emits the same two output lines.
print(
    "FINISHED classifying. accuracy score : ",
    accuracy_score(test_labels, predicted_labels),
    sep="\n",
)

FINISHED classifying. accuracy score : 
0.7730769230769231


### SVM 


In [67]:
# `svm` is imported in the notebook's import cell together with the other
# scikit-learn names, so all dependencies are visible in one place.
# Support-vector classifier with scikit-learn's default settings, fitted
# and scored on the same matrices as the Naive Bayes models above.
model = svm.SVC()
print("Training model.")
model.fit(features_matrix, labels)
predicted_labels = model.predict(test_feature_matrix)
print("FINISHED classifying. accuracy score : ")
print(accuracy_score(test_labels, predicted_labels))

Training model.
FINISHED classifying. accuracy score : 
0.8153846153846154


In [68]:
# NOTE(review): leftover scratch expression; it only echoes the string and
# plays no part in the analysis -- candidate for deletion.
'sd'

'sd'