In [3]:
from pathlib import Path
import random
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
import glob
import os

In [4]:
easy_ham = []
hard_ham = []
spam = []

def get_files(dir):
    files = []
    for filename in glob.glob(dir+'\*'):
        try:
            with open(os.path.join(os.getcwd(), filename), 'r') as f:
                files.append(f.read())
        except:
            print("Invalid file from {}, encoding error".format(dir))
            
    print('{} files found in {}'.format(len(files), dir))
    return np.array(files)
            
x_easy_ham = get_files('easy_ham')
x_hard_ham = get_files('hard_ham')
x_spam = get_files('spam')

2551 files found in easy_ham
250 files found in hard_ham
Invalid file from spam, encoding error
Invalid file from spam, encoding error
Invalid file from spam, encoding error
Invalid file from spam, encoding error
Invalid file from spam, encoding error
496 files found in spam


In [None]:
def remove_header(mail):
    # Filter all msg before Cc: if present
    mail = mail.split('Cc:')
    if (len(mail) == 1): # removed header did not worked, continuying the filtering
        pass
    else:
        mail = mail[1]
        
    print(len(mail))
    print(mail)

In [None]:
# Prepocessing data : filter header
import re
str_ = x_easy_ham[0]
remove_header(str_)


In [5]:
x_ham = np.concatenate((x_easy_ham, x_hard_ham))
y_ham = np.ones(len(x_ham))
y_spam = np.ones(len(x_spam)) -2

# Run program on easy ham vs spam
x_ham_spam = np.concatenate((x_ham, x_spam))
y_ham_spam = np.concatenate((y_ham, y_spam))

x_easy_spam = np.concatenate((x_easy_ham, x_spam))
x_hard_spam = np.concatenate((x_hard_ham, x_spam))

x_train, x_test, y_train, y_test = train_test_split(x_ham_spam, y_ham_spam, test_size=0.25, random_state=16)

# Run program on hard ham vs spam
#hamtrain, hamtest = train_test_split(x_easy_ham, y_ham, test_size=0.3)
#spamtrain, spamtest = train_test_split(x_spam, y_spam, test_size=0.3)


In [None]:
x_train

In [6]:
# Vectorization

# vectorize the documents based on the vocabulary given by the ham and pam
vectorizer = CountVectorizer()

# tokenize
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)                                   


In [7]:
# Trainning on ham easy and spam and hyper optimization

param_grid_nb_multi = {
    'alpha': [1,0.1,0.01,0.001,0.0001,0]
}
param_grid_nb_bernoulli = {
    'alpha': [1,0.1,0.01,0.001,0.0001,0],
    'binarize': [1, 0.8, 0.5, 0.3, 0.1, 0.001]
}

print("[*] Starting hyperoptimization for Multinomial ..")
nbMultinomial_grid = GridSearchCV(estimator=MultinomialNB(), param_grid=param_grid_nb_multi, verbose=1, cv=10, n_jobs=-1).fit(x_train_vec, y_train)
print("[!] Best estimator {} ".format(nbMultinomial_grid.best_estimator_))

print("[*] Starting hyperoptimization for Bernoulli ..")
nbBernoulli_grid = GridSearchCV(estimator=BernoulliNB(), param_grid=param_grid_nb_bernoulli, verbose=1, cv=10, n_jobs=-1).fit(x_train_vec, y_train)
print("[!] Best estimator {} ".format(nbBernoulli_grid.best_estimator_))

multinomial_nb = nbMultinomial_grid.best_estimator_
bernoulli_nb = nbBernoulli_grid.best_estimator_



[*] Starting hyperoptimization for Multinomial ..
Fitting 10 folds for each of 6 candidates, totalling 60 fits
[!] Best estimator MultinomialNB(alpha=0.1) 
[*] Starting hyperoptimization for Bernoulli ..
Fitting 10 folds for each of 36 candidates, totalling 360 fits
[!] Best estimator BernoulliNB(alpha=0.001, binarize=1) 


In [None]:
multinomial_nb = MultinomialNB().fit(x_train_vec, y_train)
bernoulli_nb = BernoulliNB(binarize=1).fit(x_train_vec, y_train)

In [8]:
def get_accuracy(model, x_test_vec_, y_test_):
    acc = 0
    for i in range(0, x_test_vec_.shape[0]):
        sample = x_test_vec_[i, :]
        pred = model.predict(sample)
        if pred == y_test_[i]:
            acc+=1/x_test_vec_.shape[0]
    return acc

In [11]:
def get_info(model, x_test_vec_, y_test_):
    y_pred = []
    for i in range(0, x_test_vec_.shape[0]):
        sample = x_test_vec_[i, :]
        pred = model.predict(sample)
        y_pred.append(pred)
    y_pred = np.array(y_pred)
    print(confusion_matrix(y_test_, y_pred), ": is the confusion matrix")
    print(round(accuracy_score(y_test_, y_pred),4), ": is the accuracy score")
    print(round(precision_score(y_test_, y_pred), 4), ": is the precision score")
    print(round(recall_score(y_test_, y_pred), 4), ": is the recall score")
    print(round(f1_score(y_test_, y_pred), 4), ": is the f1 score") # f1 is a mix of accuracy and recall
    

In [12]:
#i = 40
#sample = x_test_vec[i, :]
#pred = multinomial_nb.predict(sample)

#acc = get_accuracy(multinomial_nb, x_test_vec, y_test)
#acc_bernoulli = get_accuracy(bernoulli_nb, x_test_vec, y_test)

print("Accuracy for multinomal All-Ham vs Spam ")
get_info(multinomial_nb, x_test_vec, y_test)
print("\n\nAccuracy for bernoulli All-Ham vs Spam")
get_info(bernoulli_nb, x_test_vec, y_test)


Accuracy for multinomal All-Ham vs Spam 
[[117   4]
 [  3 701]] : is the confusion matrix
0.9915 : is the accuracy score
0.9943 : is the precision score
0.9957 : is the recall score
0.995 : is the f1 score


Accuracy for bernoulli All-Ham vs Spam
[[116   5]
 [  1 703]] : is the confusion matrix
0.9927 : is the accuracy score
0.9929 : is the precision score
0.9986 : is the recall score
0.9958 : is the f1 score


In [None]:

print("Accuracy for multinomal All-Ham vs Spam ")
get_info(multinomial_nb, x_test_vec, y_test)
print("\n\nAccuracy for bernoulli All-Ham vs Spam")
get_info(bernoulli_nb, x_test_vec, y_test)

In [None]:
# 4 