### Naive Bayes Classifier - Spam Mail Detection

In [54]:
# Import all the required libraries
import os
import re
import numpy
from glob import glob
from math import log

# File paths (glob patterns) for the training mails and test mails, plus the truth file.
# Mails whose number appears on a line of the truth file are treated as spam.
ham_path = "./ham/*.words"
spam_path = "./spam/*.words"
test_path = "./test/*.words"
truthPath = "./truthfile"

# Variable declaration and initialization
total_spam_words, total_ham_words = 0, 0
pred_spam_count, pred_ham_count = 0, 0
correct_pred_spam_count, correct_pred_ham_count = 0, 0

# Hyperparameters for the model
alpha = 0.001          # additive smoothing constant used by classify()
Vocab_size = 2000000   # vocabulary size used in the smoothing denominator


def count_words(path_pattern):
    """Count word occurrences over every mail file matching `path_pattern`.

    Each file is read as text and split on newlines, i.e. one word per line.
    Empty strings produced by blank lines or a trailing newline are not words
    and are skipped.

    Parameters
    ----------
    path_pattern : str
        Glob pattern selecting the mail files (e.g. "./spam/*.words").

    Returns
    -------
    tuple (dict, int)
        A dictionary mapping each word to its number of occurrences across
        all matched files, and the number of files that were read.
    """
    word_counts = {}
    mail_count = 0
    for mail_path in glob(path_pattern):
        with open(mail_path, 'r') as mail_file:
            words = mail_file.read().split("\n")
        mail_count += 1
        for word in words:
            if not word:
                # Blank line / trailing newline: not a word, do not count it.
                continue
            word_counts[word] = word_counts.get(word, 0) + 1
    return word_counts, mail_count


# Read each mail from the Spam folder and build the Spam dictionary
# (key: word, value: number of occurrences) together with the mail count.
spam_dict, spam_mail_count = count_words(spam_path)

# Read each mail from the Ham folder and build the Ham dictionary
# (key: word, value: number of occurrences) together with the mail count.
ham_dict, ham_mail_count = count_words(ham_path)
                
# Report how many training mails were found for each class
print('No. of Spam mail in training data : {}'.format(spam_mail_count))
print('No. of Ham mail in training data : {}'.format(ham_mail_count))


# Class priors: the fraction of training mails that belong to each class
training_mail_count = spam_mail_count + ham_mail_count
P_Spam = spam_mail_count / training_mail_count
P_Ham = ham_mail_count / training_mail_count
print('P(Spam) : {}'.format(P_Spam))
print('P(Ham) : {}'.format(P_Ham))


# Total number of word occurrences recorded in each dictionary, and the vocabulary size
spam_dict_count = sum(spam_dict.values())
ham_dict_count = sum(ham_dict.values())
print('Total no. word in spam dictionary : {}'.format(spam_dict_count))
print('Total no. word in ham dictionary : {}'.format(ham_dict_count))
print('Vocabulary Size : {}'.format(Vocab_size))


No. of Spam mail in training data : 500
No. of Ham mail in training data : 500
P(Spam) : 0.5
P(Ham) : 0.5
Total no. word in spam dictionary : 79588
Total no. word in ham dictionary : 82684
Vocabulary Size : 2000000


In [55]:
def classify(mail):
    """Classify a mail as 'Spam' or 'Ham' using the Naive Bayes decision rule.

    For each class the score is the log prior plus the sum, over every word
    of the mail, of log((count + alpha) / (class_total + alpha * Vocab_size)),
    where `count` is the word's occurrence count in that class's dictionary
    (0 if the word is unknown). Working in log space avoids multiplying many
    tiny probabilities together.

    Reads the module-level values spam_dict, ham_dict, spam_dict_count,
    ham_dict_count, alpha, Vocab_size, P_Spam and P_Ham.

    Parameters
    ----------
    mail : iterable of str
        The words of the mail. Any iterable works, including one-shot
        iterators, because the words are traversed only once.

    Returns
    -------
    str
        'Spam' if the spam score is strictly greater than the ham score,
        otherwise 'Ham' (ties are classified as 'Ham').
    """
    # The smoothed denominators do not depend on the word, so compute them once.
    spam_denominator = spam_dict_count + alpha * Vocab_size
    ham_denominator = ham_dict_count + alpha * Vocab_size

    prob_word_if_spam = 0
    prob_word_if_ham = 0

    # Single pass over the words: iterating `mail` twice would leave the second
    # loop empty for generators and other one-shot iterables.
    for word in mail:
        prob_word_if_spam += log((spam_dict.get(word, 0) + alpha) / spam_denominator)
        prob_word_if_ham += log((ham_dict.get(word, 0) + alpha) / ham_denominator)

    # Log posterior (up to a shared constant) of each class given the words
    Prob_Spam = log(P_Spam) + prob_word_if_spam
    Prob_Ham = log(P_Ham) + prob_word_if_ham

    # Compare both scores and predict the class
    if Prob_Spam > Prob_Ham:
        return 'Spam'
    return 'Ham'

In [56]:
# Evaluate each mail from the test folder as Spam or Ham using the classify() function.

# Reset the counters so that re-running this cell does not add to a previous run's totals.
pred_spam_count, pred_ham_count = 0, 0
correct_pred_spam_count, correct_pred_ham_count = 0, 0

# Mails whose number appears on a line of the truth file are spam. Read the file once
# (instead of once per test mail) and ignore blank lines; a missing truth file raises
# FileNotFoundError here rather than being silently skipped.
with open(truthPath, 'r') as truth_file:
    truth_file_cont = [line for line in truth_file.read().split("\n") if line]
spam_mail_numbers = set(truth_file_cont)

print("Mail No.  Predicted class   Actual class ")
for inFile in glob(test_path):
    # Extracting only the filename (no directory, no extension)
    mail_no = os.path.splitext(os.path.basename(inFile))[0]
    with open(inFile, 'r') as test_file:
        # One word per line; blank lines (e.g. a trailing newline) are not words.
        test_mail = [word for word in test_file.read().split("\n") if word]

    print(' '+mail_no, end="")

    # Call classify() to predict the class
    pred_class = classify(test_mail)
    if pred_class == 'Spam':
        pred_spam_count += 1
    else:
        pred_ham_count += 1

    print('            ' + pred_class + '           ',end="")

    # Determine whether the mail is actually Spam or Ham using the truth file
    if mail_no in spam_mail_numbers:
        print('  Spam  ')
        if pred_class == 'Spam':
            correct_pred_spam_count += 1
    else:
        print('  Ham  ')
        if pred_class == 'Ham':
            correct_pred_ham_count += 1

Mail No.  Predicted class   Actual class 
 89            Ham             Ham  
 74            Spam             Ham  
 31            Spam             Spam  
 49            Ham             Ham  
 90            Ham             Ham  
 28            Spam             Spam  
 50            Spam             Spam  
 15            Spam             Spam  
 9            Ham             Ham  
 100            Ham             Ham  
 52            Ham             Ham  
 17            Spam             Spam  
 92            Ham             Ham  
 76            Spam             Ham  
 33            Ham             Ham  
 72            Spam             Ham  
 37            Spam             Spam  
 56            Ham             Ham  
 13            Spam             Spam  
 96            Ham             Ham  
 69            Spam             Ham  
 94            Ham             Ham  
 54            Ham             Ham  
 11            Ham             Ham  
 70            Spam             Ham  
 35           

In [57]:
def _percentage(numerator, denominator):
    """Return numerator / denominator as a percentage, or 0.0 when the denominator is 0."""
    return numerator / denominator * 100 if denominator else 0.0


# Confusion-matrix cells, derived only from the counters filled during evaluation
# (Spam is the positive class).
TP = correct_pred_spam_count                    # spam predicted as spam
TN = correct_pred_ham_count                     # ham predicted as ham
FP = pred_spam_count - correct_pred_spam_count  # ham predicted as spam
FN = pred_ham_count - correct_pred_ham_count    # spam predicted as ham

# Actual class totals of the evaluated test mails. Deriving them from the counters
# (rather than from the length of the truth file) keeps them correct even when the
# truth file contains blank lines, duplicates or mails that are not in the test set.
actual_spam_count = TP + FN
actual_ham_count = TN + FP
total_test_mail = pred_spam_count + pred_ham_count
total = total_test_mail

print('Actual no. of Spam mail :'+ str(actual_spam_count))
print('Actual no. of Ham mail :'+ str(actual_ham_count))
print('Predicted no. of Spam mail :'+ str(pred_spam_count))
print('Predicted no. of Ham mail :'+ str(pred_ham_count))
print('Correctly predicted no. of Spam mail :'+ str(correct_pred_spam_count))
print('Correctly predicted no. of Ham mail :'+ str(correct_pred_ham_count)+'\n')


# Confusion matrix: rows are the actual class (Spam, Ham), columns the predicted class
confusion_matrix = numpy.array([[TP,FN],[FP,TN]])
print('confusion_matrix :')
print(confusion_matrix)

# Accuracy, Precision, Recall and F-score of the model on the test mails.
# A metric whose denominator is zero (e.g. no mail predicted as spam) is reported as 0.0
# instead of raising ZeroDivisionError.
Accuracy = _percentage(correct_pred_spam_count + correct_pred_ham_count, total_test_mail)
Precision_spam = round(_percentage(correct_pred_spam_count, pred_spam_count), 2)
Recall_spam = round(_percentage(correct_pred_spam_count, actual_spam_count), 2)
if Precision_spam + Recall_spam:
    F_score_spam = round(((2 * Precision_spam * Recall_spam) / (Precision_spam + Recall_spam)),2)
else:
    F_score_spam = 0.0

print ('\n'+ "Accuracy: ", str(Accuracy), 'percent')
print ("Precision: ", str(Precision_spam), 'percent')
print ("Recall: ", str(Recall_spam), 'percent')
print ("F-score: "+ str(F_score_spam))

Actual no. of Spam mail :37
Actual no. of Ham mail :63
Predicted no. of Spam mail :49
Predicted no. of Ham mail :51
Correctly predicted no. of Spam mail :36
Correctly predicted no. of Ham mail :50

confusion_matrix :
[[36  1]
 [13 50]]

Accuracy:  86.0 percent
Precision:  73.47 percent
Recall:  97.3 percent
F-score: 83.72
