In [1]:
#Importing required libraries
import numpy as np
import pandas as pd

In [2]:
#Load the two CSV splits: the training frame is used to fit the m-estimate
#Naive Bayes model and the test frame is used to measure its classification accuracy
train_data = pd.read_csv(filepath_or_buffer='training_dataset_NB-Mounik.csv')
test_data = pd.read_csv(filepath_or_buffer='test_dataset_NB-Mounik.csv')

In [3]:
#Creating function for performing m-estimate Naive Bayes and calculating the classification accuracy
#Function will take training_dataset and test_dataset as input and provide the classification accuracy as output
def nb_me_model(traindf,testdf):
    """Fit an m-estimate Naive Bayes classifier and report its test accuracy.

    The last column of each frame is used as the class label and every other
    column as a categorical feature. Only the two labels 'edible' and
    'poisonous' are modelled; a tie between them is resolved as 'poisonous'.

    For each feature, ``m`` is the number of distinct values observed in the
    training data and ``p = 1/m`` (uniform prior), so the smoothed likelihood
    of a value given a class is ``(count + m*p) / (class_count + m)``. A test
    value that never occurred in training is scored with a count of zero
    instead of raising ``KeyError``.

    Scores are accumulated as sums of log-probabilities rather than products
    of probabilities, so a large number of features cannot underflow to 0.0.

    Parameters
    ----------
    traindf : pandas.DataFrame
        Training data; features first, class label in the last column.
    testdf : pandas.DataFrame
        Test data with the same layout. Feature columns are matched to the
        training columns by name. Any row index is accepted.

    Returns
    -------
    float
        Percentage (0-100) of test rows whose predicted label equals the
        label in the last column of ``testdf``. The same value is printed.

    Raises
    ------
    ValueError
        If ``traindf`` or ``testdf`` has no rows.
    KeyError
        If ``testdf`` has a feature column whose name is not in ``traindf``.
    """
    class_labels = ('edible', 'poisonous')

    #Fail early with a clear message instead of an opaque ZeroDivisionError later on
    if len(traindf) == 0:
        raise ValueError('traindf must contain at least one row')
    if len(testdf) == 0:
        raise ValueError('testdf must contain at least one row')

    #Splitting both datasets into features and class labels (class label is the last column)
    train_y = traindf.iloc[:,-1]
    testdf_x = testdf.iloc[:,:-1]
    testdf_y = testdf.iloc[:,-1]

    #Positional boolean masks per class, so row selection does not depend on the frame's index
    class_mask = {label: (train_y == label).to_numpy() for label in class_labels}
    class_count = {label: int(class_mask[label].sum()) for label in class_labels}

    #Log prior of each class; a class that is absent from the training data gets -inf
    #(the equivalent of a prior probability of 0) without triggering a log(0) warning
    log_prior = {
        label: np.log(class_count[label]/len(traindf)) if class_count[label] else -np.inf
        for label in class_labels
    }

    #log_likelihood[label][feature][value] -> log P(value | label) with m-estimate smoothing
    #log_unseen[label][feature]            -> same quantity for a value never seen in training
    log_likelihood = {label: {} for label in class_labels}
    log_unseen = {label: {} for label in class_labels}

    for col in traindf.columns[:-1]:

        #m = number of distinct values of the feature, p = uniform prior over those values
        featval = traindf[col].unique()
        m = len(featval)
        p = 1/m

        for label in class_labels:
            #One pass per class gives the count of every value, instead of
            #re-filtering the whole training frame for each (value, class) pair
            seen = traindf.loc[class_mask[label], col].value_counts().to_dict()
            denominator = class_count[label] + m
            log_likelihood[label][col] = {
                value: np.log((seen.get(value, 0) + (m*p))/denominator)
                for value in featval
            }
            log_unseen[label][col] = np.log((m*p)/denominator)

    #Classifying every test row by comparing the two log posterior scores
    feature_names = list(testdf_x.columns)
    classification = []
    for row_values in testdf_x.itertuples(index=False, name=None):
        score = dict(log_prior)
        for feature, value in zip(feature_names, row_values):
            for label in class_labels:
                score[label] += log_likelihood[label][feature].get(value, log_unseen[label][feature])

        #Strict '>' keeps the original tie-breaking rule: ties are labelled 'poisonous'
        if score['edible'] > score['poisonous']:
            classification.append('edible')
        else:
            classification.append('poisonous')

    #Comparing predictions with the true labels positionally (iterating a Series yields
    #its values in row order, so a non-default index cannot misalign the comparison)
    count_correct_class = sum(
        1 for predicted, actual in zip(classification, testdf_y) if predicted == actual
    )

    #Printing and returning the overall accuracy as a percentage of the test rows
    accuracy = (count_correct_class/len(testdf_y))*100
    print('Overall Accuracy of m-estimate Naive Bayes model:', accuracy)
    return accuracy

In [4]:
#Fit the m-estimate Naive Bayes model on the training split and report its overall accuracy on the test split
#The trailing semicolon keeps the cell output limited to the line the function prints
nb_me_model(traindf=train_data, testdf=test_data);

Overall Accuracy of m-estimate Naive Bayes model: 94.0
