### Naive Bayes Classifier

Implementation of Naive Bayes classifier on the iris dataset from <a href="https://archive.ics.uci.edu/ml/datasets/iris">UCI Repository</a>. 

In [10]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
df = pd.read_csv("...\\iris.csv")
df =  shuffle(df, random_state=42)

In [11]:
def calculate_class_prior(train_set):
    split_by_class = train_set.groupby('iris')
    prior_dic = {}
    for class_name, split_data in split_by_class:
        prior_dic[class_name] = len(split_data)/len(train_set)
    return prior_dic

In [12]:
def likelihood(train_set, feature_name):
    conditional_split_data = train_set.groupby('iris')[feature_name].value_counts()
    conditional_split_dic = conditional_split_data.to_dict()
    conditional_split_probs = {}
    #calculate the probability of each class based on condition: variable == feature_name
    for i in conditional_split_dic.keys(): 
        conditional_split_probs[i] = conditional_split_dic[i]/len(train_set)
    return conditional_split_probs, conditional_split_dic #returns

In [13]:
def multiplyList(myList) :  
    result = 1
    for x in myList: 
         result = result * x  
    return result

In [14]:
def prediction(train_Set, test_Set):
    import numpy as np
    import operator
    predicted = []
    prior_dict = calculate_class_prior(train_Set)
    numpy_data = test_Set.values
    for x in np.nditer(numpy_data, flags = ['external_loop'], op_flags = ['readwrite'] ,order = 'C'):
        posterior = {}
#         print(x)
        for i in range(1, 4):   #itertate over classes
#             print(i)
            probs = []
            for j in range(4):
                feature_name = test_Set.columns[j]
                feature_value = x[j]
                dict_tuple = (i, feature_value) ## a tuple ==> (class,number of samples in the feature with class i)
#                 print(dict_tuple)
                likelihood_dict, _ = likelihood(train_Set, feature_name)
                if dict_tuple in likelihood_dict.keys():
                    probs.append(likelihood_dict[dict_tuple])
                else:
                    probs.append(0)

#             print(probs)
#             print(multiplyList(probs))
            posterior[i] = multiplyList(probs) * prior_dict[i] #multiply prior and likelihood to get posetrior

#         print(posterior)
        argmax = max(posterior, key=posterior.get)
#         print("CLASSS IS: ", argmax)
        predicted.append(argmax)
    actual_labels = numpy_data[:, -1:]
    actual_labels = np.reshape(actual_labels, (1,-1))
    return predicted, actual_labels.tolist()



In [15]:
def calculate_accuracy(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

In [16]:
# test_data = df.iloc[105:, :]
# train_data = df.iloc[:105, :]

In [33]:
def split_train_test(dataframe, split_ratio=0.7):
    index = round(len(dataframe) * split_ratio)
    train_data = dataframe.iloc[:index, :]
    test_data = dataframe.iloc[index:, :]
    return train_data, test_data


In [34]:
predictions_of_algorithm, actual  = prediction(train_data, test_data)

In [35]:
calculate_accuracy(actual[0],predictions_of_algorithm)

53.333333333333336

low accuracy is an indication that Naive Bayes classifier is not an ideal candidate for non-categorical variables as the probability of zero frequency problem increases in non-categorical datasets. 