### Naive bayes classifier with smoothing

An implementation of Naive Bayes classifier with smoothing to solve the zero frequency problem. <br>
The classifier is applied on the iris data set from the <a href="https://archive.ics.uci.edu/ml/datasets/iris">UCI Repository</a>. 

In [38]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

In [39]:
#grouping data points with respect to their classes to calculate the prior possibilities for each class. 
def calculate_class_prior(train_set):
    split_by_class = train_set.groupby('iris')
    prior_dic = {}
    for class_name, split_data in split_by_class:
        prior_dic[class_name] = len(split_data)/len(train_set)
    return prior_dic

In [40]:
def likelihood(train_set, feature_name):
    conditional_split_data = train_set.groupby('iris')[feature_name].value_counts()
    conditional_split_dic = conditional_split_data.to_dict()
    return  conditional_split_dic

In [43]:
def likelihood_with_smoothing(spit_dictionary):

    conditional_split_probs = {}
    for i in spit_dictionary.keys():
        k = spit_dictionary[i]
        conditional_split_probs[i] = spit_dictionary[i] + (4 * (1/k)) /105 + 4  ## smoothing possibilities, 4 is the number 
                                                                                ## of features, (1/k) is 1/number of classes
    return conditional_split_probs

In [46]:
def multiplyList(myList) : 
      
    # Multiply elements one by one 
    result = 1
    for x in myList: 
         result = result * x  
    return result

In [47]:
def prediction(train_Set, test_Set):
    import numpy as np
    import operator
    predicted = []
    prior_dict = calculate_class_prior(train_Set)
    numpy_data = test_Set.values
    for x in np.nditer(numpy_data, flags = ['external_loop'], op_flags = ['readwrite'] ,order = 'C'):
        posterior = {}
#         print(x)
        for i in range(1, 4):   #itertate over classes
#             print(i)
            probs = []
            for j in range(4):
                feature_name = test_Set.columns[j]
                feature_value = x[j]
                dict_tuple = (i, feature_value)
#                 print(dict_tuple)
                likelihood_dic_for_smooth = likelihood(train_Set, feature_name)
                likelihood_dict = likelihood_with_smoothing(likelihood_dic_for_smooth)
                if dict_tuple in likelihood_dict.keys():
                    probs.append(likelihood_dict[dict_tuple])
                else:
                    probs.append(0)

#             print(probs)
#             print(multiplyList(probs))
            posterior[i] = multiplyList(probs) * prior_dict[i]

#         print(posterior)
        argmax = max(posterior, key=posterior.get)
#         print("CLASSS IS: ", argmax)
        predicted.append(argmax)
    actual_labels = numpy_data[:, -1:]
    actual_labels = np.reshape(actual_labels, (1,-1))
    return predicted, actual_labels.tolist()



In [48]:
def calculate_accuracy(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

In [49]:
def split_train_test(dataframe, split_ratio=0.7):
    index = round(len(dataframe) * split_ratio)
    train_data = dataframe.iloc[:index, :]
    test_data = dataframe.iloc[index:, :]
    return train_data, test_data


In [54]:
def main():
    df = pd.read_csv("...\\iris.csv")
    df =  shuffle(df, random_state=55)
    train_data, test_data = split_train_test(df)
    predictions_of_algorithm, actual  = prediction(train_data, test_data)
    accuracy = calculate_accuracy(actual[0],predictions_of_algorithm)
    print('accuracy of algorithm is: ', accuracy)

In [55]:
main()

accuracy of algorithm is:  73.33333333333333


Using smoothing process is a solution to zero frequency problem in Naive Bayes process. zero frequency problem occures when the categorical variable is not observed in the training set.  <br>
Smoothing increases Accuracy measure in comparison to previous implementation of Naive Bayes algorithm on the same dataset, 