In [1]:
#importing python libraries
from sklearn import datasets
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report , confusion_matrix
from sklearn.metrics import accuracy_score

# IMPLEMENTING NAIVE BAYES FROM SCRATCH


In [2]:
#fit function
#we made a 3 level dictionary of dictionaries
#in the first level we stored the class
#in the second , we stored the feature
#in the third , we stored the possible values of feature ,  and corresponding count of those features
def fit(X_train , Y_train):
    res = {}
    classes = set(Y_train)
    for ai in classes:
        res[ai] = {}
        true_vals = (Y_train == ai)
        res['total_data'] = len(Y_train)
        X_train_class = X_train[true_vals]
        Y_train_class = Y_train[true_vals]
        res[ai]['cnt'] = len(Y_train_class)
        num_features = X_train.shape[1]
        for j in range(1 , num_features + 1):
            res[ai][j] = {}
            distinct_values = set(X_train[: , j - 1])
            for val in distinct_values:
                res[ai][j][val]  = (X_train_class[: , j - 1] == val).sum()
    return res

In [3]:
#calculating probability for one point

def probability(res , x , class_value):
    output = np.log(res[class_value]['cnt'])  -  np.log(res['total_data'])
    num_features = len(res[class_value].keys()) - 1
    for j in range(1 , num_features + 1):
        xj = x[j - 1]
        num_with_ai_and_data_xj = res[class_value][j][xj] + 1
        class_count_curr = res[class_value]['cnt'] + len(res[class_value][j].keys())
        prob = np.log(num_with_ai_and_data_xj) - np.log(class_count_curr)
        output = output + prob
    return output

In [4]:
def predict_for_one_data(res , x):
    best_probability = -1000
    best_class = -1
    classes = res.keys()
    run = True
    for cls in classes:
        if cls == 'total_data':
            continue
        prob = probability(res , x , cls)
        if run or best_probability < prob :
            best_probability = prob
            best_class = cls
        run = False
    return best_class

In [5]:
#predict function
def predict(res , x):
    y_pred = []
    for xj in x:
        single_class = predict_for_one_data(res , xj)
        y_pred.append(single_class)
    return y_pred

In [6]:
iris = datasets.load_iris()

In [7]:
X = iris.data
Y = iris.target

In [8]:
#making iris data as labelled
def makelabelled(columns):
    second_div = columns.mean()
    first_div = 0.5*second_div
    third_div = 1.5*second_div
    for i in range(len(columns)) :
        if columns[i] < first_div :
            columns[i] = 0
        elif columns[i] < second_div :
            columns[i] = 1
        elif columns[i] < third_div :
            columns[i] = 2
        else:
            columns[i] = 3
    return columns

In [9]:
for i in range(0 , X.shape[-1]):
    X[: , i] = makelabelled(X[: , i])

In [10]:
from sklearn.model_selection import train_test_split
X_train , X_test , Y_train , Y_test = train_test_split(X ,Y ,random_state = 0)

In [11]:
res = fit(X_train , Y_train)

In [12]:
Y_pred = predict(res , X_test)

In [13]:
# accuracy of our algo
print(classification_report(Y_pred , Y_test))
print(confusion_matrix(Y_pred , Y_test))
print(accuracy_score(Y_pred , Y_test))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        13
          1       1.00      0.94      0.97        17
          2       0.89      1.00      0.94         8

avg / total       0.98      0.97      0.97        38

[[13  0  0]
 [ 0 16  1]
 [ 0  0  8]]
0.9736842105263158


In [14]:
#implementing gaussian naive bayes and comaring accuracy

from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train , Y_train)
prediction = clf.predict(X_test)
print(classification_report(prediction , Y_test))
print(confusion_matrix(prediction , Y_test))
print(accuracy_score(prediction , Y_test))

             precision    recall  f1-score   support

          0       0.85      1.00      0.92        11
          1       1.00      0.76      0.86        21
          2       0.67      1.00      0.80         6

avg / total       0.90      0.87      0.87        38

[[11  0  0]
 [ 2 16  3]
 [ 0  0  6]]
0.868421052631579
