In [1]:
import numpy as np

In [2]:
def fit(X_train,Y_train):
    """Build the count tables of a categorical naive Bayes classifier.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Discrete (label-encoded) feature values. Lists are accepted and
        converted with ``np.asarray``.
    Y_train : array-like of shape (n_samples,)
        Class of each training row.

    Returns
    -------
    dict
        ``result["total_data"]`` is the number of training rows. For every
        class ``c`` found in ``Y_train``, ``result[c]["total_data"]`` is the
        number of rows of that class and ``result[c][j][v]`` (``j`` is the
        1-based feature index) is the number of rows of class ``c`` whose
        j-th feature equals ``v``. Every value that occurs anywhere in
        training column ``j`` gets an entry, with count 0 when it never
        occurs together with class ``c``.
    """
    # Coerce to arrays so boolean-mask indexing and column slicing also work
    # when the caller passes plain Python lists.
    X_train=np.asarray(X_train)
    Y_train=np.asarray(Y_train)

    result={"total_data":len(X_train)}
    all_classes=set(Y_train)
    if not all_classes:
        return result

    # The distinct values of each column do not depend on the class, so they
    # are collected once instead of once per class.
    num_features=X_train.shape[1]
    labels_per_feature=[set(X_train[:,j]) for j in range(num_features)]

    for current_class in all_classes:
        current_class_rows=X_train[Y_train==current_class]
        class_counts={"total_data":len(current_class_rows)}
        # Feature keys are 1-based; column i-1 of the data is feature i.
        for i,all_labels in enumerate(labels_per_feature,start=1):
            class_counts[i]={
                label:(current_class_rows[:,i-1]==label).sum()
                for label in all_labels
            }
        result[current_class]=class_counts
    return result

In [12]:
def probability(dictionary, x, current_class):
    """Return the unnormalised log-probability of ``current_class`` for ``x``.

    The score is ``log P(class) + sum_j log P(x_j | class)``, where each
    conditional uses add-one (Laplace) smoothing:
    ``(count(class, x_j) + 1) / (count(class) + number of known values of feature j)``.

    Parameters
    ----------
    dictionary : dict
        Count tables in the layout produced by ``fit``.
    x : sequence
        Feature values of one sample; ``x[j - 1]`` is feature ``j``.
    current_class : hashable
        Class key present in ``dictionary``.

    Returns
    -------
    float
        Log-score; larger means more likely.
    """
    class_stats = dictionary[current_class]
    class_total = class_stats["total_data"]
    output = np.log(class_total) - np.log(dictionary["total_data"])
    # Every key of the class table except "total_data" is a feature index.
    num_features = len(class_stats) - 1
    for j in range(1, num_features + 1):
        value_counts = class_stats[j]
        # A value never seen during training has no entry; treat its raw
        # count as 0 so smoothing yields a small non-zero probability
        # instead of raising KeyError.
        count_current_class_with_value_xj = value_counts.get(x[j - 1], 0) + 1
        count_current_class = class_total + len(value_counts)
        current_xj_probability = np.log(count_current_class_with_value_xj) - np.log(count_current_class)
        output = output + current_xj_probability
    return output

In [4]:
def predict_single_point(dictionary,x):
    """Pick the class with the highest log-score for one sample.

    Every key of ``dictionary`` other than ``"total_data"`` is treated as a
    class and scored with ``probability``. On ties the class encountered
    first wins. Returns ``-1`` when ``dictionary`` holds no class.
    """
    candidate_classes=(
        label for label in dictionary
        if not label=="total_data"
    )
    # max() keeps the first item and replaces it only on a strictly greater
    # score, which is the same rule as an explicit "first run or >" loop.
    return max(
        candidate_classes,
        key=lambda label: probability(dictionary,x,label),
        default=-1,
    )

In [5]:
def predict(dictionary,X_test):
    """Return a list with the predicted class of every row in ``X_test``.

    Each row is classified independently by ``predict_single_point`` using
    the count tables in ``dictionary``; the output order follows the rows.
    """
    return [predict_single_point(dictionary,sample) for sample in X_test]

# Data conversion into labels

In [6]:
def tolabel(column):
    """Replace the values of ``column`` in place by one of four bin labels.

    The cut points are derived from the column mean ``m``:

    * value <  m/2      -> 0
    * value <  m        -> 1
    * value <  3*m/2    -> 2
    * otherwise         -> 3

    The conditions are tested in that order. The input array is overwritten
    and also returned.
    """
    mean_value=column.mean()
    low_cut=mean_value/2
    high_cut=mean_value*3/2
    # Nested where() keeps the if/elif precedence of the three comparisons.
    labels=np.where(
        column<low_cut,
        0,
        np.where(column<mean_value,1,np.where(column<high_cut,2,3)),
    )
    # Write through the caller's array (which may be a view of a matrix column).
    column[:]=labels
    return column

In [7]:
# Load the iris dataset that ships with scikit-learn.
from sklearn import datasets
iris=datasets.load_iris()
# X is a reference to iris.data (not a copy), so in-place edits of X also
# change iris.data. Y holds the class targets.
X=iris.data
Y=iris.target

In [8]:
# Discretise every feature column into the bin labels produced by tolabel.
# tolabel overwrites its argument, so it is given a copy of each pristine
# iris.data column and X is rebuilt from the results. This keeps iris.data
# intact and makes the cell idempotent: re-running it gives the same X
# instead of re-binning values that were already replaced by labels.
X=np.column_stack([tolabel(iris.data[:,i].copy()) for i in range(iris.data.shape[1])])

In [9]:
# Hold out 25% of the rows as a test set; random_state=0 is passed so the
# split is the same on every run.
from sklearn import model_selection
X_train,X_test,Y_train,Y_test = model_selection.train_test_split(X,Y,test_size=0.25,random_state=0)

In [10]:
# Build the per-class count tables from the training split.
dictionary = fit(X_train,Y_train)

In [13]:
# Classify every row of the test split with the hand-written classifier.
Y_pred = predict(dictionary,X_test)

In [14]:
# Evaluate the hand-written classifier: per-class precision/recall/F1 and
# the confusion matrix of true (Y_test) versus predicted (Y_pred) classes.
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(Y_test,Y_pred))
print(confusion_matrix(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.94      1.00      0.97        16
           2       1.00      0.89      0.94         9

   micro avg       0.97      0.97      0.97        38
   macro avg       0.98      0.96      0.97        38
weighted avg       0.98      0.97      0.97        38

[[13  0  0]
 [ 0 16  0]
 [ 0  1  8]]


# Comparison with scikit-learn's built-in Gaussian NB

In [15]:
# Baseline: scikit-learn's GaussianNB trained and evaluated on the same split.
# NOTE(review): X_train/X_test come from X after it was passed through
# tolabel, so GaussianNB is fitted on the discretised bin labels here, not on
# the raw measurements.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, Y_train)
# NOTE(review): this overwrites Y_pred from the hand-written classifier.
Y_pred = clf.predict(X_test)
print(classification_report(Y_test,Y_pred))
print(confusion_matrix(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       1.00      0.85      0.92        13
           1       0.76      1.00      0.86        16
           2       1.00      0.67      0.80         9

   micro avg       0.87      0.87      0.87        38
   macro avg       0.92      0.84      0.86        38
weighted avg       0.90      0.87      0.87        38

[[11  2  0]
 [ 0 16  0]
 [ 0  3  6]]


In [32]:
# Standard deviation of the training-split column with index 2.
import pandas as pd
pd.DataFrame(X_train).std()[2]

1.1051170905310368