In [1]:
import numpy as np

In [2]:
def fit(X_train, y_train):
    result = {}
    class_values = set(y_train)
    for curr_class in class_values:
        result[curr_class] = {}
        result['total_data'] = len(y_train)
        current_class_rows = (y_train == curr_class)
        X_train_curr = X_train[current_class_rows]
        y_train_curr = y_train[current_class_rows]
        result[curr_class]['total_count'] = len(y_train_curr)
        num_features = X_train.shape[1]
        for j in range(1,num_features+1):
            j_1 = j-1
            result[curr_class][j] = {}
            all_possible_values = set(X_train[:,j_1])
            for curr_value in all_possible_values:
                result[curr_class][j][curr_value] = (X_train_curr[:,j_1] == curr_value).sum()
    return result

In [3]:
def probability(dictionary,x,curr_class):
    output = np.log(dictionary[curr_class]['total_count'])-np.log(dictionary['total_data'])
    num_features = len(dictionary[curr_class].keys())-1
    for j in range(1,num_features+1):
        xj = x[j-1]
        count_curr_class_with_value_xi = dictionary[curr_class][i][xj] +1
        count_curr_class = dictionary[curr_class]['total_count']+len(dictionary[curr_class][j].keys())
        current_xj_prob = np.log(count_curr_class_with_value_xi)-np.log(count_curr_class)
        output = output + current_xj_prob
        
    return output

In [4]:
def predictSinglePoint(dictionary,x):
    classes = dictionary.keys()
    best_p = -1000
    best_class = -1
    first_run = True
    for curr_class in classes:
        if(curr_class == 'total_data'):
            continue
        p_cur_class = probability(dictionary,x,curr_class)
        if(first_run or p_cur_class>best_p):
            
            best_p = p_cur_class
            best_class = curr_class
        first_run = False
    return best_class

In [5]:
def predict(result, x_test):
    y_pred = []
    for x in x_test:
        x_class = predictSinglePoint(dictionary,x)
        y_pred.append(x_class)
    return y_pred

In [6]:
def makeLabelled(column):
    second_limit = column.mean()
    first_limit = 0.5 * second_limit
    third_limit = 1.5 * second_limit
    for i in range(0, len(column)):
        if(column[i]<first_limit):
            column[i] = 0
        elif(column[i]<second_limit):
            column[i] = 1
        elif(column[i] < third_limit):
            column[i] = 2
        else:
            column[i] = 3
    return column

In [7]:
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [8]:
for i in range(0,X.shape[-1]):
    X[:,i] = makeLabelled(X[:,i])
    

In [9]:
from sklearn import model_selection
X_train,X_test,Y_train,Y_test = model_selection.train_test_split(X,y, test_size= 0.25, random_state= 0)

In [10]:
dictionary = fit(X_train,Y_train)

In [11]:
y_pred = predict(dictionary,X_test)

In [12]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(Y_test, y_pred))
print(confusion_matrix(Y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      0.85      0.92        13
           1       0.76      1.00      0.86        16
           2       1.00      0.67      0.80         9

    accuracy                           0.87        38
   macro avg       0.92      0.84      0.86        38
weighted avg       0.90      0.87      0.87        38

[[11  2  0]
 [ 0 16  0]
 [ 0  3  6]]


In [14]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train,Y_train)
clf.predict(X_test)
print(classification_report(Y_test, y_pred))
print(confusion_matrix(Y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      0.85      0.92        13
           1       0.76      1.00      0.86        16
           2       1.00      0.67      0.80         9

    accuracy                           0.87        38
   macro avg       0.92      0.84      0.86        38
weighted avg       0.90      0.87      0.87        38

[[11  2  0]
 [ 0 16  0]
 [ 0  3  6]]
