In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import datasets
from sklearn import model_selection

In [2]:
def fit(X_train, Y_train):
    """Count feature-value occurrences per class for a categorical Naive Bayes model.

    Parameters
    ----------
    X_train : array-like, indexed as (sample, feature)
        Categorical feature values.
    Y_train : array-like with one label per row of ``X_train``
        Class labels.

    Returns
    -------
    dict
        ``result['total_data_length']`` is the number of training rows.
        For every class label ``c``:
        ``result[c]['total_class_data_length']`` is the number of rows of that
        class, and ``result[c][i][value]`` is how many rows of class ``c`` have
        ``value`` in feature column ``i``. Every value observed anywhere in
        column ``i`` gets an entry (possibly 0) for every class.
    """
    X_train = np.asarray(X_train)
    Y_train = np.asarray(Y_train)

    # 'total_data_length' is inserted first; class entries follow.
    result = {'total_data_length': len(Y_train)}

    n_features = X_train.shape[1]
    # The distinct values of a column do not depend on the class, so compute
    # them once instead of once per class.
    feature_values = [set(X_train[:, i]) for i in range(n_features)]

    # np.unique returns the labels sorted, so the class order (and therefore
    # tie-breaking at prediction time) does not depend on set/hash ordering.
    for current_class in np.unique(Y_train):
        current_class_data = X_train[Y_train == current_class]
        class_counts = {'total_class_data_length': len(current_class_data)}
        for i in range(n_features):
            column = current_class_data[:, i]
            class_counts[i] = {
                value: (column == value).sum() for value in feature_values[i]
            }
        result[current_class] = class_counts

    return result

In [3]:
def probability(dictionary, x, current_class):
    """Return the Laplace-smoothed log-score of ``x`` under ``current_class``.

    The score is ``log P(class) + sum_i log P(x[i] | class)``, where each
    conditional is estimated as
    ``(count + 1) / (class_size + number_of_known_values_of_feature_i)``.

    Parameters
    ----------
    dictionary : dict
        Count structure with the layout produced by ``fit``.
    x : 1-D array
        One sample; ``x[i]`` is the value of feature ``i``.
    current_class : hashable
        Class label; must be a key of ``dictionary``.

    Returns
    -------
    float
        Log-score (not normalised across classes).
    """
    class_counts = dictionary[current_class]
    class_size = class_counts['total_class_data_length']

    # Log prior: log(class_size / total)
    result = np.log(class_size) - np.log(dictionary['total_data_length'])

    for i in range(x.shape[0]):
        value_counts = class_counts[i]
        # A value never seen during training has a count of 0 instead of
        # raising KeyError; the +1 smoothing keeps the logarithm finite.
        numerator = value_counts.get(x[i], 0) + 1
        denominator = class_size + len(value_counts)
        result += (np.log(numerator) - np.log(denominator))

    return result

In [4]:
def predict_single_point(dictionary, x):
    """Return the class with the highest log-score for a single sample.

    Parameters
    ----------
    dictionary : dict
        Count structure with the layout produced by ``fit``.
    x : 1-D array
        One sample of feature values.

    Returns
    -------
    The best-scoring class label. Ties keep the class encountered first.
    Returns ``-1`` if ``dictionary`` contains no class entries.
    """
    # Start from -inf rather than a fixed floor such as -1000: a sum of many
    # log-probabilities can fall below any fixed constant, which would leave
    # the placeholder class in place.
    best_probability = float('-inf')
    best_class = -1

    for current_class, class_counts in dictionary.items():
        # Class entries are dicts; the 'total_data_length' entry is a plain
        # count. Filtering by type avoids relying on key insertion order.
        if not isinstance(class_counts, dict):
            continue
        current_class_probability = probability(dictionary, x, current_class)
        if current_class_probability > best_probability:
            best_probability = current_class_probability
            best_class = current_class

    return best_class

In [5]:
def predict(dictionary, X_test):
    """Predict a class label for every sample in ``X_test``.

    Each row is classified independently with ``predict_single_point``;
    the labels are returned as a list in the same order as the rows.
    """
    return [predict_single_point(dictionary, sample) for sample in X_test]

# 1. Testing on Play Tennis Dataset 

In [6]:
# Load the Play Tennis table (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='datasets/play_tennis.csv')

In [7]:
# Remove the 'day' column. Rebinding instead of using inplace=True, together
# with errors='ignore', lets this cell be re-run without raising KeyError
# once the column is already gone.
df = df.drop(columns='day', errors='ignore')

In [8]:
# Every column except the last is a feature; the last column is the label.
X_train, Y_train = (
    np.array(part) for part in (df.iloc[:, :-1], df.iloc[:, -1])
)

In [9]:
# Build the count tables, then predict on the same rows that were used for
# fitting (this section has no separate held-out set).
dictionary = fit(X_train=X_train, Y_train=Y_train)
Y_preds = predict(dictionary=dictionary, X_test=X_train)

In [10]:
# Per-class precision/recall followed by the confusion matrix.
print(
    classification_report(Y_train, Y_preds),
    confusion_matrix(Y_train, Y_preds),
    sep='\n',
)

              precision    recall  f1-score   support

          No       1.00      0.80      0.89         5
         Yes       0.90      1.00      0.95         9

    accuracy                           0.93        14
   macro avg       0.95      0.90      0.92        14
weighted avg       0.94      0.93      0.93        14

[[4 1]
 [0 9]]


# 2. Testing on Iris Dataset

In [11]:
def convert_to_labelled(column):
    """Discretise a numeric column into four bins relative to its mean.

    With ``m = column.mean()`` each value ``v`` is replaced by:

    * ``0`` if ``v < 0.5 * m``
    * ``1`` else if ``v < m``
    * ``2`` else if ``v < 1.5 * m``
    * ``3`` otherwise

    The conditions are checked in that order and the first match wins.

    Parameters
    ----------
    column : 1-D numeric array
        Values to discretise. The array is overwritten in place.

    Returns
    -------
    The same ``column`` object, now holding the bin labels.
    """
    mean = column.mean()
    low = 0.5 * mean
    high = 1.5 * mean

    # Evaluate all thresholds on the original values in one vectorised pass
    # (instead of a per-element Python loop) before anything is overwritten.
    # np.select picks the first matching condition, like an if/elif chain.
    values = np.asarray(column)
    labels = np.select(
        [values < low, values < mean, values < high],
        [0, 1, 2],
        default=3,
    )

    # Write back into the caller's array so the in-place contract is kept.
    column[:] = labels
    return column

In [12]:
# Load Iris; X aliases the feature matrix and Y the class targets.
iris = datasets.load_iris()
X, Y = iris.data, iris.target

In [13]:
# Start from a fresh copy of the raw features so that re-running this cell
# does not re-bin values that were already binned, and so iris.data itself
# is left untouched.
X = iris.data.copy()
for i in range(X.shape[1]):
    X[:, i] = convert_to_labelled(X[:, i])

In [14]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=1,
)

In [15]:
# Fit on the training split, then classify the held-out split.
dictionary = fit(
    X_train,
    Y_train,
)
Y_preds = predict(
    dictionary,
    X_test,
)

In [16]:
# Evaluate the held-out predictions: per-class metrics, then the confusion matrix.
print(classification_report(y_true=Y_test, y_pred=Y_preds))
print(confusion_matrix(y_true=Y_test, y_pred=Y_preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00         6

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

[[11  0  0]
 [ 0 13  0]
 [ 0  0  6]]
