In [None]:
from sklearn.datasets import load_iris 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score 
import numpy as np 
import pandas as pd 
import heapq 
from collections import Counter 

In [None]:
# Fixed seed so the shuffled split (and every accuracy below) is reproducible
# under Restart Kernel -> Run All.
SEED = 42

dataset = load_iris()
# Features are cast to strings so the categorical naive Bayes section can treat
# each distinct measurement as a discrete category. `np.str_` replaces the
# `np.str0` alias, which was removed in NumPy 2.0.
X, y = dataset['data'].astype(np.str_), dataset['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.1, random_state=SEED)

# **Bayesian Learning**

In [None]:
# X = [['Red', 'Sports', 'Domestic'],
#      ['Red', 'Sports', 'Domestic'],
#      ['Red', 'Sports', 'Domestic'],
#      ['Yellow', 'Sports', 'Domestic'],
#      ['Yellow', 'Sports', 'Imported'],
#      ['Yellow', 'SUV', 'Imported'],
#      ['Yellow', 'SUV', 'Imported'],
#      ['Yellow', 'SUV', 'Domestic'],
#      ['Red', 'SUV', 'Imported'],
#      ['Red', 'Sports', 'Imported']]
# y = ['Yes',
#      'No',
#      'Yes',
#      'No',
#      'Yes',
#      'No',
#      'Yes',
#      'No',
#      'No',
#      'Yes']
# sample = ['Red', 'SUV', 'Domestic']
# X, y = np.array(X), np.array(y)

In [None]:
# X = [['Rainy',	'Hot',	'High',	'False'],
#      ['Rainy',	'Hot', 'High',	'True'],
#      ['Overcast',	'Hot',	'High',	'False'],
#      ['Sunny',	'Mild',	'High',	'False'],
#      ['Sunny',	'Cool',	'Normal',	'False'],
#      ['Sunny',	'Cool',	'Normal',	'True'],
#      ['Overcast',	'Cool',	'Normal',	'True'],
#      ['Rainy',	'Mild',	'High',	'False'],
#      ['Rainy',	'Cool',	'Normal',	'False'],
#      ['Sunny',	'Mild',	'Normal',	'False'],
#      ['Rainy',	'Mild',	'Normal',	'True'],
#      ['Overcast',	'Mild',	'High',	'True'],
#      ['Overcast',	'Hot',	'Normal',	'False'],
#      ['Sunny',	'Mild',	'High',	'True']]
# y = ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']
# sample = ['Sunny', 'Hot', 'Normal', 'False']
# X, y = np.array(X), np.array(y)

In [None]:
def dataset_dist(X, y):
  """Estimate class priors and per-class attribute value frequencies.

  Args:
    X: 2-D array-like of categorical attribute values, one row per sample.
    y: 1-D array-like of class labels, one entry per row of X.

  Returns:
    A tuple (X_dist, y_dist):
      y_dist maps each label to the fraction of samples carrying it.
      X_dist maps each label to a list with one dict per column of X; each
      dict maps an attribute value to its relative frequency among the rows
      that carry that label.

  Raises:
    ValueError: if X and y do not have the same number of entries.
  """
  # Coerce to arrays: plain Python lists would break the element-wise
  # comparison and fancy indexing used below.
  X = np.asarray(X)
  y = np.asarray(y)
  if len(X) != len(y):
    raise ValueError(f'X and y must have the same length, got {len(X)} and {len(y)}')

  X_dist = dict()
  y_dist = dict()
  for y_unique in np.unique(y):
    X_slice = X[y == y_unique]
    n_label = len(X_slice)
    y_dist[y_unique] = n_label / len(y)
    # One {value: relative frequency} dict per attribute column.
    X_dist[y_unique] = [
      (pd.Series(X_slice[:, column_idx]).value_counts() / n_label).to_dict()
      for column_idx in range(X_slice.shape[1])
    ]
  return X_dist, y_dist

In [None]:
# Fit on the training split only: X_dist holds the per-class value frequencies
# for each attribute column, y_dist holds the class priors.
X_dist, y_dist = dataset_dist(X_train, y_train)

In [None]:
pred = list()

# Loop-invariant statistics, computed once rather than for every test sample.
# Number of training rows per class (n in the smoothing formula).
class_counts = {label: int(np.sum(y_train == label)) for label in X_dist}
# Number of distinct values each attribute takes in the training data (k).
n_values = [len(np.unique(X_train[:, attribute_idx])) for attribute_idx in range(X_train.shape[1])]

for sample in X_test:
  probs = dict()
  for label in X_dist:
    n = class_counts[label]
    prob = y_dist[label]  # start from the class prior P(label)
    for attribute_idx, attribute in enumerate(sample):
      # Relative frequency of this value within the class; 0 if it never
      # occurred together with this label in the training data.
      freq = X_dist[label][attribute_idx].get(attribute, 0.0)
      nc = freq * n  # recover the raw count from the stored frequency
      k = n_values[attribute_idx]
      # Laplace (add-one) smoothing applied to seen and unseen values alike,
      # so an unseen value can never score higher than a value that was
      # actually observed for this class.
      prob *= (nc + 1) / (n + k)
    probs[label] = prob

  # Label with the highest unnormalised posterior; ties go to the first label.
  pred.append(max(probs, key=probs.get))

In [None]:
# Score whatever predictions are currently held in `pred` against the held-out
# test labels. `pred` is overwritten by each prediction cell, so run this right
# after the categorical naive Bayes cell above.
accuracy_score(y_test, pred)

0.8

# **Gaussian Naive Bayes**

In [None]:
pred = list()

# The features were stored as strings for the categorical model; cast them back
# once, in float64. float16 keeps only ~3 significant digits, which distorts the
# variance estimates.
X_train_num = X_train.astype(np.float64)
X_test_num = X_test.astype(np.float64)

# Small variance floor (same idea as scikit-learn's `var_smoothing`) so that a
# feature that is constant within a class does not cause a division by zero.
var_floor = 1e-9 * X_train_num.var(axis=0).max()

# Per-class mean and variance of every feature, computed once up front instead
# of once per test sample and attribute.
class_stats = dict()
for label in y_dist:
  X_slice = X_train_num[y_train == label]
  class_stats[label] = (X_slice.mean(axis=0), X_slice.var(axis=0) + var_floor)

for sample in X_test_num:
  log_probs = dict()
  for label, (attribute_mean, attribute_var) in class_stats.items():
    # Sum of log Gaussian densities over all features. Working in log space
    # avoids the underflow a product of many small densities can run into.
    log_likelihood = -0.5 * np.sum(
      np.log(2 * np.pi * attribute_var) + ((sample - attribute_mean) ** 2) / attribute_var
    )
    log_probs[label] = np.log(y_dist[label]) + log_likelihood

  # Label with the highest log-posterior; ties go to the first label.
  pred.append(max(log_probs, key=log_probs.get))

In [None]:
# Score the Gaussian naive Bayes predictions (the current contents of `pred`)
# against the held-out test labels.
accuracy_score(y_test, pred)

1.0