In [1]:
import numpy as np
import pandas as pd

# Naive Bayes Classifier 
It is a conditional probability model, with formula: <br>
$ P(C| x_1, x_2, x_3, ...) = \frac{P(C)P(X|C)}{P(X)}$ <br>
It is naive because we have naive assumption such that every pair of features are independent from each other given C.<br>
So we can rewrite the formula as: <br>
$ P(C| x_1, x_2, x_3, ...) = P(C)P(x_1|C)P(x_2|C)... = P(C)\prod^{n}_{i=1} P(x_i|C)$

In [1]:
"""
Naive Bayes classifer
By: Minchan Kim
"""

import pandas as pd
import numpy as np

class Naive_Bayes():
    """
    Naive Bayes classifer
    
    Attributes:
        prior: P(Y)
        likelihood: P(X_j | Y)
    """
    
    def __init__(self):
        """
            Some initializations, if neccesary
        """
        self.model_name = 'Naive Bayes'
    
    
    def fit(self, X_train, y_train):
        """ 
            The fit function fits the Naive Bayes model based on the training data. 
            Here, we assume that all the features are **discrete** features. 
            
            X_train is a matrix or 2-D numpy array, representing training instances. 
            Each training instance is a feature vector. 

            y_train contains the corresponding labels. There might be multiple (i.e., > 2) classes.
        """
        # Compute the prior distribution of all y labels
        self.prior = dict()
        for y in y_train:
            prior_key = f'Y = {y}'
            if prior_key in self.prior.keys():
                self.prior[prior_key] += 1
            else:
                self.prior[prior_key] = 1

        # Normalize the prior distribution
        for key in self.prior.keys():
            self.prior[key] /= len(y_train)

        # Compute the likelihood distribution of all X_j given Y
        self.likelihood = dict()
        for x, y in zip(np.array(X_train), y_train):
            for j in range(len(x)):
                likelihood_key = f'X{j} = {x[j]} | Y = {y}'
                if likelihood_key in self.likelihood.keys():
                    self.likelihood[likelihood_key] += 1
                else:
                    self.likelihood[likelihood_key] = 1

        # Normalize the likelihood distribution
        for key in self.likelihood.keys():
            self.likelihood[key] /= len(y_train)
        
        
    def ind_predict(self, x : list):
        """ 
            Predict the most likely class label of one test instance based on its feature vector x.
        """
        best_label, best_prob = None, float('-inf')
    
        # Iterate through each class to compute its posterior probability
        for y, prior_count in self.prior.items():
            cur_prob = prior_count
            
            # Product sum likelihoods for each feature given the class
            for j, feature_value in enumerate(x):
                likelihood_key = f'X{j} = {feature_value} | {y}'
                if likelihood_key in self.likelihood:
                    cur_prob *= self.likelihood[likelihood_key]
                else:
                    # Handle unknown feature values - Assume very small probability
                    cur_prob *= 10^-9
                    
            if cur_prob > best_prob:
                best_prob = cur_prob
                best_label = y
        
        return best_label[-1]
        
    
    def predict(self, X):
        """
            X is a matrix or 2-D numpy array, represnting testing instances. 
            Each testing instance is a feature vector. 
            
            Return the predictions of all instances in a list.
        """
        return [self.ind_predict(x) for x in np.array(X)]

In [2]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data'
col = ['class_name','left_weight','left_distance','right_weight','right_distance']
data = pd.read_csv(url, delimiter = ',', names = col)

In [3]:
data

Unnamed: 0,class_name,left_weight,left_distance,right_weight,right_distance
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5
...,...,...,...,...,...
620,L,5,5,5,1
621,L,5,5,5,2
622,L,5,5,5,3
623,L,5,5,5,4


In [4]:
data.class_name.value_counts()

R    288
L    288
B     49
Name: class_name, dtype: int64

In [5]:
X = np.matrix(data.iloc[:,1:])
y = data.class_name
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,random_state = 88)

In [6]:
X_train

matrix([[1, 5, 2, 5],
        [1, 3, 5, 1],
        [3, 5, 1, 3],
        ...,
        [3, 5, 3, 3],
        [4, 3, 2, 4],
        [3, 2, 3, 4]])

In [7]:
X_train

matrix([[1, 5, 2, 5],
        [1, 3, 5, 1],
        [3, 5, 1, 3],
        ...,
        [3, 5, 3, 3],
        [4, 3, 2, 4],
        [3, 2, 3, 4]])

In [8]:
clf = Naive_Bayes()
clf.fit(X_train, y_train)
y_test = np.array(y_test)
y_hat = clf.predict(X_test)

Overall Accuracy

In [9]:
sum(y_hat == y_test)/ 207  # you should get something like 0.88

0.8260869565217391

In [13]:
from sklearn.naive_bayes import GaussianNB
y_pred = GaussianNB().fit(np.array(X_train), y_train).predict(np.array(X_test))

In [14]:
sum(y_pred == y_test)/ 207

0.893719806763285

In [184]:
y_hat

['Y = R',
 'Y = L',
 'Y = L',
 'Y = R',
 'Y = L',
 'Y = R',
 'Y = R',
 'Y = R',
 'Y = R',
 'Y = R',
 'Y = R',
 'Y = R',
 'Y = R',
 'Y = L',
 'Y = R',
 'Y = L',
 'Y = R',
 'Y = L',
 'Y = L',
 'Y = R',
 'Y = L',
 'Y = L',
 'Y = L',
 'Y = L',
 'Y = L',
 'Y = R',
 'Y = L',
 'Y = R',
 'Y = R',
 'Y = R',
 'Y = L',
 'Y = R',
 'Y = R',
 'Y = R',
 'Y = L',
 'Y = L',
 'Y = R',
 'Y = L',
 'Y = L',
 'Y = L',
 'Y = L',
 'Y = L',
 'Y = L',
 'Y = R',
 'Y = L',
 'Y = R',
 'Y = L',
 'Y = L',
 'Y = L',
 'Y = R',
 'Y = L',
 'Y = L',
 'Y = L',
 'Y = L',
 'Y = L',
 'Y = L',
 'Y = R',
 'Y = R',
 'Y = R',
 'Y = R',
 'Y = R',
 'Y = R',
 'Y = L',
 'Y = L',
 'Y = L',
 'Y = R',
 'Y = L',
 'Y = R',
 'Y = L',
 'Y = L',
 'Y = R',
 'Y = L',
 'Y = R',
 'Y = R',
 'Y = L',
 'Y = L',
 'Y = R',
 'Y = L',
 'Y = L',
 'Y = R',
 'Y = L',
 'Y = L',
 'Y = L',
 'Y = L',
 'Y = L',
 'Y = L',
 'Y = L',
 'Y = L',
 'Y = L',
 'Y = R',
 'Y = R',
 'Y = L',
 'Y = L',
 'Y = L',
 'Y = L',
 'Y = R',
 'Y = L',
 'Y = R',
 'Y = L',
 'Y = L',


In [123]:
prior = dict()
for y in y_train:
    prior_key = f'Y = {y}'
    if prior_key in prior:
        prior[prior_key] += 1
    else:
        prior[prior_key] = 1

for key in prior.keys():
    prior[key] /= len(y_train)

In [132]:
likelihood = dict()
for x, y in zip(np.array(X_train), y_train):
    for j in range(len(x)):
        likelihood_key = f'X{j} = {x[j]} | Y = {y}'
        if likelihood_key in likelihood:
            likelihood[likelihood_key] += 1
        else:
            likelihood[likelihood_key] = 1

# Normalize the likelihood distribution
for key in likelihood.keys():
    likelihood[key] /= len(y_train)

In [166]:
c = [x for x in np.array(X_test)][3]

In [169]:
for j, feature_value in enumerate(c):
    likelihood_key = f'X{j} = {feature_value} | {y}'
    print(likelihood_key)

X0 = 4 | Y = B
X1 = 1 | Y = B
X2 = 3 | Y = B
X3 = 2 | Y = B


In [170]:
best_label, best_log_prob = None, float('-inf')
for y, prior_count in prior.items():
    log_prob = prior_count
    print(log_prob)
    for j, feature_value in enumerate(c):
        likelihood_key = f'X{j} = {feature_value} | {y}'
        if likelihood_key in likelihood:
            log_prob *= likelihood[likelihood_key]
        else:  
            log_prob *= 10^-9
    print(log_prob)
    if log_prob > best_log_prob:
        best_log_prob = log_prob
        best_label = y
print(best_label)

0.44019138755980863
2.2856373647688273e-05
0.48325358851674644
1.2615400295386861e-05
0.07655502392344497
5.898015191780026e-09
Y = R
