In [4]:
import numpy as np
import pandas as pd

# Naive Bayes Classifier 
It is a conditional probability model, with formula: <br>
$ P(C| x_1, x_2, x_3, ...) = \frac{P(C)P(X|C)}{P(X)}$ <br>
It is naive because we have naive assumption such that every pair of features are independent from each other given C.<br>
So we can rewrite the formula as: <br>
$ P(C| x_1, x_2, x_3, ...) = P(C)P(x_1|C)P(x_2|C)... = P(C)\prod^{n}_{i=1} P(x_i|C)$

In [119]:
class Naive_Bayes():
    """
    
    Naive Bayes classifer
    
    Attributes:
        prior: P(Y)
        likelihood: P(X_j | Y)
    """
    
    def __init__(self):
        """
            Some initializations, if neccesary
        """
        
        self.model_name = 'Naive Bayes'
    
    
    def fit(self, X_train: np.array, y_train: np.array):
        
        """ 
            The fit function fits the Naive Bayes model based on the training data. 
            Here, we assume that all the features are **discrete** features. 
            
            X_train is a matrix or 2-D numpy array, represnting training instances. 
            Each training instance is a feature vector. 

            y_train contains the corresponding labels. There might be multiple (i.e., > 2) classes.
        """
        
        """
            TODO: 1. Modify and add some codes to the following for-loop
                     to compute the correct prior distribution of all y labels.
                  2. Make sure they are normalized to a distribution.
        """
        
        # might as well store for making prediction easier. this tag index system is kind of annoying and i would prefer to not have to parse that information out. 
        self.y_labels = np.unique(y_train)
        # return self.y_labels
        
        self.y_counts = dict()
        for value in y_train:
            tag = f"Y = {value}"
            self.y_counts[tag] = self.y_counts.get(tag, 0) + 1
        self.prior = {k: v/len(y_train) for k, v in self.y_counts.items()}
        # return self.prior
            
        """
            TODO: 3. Modify and add some codes to the following for-loops
                     to compute the correct likelihood P(X_j | Y).
                  4. Make sure they are normalized to distributions.
        """
        
        self.likelihood = dict()
        for x, y in zip(X_train, y_train):
            x = np.array(x).reshape(-1) # taking the index of a matrix seems to force you to get back a matrix no matter what. 
            for j in range(len(x)):
                tag = f"X{j} = {x[j]} | Y = {y}"
                # self.likelihood[tag] = self.likelihood.get(tag, 0) + (1)
                self.likelihood[tag] = self.likelihood.get(tag, 0) + (1/self.y_counts[f"Y = {y}"])
                
        return self.likelihood

        """
            TODO: 5. Think about whether we really need P(X_1 = x_1, X_2 = x_2, ..., X_d = x_d)
                     in practice?
                  6. Does this really matter for the final classification results?
        """
        
        # no you don't. you can calculate that information on the fly when you need it. i suppose that if you have a ton of queries, it might be slightly more efficient to calculate those values beforehand. 

        
    def ind_predict(self, x : list):
        
        """ 
            Predict the most likely class label of one test instance based on its feature vector x.
        """
        
        """
            TODO: 7. Enumerate all possible class labels and compute the likelihood 
                     based on the given feature vector x. Don't forget to incorporate 
                     both the prior and likelihood.
                  8. Pick the label with the higest probability. 
                  9. How to deal with very small probability values, especially
                     when the feature vector is of a high dimension. (Hint: log)
                  10. How to how to deal with unknown feature values?
        """
        
        # ok so we're going to be calculating log probs. log is a monotonically increasing function so we can just compare the log probs.
        # if unkseen feature value across all just return 0 probability and no class. all logprobs will be -inf. leads to a tie, never setting ret. ret is returned as none. 
        
        # ret, max_prob = None, 0
        # for y in self.y_labels:
        #     prob = 1
        #     for index, value in enumerate(x):
        #         tag = f"X{index} = {value} | Y = {y}"
        #         prob *= self.likelihood.get(tag, 0) # no smoothing
                
        #     if prob > max_prob:
        #         max_prob = prob
        #         ret = y
                    
        # # print(ret)
        # return ret
        
        ret, max_logprob = None, -np.inf
        for y in self.y_labels:
            logprob = np.log(self.prior[f"Y = {y}"])
            for index, value in enumerate(x):
                tag = f"X{index} = {value} | Y = {y}"
                logprob += np.log(self.likelihood.get(tag, 0)) # no smoothing
            
            if logprob > max_logprob:
                max_logprob = logprob
                ret = y
        return ret
    
    
    def predict(self, X):
        
        print(X.shape)
        """
            X is a matrix or 2-D numpy array, represnting testing instances. 
            Each testing instance is a feature vector. 
            
            Return the predictions of all instances in a list.
        """
        
        """
            TODO: 11. Revise the following for-loop to call ind_predict to get predictions.
        """
        
        ret = []
        for x in X:
            ret.append(self.ind_predict(np.array(x).reshape(-1)))
        
        return ret

In [7]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data'
col = ['class_name','left_weight','left_distance','right_weight','right_distance']
data = pd.read_csv(url, delimiter = ',', names = col)

In [8]:
data

Unnamed: 0,class_name,left_weight,left_distance,right_weight,right_distance
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5
...,...,...,...,...,...
620,L,5,5,5,1
621,L,5,5,5,2
622,L,5,5,5,3
623,L,5,5,5,4


In [9]:
data.class_name.value_counts()

R    288
L    288
B     49
Name: class_name, dtype: int64

In [10]:
X = np.matrix(data.iloc[:,1:])
y = data.class_name
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,random_state = 88)

In [11]:
y_train.value_counts()

L    202
R    184
B     32
Name: class_name, dtype: int64

In [72]:
Naive_Bayes().fit(X_train, y_train)

{'X0 = 1 | Y = R': 0.3695652173913047,
 'X1 = 5 | Y = R': 0.0869565217391304,
 'X2 = 2 | Y = R': 0.15760869565217384,
 'X3 = 5 | Y = R': 0.3097826086956523,
 'X1 = 3 | Y = R': 0.17934782608695643,
 'X2 = 5 | Y = R': 0.331521739130435,
 'X3 = 1 | Y = R': 0.05978260869565216,
 'X0 = 3 | Y = L': 0.20792079207920802,
 'X1 = 5 | Y = L': 0.32178217821782196,
 'X2 = 1 | Y = L': 0.33168316831683187,
 'X3 = 3 | Y = L': 0.18316831683168325,
 'X0 = 5 | Y = L': 0.32178217821782196,
 'X1 = 4 | Y = L': 0.2970297029702972,
 'X2 = 5 | Y = L': 0.09900990099009903,
 'X3 = 1 | Y = L': 0.34653465346534673,
 'X0 = 3 | Y = R': 0.12499999999999994,
 'X1 = 2 | Y = R': 0.2445652173913042,
 'X2 = 4 | Y = R': 0.20652173913043467,
 'X3 = 4 | Y = R': 0.26630434782608686,
 'X0 = 5 | Y = R': 0.12499999999999994,
 'X1 = 1 | Y = R': 0.347826086956522,
 'X2 = 3 | Y = R': 0.23369565217391292,
 'X0 = 4 | Y = L': 0.2722772277227724,
 'X2 = 4 | Y = L': 0.1336633663366337,
 'X3 = 4 | Y = L': 0.1435643564356436,
 'X3 = 2 | Y

In [104]:
len(clf.likelihood.values())

60

In [120]:
clf = Naive_Bayes()
clf.fit(X_train, y_train)

{'X0 = 1 | Y = R': 0.3695652173913047,
 'X1 = 5 | Y = R': 0.0869565217391304,
 'X2 = 2 | Y = R': 0.15760869565217384,
 'X3 = 5 | Y = R': 0.3097826086956523,
 'X1 = 3 | Y = R': 0.17934782608695643,
 'X2 = 5 | Y = R': 0.331521739130435,
 'X3 = 1 | Y = R': 0.05978260869565216,
 'X0 = 3 | Y = L': 0.20792079207920802,
 'X1 = 5 | Y = L': 0.32178217821782196,
 'X2 = 1 | Y = L': 0.33168316831683187,
 'X3 = 3 | Y = L': 0.18316831683168325,
 'X0 = 5 | Y = L': 0.32178217821782196,
 'X1 = 4 | Y = L': 0.2970297029702972,
 'X2 = 5 | Y = L': 0.09900990099009903,
 'X3 = 1 | Y = L': 0.34653465346534673,
 'X0 = 3 | Y = R': 0.12499999999999994,
 'X1 = 2 | Y = R': 0.2445652173913042,
 'X2 = 4 | Y = R': 0.20652173913043467,
 'X3 = 4 | Y = R': 0.26630434782608686,
 'X0 = 5 | Y = R': 0.12499999999999994,
 'X1 = 1 | Y = R': 0.347826086956522,
 'X2 = 3 | Y = R': 0.23369565217391292,
 'X0 = 4 | Y = L': 0.2722772277227724,
 'X2 = 4 | Y = L': 0.1336633663366337,
 'X3 = 4 | Y = L': 0.1435643564356436,
 'X3 = 2 | Y

In [121]:
y_test = np.array(y_test)
y_hat = clf.predict(X_test)

(207, 4)


Overall Accuracy

In [122]:
sum(y_hat == y_test)/ 207  # you should get something like 0.88

0.8840579710144928

In [110]:
X_test[2]

matrix([[1, 2, 2, 1]])

In [118]:
# matrix([[1, 2, 2, 1]])
for y_label in clf.y_labels:
    print(y_label)
    print(clf.prior.get(f"Y = {y_label}"))
    print(clf.likelihood.get(f"X0 = 1 | Y = {y_label}"))
    print(clf.likelihood.get(f"X1 = 2 | Y = {y_label}"))
    print(clf.likelihood.get(f"X2 = 2 | Y = {y_label}"))
    print(clf.likelihood.get(f"X3 = 1 | Y = {y_label}"))


B
0.07655502392344497
0.15625
0.25
0.28125
0.1875
L
0.48325358851674644
0.059405940594059396
0.15841584158415847
0.2623762376237625
0.34653465346534673
R
0.44019138755980863
0.3695652173913047
0.2445652173913042
0.15760869565217384
0.05978260869565216


In [100]:
# naives bayes sklearn
from sklearn.naive_bayes import CategoricalNB
cnb = CategoricalNB()
cnb.fit(X_train, y_train)
cnb_pred = cnb.predict(X_test)
sum(cnb_pred == y_test) / 207



0.8840579710144928

In [116]:
# create dataframe with columns: y_hat, cnb_pred, and y_hat == cnb_pred
df = pd.DataFrame({'y_hat': y_hat, 'cnb_pred': cnb_pred, 'y_hat == cnb_pred': y_hat == cnb_pred})
df.head()

Unnamed: 0,y_hat,cnb_pred,y_hat == cnb_pred
0,R,R,True
1,L,L,True
2,B,L,False
3,B,R,False
4,B,L,False


In [107]:
np.unique(cnb_pred)

array(['L', 'R'], dtype='<U1')

In [108]:
np.unique(y_hat)

array(['B', 'L', 'R'], dtype='<U1')