# Naive Bayes Classifier

### Define the model

In [6]:
import numpy as np

class NaiveBayes():
    """
    Naive Bayes classifier for categorical (discrete) features.

    All probabilities are plain relative frequencies (maximum-likelihood
    estimates, no smoothing), so a feature value never observed together
    with a label gets conditional probability 0 for that label.

    Attributes:
    prior_prob -- dict {label: P(Y=label)}
    x_variety -- list holding, for every feature, the sorted unique values
                 seen in the whole training set
    con_probs -- dict {label: matrix[n_features, max_variety]} where entry
                 [i, j] is P(X_i = x_variety[i][j] | Y=label)

    """

    def __init__(self):
        """
        Initialize the model; every attribute stays None until fit() is called

        """
        self.prior_prob = None
        self.x_variety = None
        self.con_probs = None

    def fit(self, X_train, Y_train):
        """
        Fit model with given features and labels

        Arguments:
        X_train -- input training dataset,  shape = [n_samples, n_features]
        Y_train -- labels of training data, shape = [n_samples]

        """
        X_train = np.asarray(X_train)
        Y_train = np.asarray(Y_train)

        # Prior probability of every label
        self.prior_prob = self.compute_prior_prob(Y_train)
        # Number of input dimensions
        dim_x = X_train.shape[1]

        # Sorted unique values of every feature over the WHOLE training set;
        # the position of a value in this list is its column in prob_matrix
        self.x_variety = [np.unique(X_train[:, i]) for i in range(dim_x)]
        # The widest feature determines the number of columns of prob_matrix
        max_variety = max((len(values) for values in self.x_variety), default=0)

        # Conditional probability of x under y: {label: prob_matrix[n_dims, n_variety]}
        self.con_probs = {}
        for label in np.unique(Y_train):
            x_under_label = X_train[Y_train == label]
            prob_matrix = np.zeros([dim_x, max_variety])
            for i, values in enumerate(self.x_variety):
                for j in range(len(values)):
                    # Pass the global value list so that column j means the
                    # same feature value for every label, even when this
                    # label's samples do not contain all values
                    prob_matrix[i, j] = self.compute_conditional_prob(
                        x_under_label, dim_x=i, class_x=j, variety=values)
            self.con_probs[label] = prob_matrix

        return None

    def predict(self, x):
        """
        Predict the label of input query

        Argument:
        x -- feature vector of input query

        Return:
        prediction -- the most likely result of Y (on ties, the first label
                      in the iteration order of con_probs)

        """
        # Score every label and pick the one with the largest score
        d = self.compute_posterior_prob(x)
        prediction = max(d, key=d.get)

        return prediction

    def compute_prior_prob(self, data_list):
        """
        Compute the prior probability of different type of Y==ck

        Argument:
        data_list -- input data,  shape = [n_samples]

        Return:
        d -- dict = {type of y: prior probability of corresponding y};
             empty when data_list is empty

        """
        # Count first and divide once: summing 1/num repeatedly accumulates
        # floating point error
        counts = {}
        for label in data_list:
            counts[label] = counts.get(label, 0) + 1
        num = len(data_list)

        return {label: count / num for label, count in counts.items()}

    def compute_conditional_prob(self, x, dim_x, class_x, variety=None):
        """
        Compute the conditional probability P(Xj=xi|Y=ck)

        Arguments:
        x -- samples that all belong to Y=ck, shape = [n_samples, n_features]
        dim_x -- the feature index j in P(Xj=xi|Y=ck)
        class_x -- the index i of the feature value inside `variety`
        variety -- ordered candidate values of feature j; defaults to the
                   unique values found in column j of x

        Return:
        con_prob -- the conditional probability P(Xj=xi|Y=ck); 0 when class_x
                    is beyond the end of `variety` or x has no samples

        """
        column = np.asarray(x)[:, dim_x]
        if variety is None:
            variety = np.unique(column)

        # Out-of-range value index or no samples: probability is 0
        if class_x >= len(variety) or len(column) == 0:
            return 0.0

        # Relative frequency of the requested value inside the column
        return np.count_nonzero(column == variety[class_x]) / len(column)

    def compute_posterior_prob(self, x):
        """
        Compute the (unnormalized) posterior of each P(Y=ck|X=x)

        Argument:
        x -- feature vector of query point

        Return:
        posterior_prob -- dict {label: P(Y=ck) * prod_j P(Xj=xj|Y=ck)};
                          the common denominator P(X=x) is not divided out

        Raises:
        RuntimeError -- if the model has not been fitted yet

        """
        if getattr(self, "con_probs", None) is None:
            raise RuntimeError("fit() must be called before computing posteriors")

        # Coerce the query the same way the training data was coerced, so a
        # mixed list such as [2, 'S'] compares equal to the stored values
        x = np.asarray(x)

        posterior_prob = {}
        for label, prob_matrix in self.con_probs.items():
            likelihood = 1.0
            for i, value in enumerate(x):
                # Column of this value inside the global value list of feature i
                matches = np.flatnonzero(np.asarray(self.x_variety[i] == value))
                if matches.size == 0:
                    # Value never seen during training: probability is 0
                    likelihood = 0.0
                    break
                likelihood *= prob_matrix[i, matches[0]]
            posterior_prob[label] = self.prior_prob[label] * float(likelihood)

        return posterior_prob
    

### Test

*P50 例 4.1*  
  
试由以下表格的训练数据学习一个朴素贝叶斯分类器并确定x=(2,S)<sup>T</sup>的类标记y. 表中X<sup>(1)</sup>, X<sup>(2)</sup>为特征, 取值的集合分别为A<sub>1</sub>={1,2,3}, A<sub>2</sub>={S,M,L}, Y为类标记, Y∈C={1,-1}.


|     | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
| :-- | --: | --: | --: | --: | --: | --: | --: | --: | --: | --: | --: | --: | --: | --: | --: |
|**X<sup>(1)</sup>**| 1 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 | 2 | 3 | 3 | 3 | 3 | 3 |
|**X<sup>(2)</sup>**| S | M | M | S | S | S | M | M | L | L | L | M | M | L | L | 
|**Y**| -1| -1| 1 | 1 | -1| -1| -1| 1 | 1 | 1 | 1 | 1 | 1 | 1 | -1| 

- generate artificial samples

In [2]:
# Training set of Example 4.1: one (X1, X2) pair per sample, grouped by X1.
# Mixing ints and strings makes NumPy store every entry as a string.
x = np.array([
    (1, 'S'), (1, 'M'), (1, 'M'), (1, 'S'), (1, 'S'),
    (2, 'S'), (2, 'M'), (2, 'M'), (2, 'L'), (2, 'L'),
    (3, 'L'), (3, 'M'), (3, 'M'), (3, 'L'), (3, 'L'),
])
# Class label of every sample, in the same order as the rows of x
y = np.array([
    -1, -1, 1, 1, -1,
    -1, -1, 1, 1, 1,
    1, 1, 1, 1, -1,
])

- creating and training model

In [54]:
# Build the classifier, then estimate its probabilities from the samples
model = NaiveBayes()
model.fit(x, y)

- predicting

In [57]:
# Query point x = (2, S) from the example; NumPy stores both entries as strings
query = np.array([2, 'S'])
# Most probable class label for the query
pred = model.predict(x=query)
print('Prediction: class = ', pred)

Prediction: class =  -1
