# 自行開發高斯單純貝氏分類器
### 程式修改自[Implementing Naive Bayes Algorithm from Scratch](https://towardsdatascience.com/implementing-naive-bayes-algorithm-from-scratch-python-c6880cfc9c41)

## 載入相關套件

In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

## NaiveBayes 演算法

In [2]:
# 貝氏定理 P(y|X) = P(X|y) * P(y) / P(X)
class NaiveBayesClassifier():
    """Gaussian naive Bayes classifier implemented with NumPy.

    Each feature is modelled, per class, as an independent normal
    distribution.  Prediction picks the class that maximises
    ``log P(y) + sum_j log P(x_j | y)``; the evidence ``P(X)`` from
    Bayes' theorem is the same for every class and is therefore omitted.

    Attributes set by :meth:`fit`:
        classes: sorted unique labels found in the training target.
        count: number of classes.
        feature_nums: number of feature columns.
        rows: number of training samples.
        mean: per-class feature means, shape ``(count, feature_nums)``.
        var: per-class feature variances (smoothed), same shape as ``mean``.
        prior: per-class relative frequencies ``P(y)``, shape ``(count,)``.
    """

    def __init__(self, var_smoothing=1e-9):
        """Create an unfitted classifier.

        Args:
            var_smoothing: fraction of the largest overall feature variance
                that is added to every per-class variance.  This keeps the
                variance strictly positive when a feature is constant
                within a class, which would otherwise divide by zero.
        """
        self.var_smoothing = var_smoothing

    def _log_gaussian_density(self, class_idx, x):
        """Return ``log N(x; mean, var)`` per feature for class ``class_idx``.

        Working in log space avoids taking ``log(0)`` when the density
        itself underflows for samples far away from the class mean.
        """
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        # log pdf = -1/2 * log(2*pi*var) - (x - mean)^2 / (2*var)
        return -0.5 * np.log(2 * np.pi * var) - ((x - mean) ** 2) / (2 * var)

    def gaussian_density(self, class_idx, x):
        """Return the class-conditional likelihood ``P(x_j | y)`` per feature.

        Normal pdf: ``1 / sqrt(2*pi*var) * exp(-(x - mean)^2 / (2*var))``.

        Args:
            class_idx: index into ``self.classes`` (not the label itself).
            x: one sample (or scalar) broadcastable against the class mean.

        Returns:
            Array of densities, one per feature.
        """
        return np.exp(self._log_gaussian_density(class_idx, x))

    def calc_posterior(self, x):
        """Return the label with the highest log-posterior for sample ``x``.

        Args:
            x: a single sample with ``feature_nums`` values.

        Returns:
            The element of ``self.classes`` with the largest score.
        """
        x = np.asarray(x, dtype=float)
        # Sum of logs instead of a product of probabilities: numerically
        # stable and order-preserving, so argmax is unaffected.
        log_posteriors = [
            np.log(self.prior[i]) + np.sum(self._log_gaussian_density(i, x))
            for i in range(self.count)
        ]
        return self.classes[np.argmax(log_posteriors)]

    def fit(self, features, target):
        """Estimate per-class means, variances and priors.

        Args:
            features: array-like of shape ``(n_samples, n_features)``.
            target: array-like with ``n_samples`` labels (numeric or string).

        Raises:
            ValueError: if ``features`` is not a non-empty 2-D array or the
                number of labels does not match the number of samples.
        """
        features = np.asarray(features, dtype=float)
        target = np.asarray(target).ravel()
        if features.ndim != 2 or features.size == 0:
            raise ValueError("features must be a non-empty 2-D array")
        if target.shape[0] != features.shape[0]:
            raise ValueError("features and target have different lengths")

        self.classes = np.unique(target)
        self.count = len(self.classes)
        self.rows, self.feature_nums = features.shape

        # Select each class's rows with a boolean mask so the labels are
        # never coerced to float (string labels keep working).
        per_class = [features[target == label] for label in self.classes]

        # Variance floor: a feature that is constant within a class has
        # zero variance and would make the density undefined.
        epsilon = self.var_smoothing * features.var(axis=0).max()
        if epsilon <= 0:
            # Every feature is constant over the whole training set.
            epsilon = self.var_smoothing

        self.mean = np.array([rows.mean(axis=0) for rows in per_class])
        self.var = np.array([rows.var(axis=0) for rows in per_class]) + epsilon
        # Prior P(y): relative frequency of each class in the training data.
        self.prior = np.array([len(rows) for rows in per_class]) / self.rows

    def predict(self, features):
        """Predict a label for every row of ``features``.

        Args:
            features: array-like of shape ``(n_samples, n_features)``.

        Returns:
            A list with one predicted label per sample.
        """
        features = np.asarray(features, dtype=float)
        return [self.calc_posterior(sample) for sample in features]

## 載入資料集

In [3]:
# Load the iris dataset as a (features, labels) pair rather than a Bunch object
X, y = datasets.load_iris(return_X_y=True)

## 資料分割

In [4]:
# Hold out 20% of the samples for testing. No random_state is passed, so the
# split (and therefore the accuracy reported below) changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

## 選擇演算法

In [5]:
# Instantiate the from-scratch classifier defined above
clf = NaiveBayesClassifier()

## 模型訓練

In [6]:
# Estimate the model's parameters from the training split
clf.fit(X_train, y_train)

## 模型評估

In [7]:
# Compute accuracy on the held-out test split and print it as a percentage
y_pred = clf.predict(X_test)
print(f'{accuracy_score(y_test, y_pred)*100:.2f}%') 

96.67%
