# Chapter 4 朴素贝叶斯
## 贝叶斯模型

In [1]:
import pandas as pd
import numpy as np
class NaiveBayes():
    """Naive Bayes classifier for discrete feature values.

    ``fit`` tabulates label counts and per-feature (value, label) counts.
    Prediction multiplies the class prior by one conditional probability per
    feature and returns the label with the largest product.  Two estimators
    are offered: plain relative frequencies (``maxlikelihood*``) and
    additively smoothed frequencies (``bayes*``).

    Attributes set by ``fit``:
        fea_names:   list of feature column names.
        total:       number of training rows.
        y_num_df:    DataFrame indexed by label with a single ``y_num`` column.
        y_kind:      number of distinct labels.
        num_dict:    feature name -> DataFrame indexed by (value, label) with
                     ``x_num`` and ``y_num`` columns.
        x_kind_dict: feature name -> number of distinct training values.
    """

    def fit(self, x_df, y):
        """Count labels and (feature value, label) pairs in the training data.

        Args:
            x_df: DataFrame with one column per discrete feature.
            y: labels, one per row of ``x_df``.

        Raises:
            ValueError: if ``x_df`` already has a column named ``'labels'``,
                which is the name used internally for the target column.
        """
        if 'labels' in x_df.columns:
            raise ValueError("feature column name 'labels' is reserved for the target")
        self.fea_names = list(x_df.columns)
        df = x_df.copy()
        df['labels'] = y
        self.total = df.shape[0]
        self.y_num_df = pd.DataFrame(df.groupby('labels')['labels'].count())
        self.y_num_df.columns = ['y_num']
        self.y_kind = self.y_num_df.shape[0]
        self.num_dict = {}
        self.x_kind_dict = {}
        for col in self.fea_names:
            x_num_df = pd.DataFrame(df.groupby([col, 'labels'])[col].count())
            x_num_df.columns = ['x_num']
            self.num_dict[col] = x_num_df.join(self.y_num_df, how='outer')
            self.x_kind_dict[col] = df[col].unique().shape[0]

    def _label_counts(self, col, value):
        """Return, for every label, how many training rows had ``value`` in ``col``.

        The result is always indexed by the full label set; combinations that
        never occurred in training (including values never seen at all) count
        as 0.  The fitted tables are only read, never modified.
        """
        labels = self.y_num_df.index
        counts = self.num_dict[col]['x_num']
        try:
            per_label = counts.xs(value, level=0)
        except KeyError:
            # Value absent from the training data: zero occurrences everywhere.
            return pd.Series(0, index=labels)
        return per_label.reindex(labels).fillna(0)

    def _class_scores(self, x, lr):
        """Return prior * product of conditionals for each label, smoothed by ``lr``."""
        y_num = self.y_num_df['y_num']
        scores = y_num.add(lr).div(self.total + lr * self.y_kind)
        for col in self.fea_names:
            x_num = self._label_counts(col, x[col])
            cond = x_num.add(lr).div(y_num.add(lr * self.x_kind_dict[col]))
            # 0/0 (possible only without smoothing) is treated as probability 0.
            scores = scores.mul(cond.fillna(0))
        return scores

    def maxlikelihood(self, x):
        """Classify one sample using unsmoothed relative frequencies.

        Args:
            x: mapping (e.g. a DataFrame row) from feature name to value.

        Returns:
            The label with the highest score.  A (value, label) pair that was
            never observed contributes probability 0; if every label scores 0
            the first label is returned.
        """
        return self._class_scores(x, 0).idxmax()

    def maxlikelihood_predict(self, x_df):
        """Apply ``maxlikelihood`` to every row of ``x_df``; return an array of labels."""
        predict = x_df.apply(self.maxlikelihood, axis=1)
        return predict.values

    def bayes(self, x, lr):
        """Classify one sample using additively smoothed frequencies.

        The prior is ``(y_num + lr) / (total + lr * y_kind)`` and each
        conditional is ``(x_num + lr) / (y_num + lr * x_kind)``, where
        ``x_kind`` is the number of distinct training values of the feature.

        Args:
            x: mapping (e.g. a DataFrame row) from feature name to value.
            lr: smoothing constant added to every count (1 gives Laplace
                smoothing).

        Returns:
            The label with the highest smoothed score.
        """
        return self._class_scores(x, lr).idxmax()

    def bayes_predict(self, x_df, lr):
        """Apply ``bayes`` with smoothing ``lr`` to every row of ``x_df``."""
        predict = x_df.apply(self.bayes, axis=1, args=(lr,))
        return predict.values

    def get_accuracy(self, gt_y, pre_y):
        """Return the fraction of positions where ``pre_y`` equals ``gt_y``.

        Labels are compared for equality, so non-numeric labels are accepted.
        """
        gt = np.asarray(gt_y)
        pre = np.asarray(pre_y)
        err = int(np.count_nonzero(gt != pre))
        return 1 - err / gt.shape[0]

## 模型测试
### 准备数据

In [6]:
%matplotlib widget
from sklearn import datasets
import matplotlib.pyplot as plt
import seaborn as sns

# Load the iris samples together with their feature and class names.
data = datasets.load_iris()
x_data, y_data = data.data, data.target
x_names, y_names = data.feature_names, data.target_names

# Append the target as the last column so a single in-place shuffle keeps
# every row paired with its label.
data_np = np.column_stack((x_data, y_data))
np.random.shuffle(data_np)
data_cols = list(x_names) + ['labels']
df = pd.DataFrame(data_np, columns=data_cols)

# Discretise each continuous feature into 5 bins coded 0..4.
for col in x_names:
    df[col] = pd.cut(df[col], bins=5, labels=range(5))

# First 120 shuffled rows train the model, the remainder is held out.
train_num = 120
train_x, test_x = df.iloc[:train_num, :4], df.iloc[train_num:, :4]
train_y = df.iloc[:train_num, 4:].values.reshape(train_num)
test_y = df.iloc[train_num:, 4:].values.reshape(-1)
sns.pairplot(data=df, hue='labels', x_vars=x_names, y_vars=x_names)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<seaborn.axisgrid.PairGrid at 0x12b4b6650>

### 最大似然估计

In [7]:
# Fit on the training split, then classify the held-out rows using
# unsmoothed relative-frequency estimates.
model = NaiveBayes()
model.fit(train_x, train_y)
pre_y = model.maxlikelihood_predict(test_x)
accuracy = model.get_accuracy(test_y, pre_y)
print('accuracy: %.2f' % accuracy)


accuracy: 0.93


### 贝叶斯估计

In [8]:
# Fit a fresh model, then classify the held-out rows with a smoothing
# constant of 1 added to every count.
model = NaiveBayes()
model.fit(train_x, train_y)
pre_y = model.bayes_predict(test_x, 1)
accuracy = model.get_accuracy(test_y, pre_y)
print('accuracy: %.2f' % accuracy)

accuracy: 0.93
