### 贝叶斯模型实现

In [1]:
import numpy as np


class GaussianNB():
    """Gaussian naive Bayes classifier for continuous features.

    Each feature is modelled, per class, as an independent normal
    distribution whose mean and standard deviation are estimated in ``fit``.
    ``predict`` picks the class that maximises
    ``log P(class) + sum_k log N(x_k | mean_k, std_k)``.

    Attributes set by ``fit``:
        classes_: sorted array of the distinct labels seen in ``y``.
        y_prior: list with the prior of each class, rounded to 6 decimals
            (kept for inspection; same order as ``classes_``).
        features_param: per class, a list of ``(mean, std)`` tuples, one per
            feature, rounded to 6 decimals (kept for inspection).
    """

    # Lower bound for the per-feature standard deviation. A feature that is
    # constant within a class has std == 0, which would otherwise turn the
    # Gaussian density into a division by zero.
    _MIN_STD = 1e-9

    def fit(self, X, y):
        """Estimate class priors and per-class feature statistics.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of length n_samples holding the class labels.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        self.classes_ = np.unique(y)  # sorted distinct labels

        self.y_prior = []
        self.features_param = []
        means, stds, log_priors = [], [], []

        for label in self.classes_:
            mask = y == label
            features_data = X[mask]
            features_mean = np.mean(features_data, axis=0)
            features_std = np.std(features_data, axis=0)
            prior = np.mean(mask)

            # Rounded copies are only for display; prediction uses the
            # unrounded values below so no precision is thrown away.
            self.y_prior.append(round(prior, 6))
            self.features_param.append(
                [(round(avg, 6), round(std, 6)) for avg, std in zip(features_mean, features_std)]
            )

            means.append(features_mean)
            stds.append(np.maximum(features_std, self._MIN_STD))
            log_priors.append(np.log(prior))

        self._means = np.array(means)          # (n_classes, n_features)
        self._stds = np.array(stds)            # (n_classes, n_features)
        self._log_prior = np.array(log_priors)  # (n_classes,)

    def predict(self, x):
        """Predict the class label of every row of ``x``.

        Scores are accumulated in log space: multiplying many small densities
        (or rounding them) underflows to 0 for every class, which makes the
        arg-max meaningless.

        Args:
            x: 2-D array-like of shape (n_samples, n_features).

        Returns:
            1-D numpy array with the predicted label (an element of
            ``classes_``) for each sample. Ties go to the smallest label.
        """
        x = np.asarray(x, dtype=float)

        # Broadcast (n_samples, 1, n_features) against (n_classes, n_features).
        diff = x[:, np.newaxis, :] - self._means
        log_density = (
            -np.log(self._stds)
            - 0.5 * np.log(2 * np.pi)
            - diff ** 2 / (2 * self._stds ** 2)
        )
        joint_log_prob = self._log_prior + log_density.sum(axis=2)

        return self.classes_[np.argmax(joint_log_prob, axis=1)]

    def gauss_pro(self, v, miu, sigma):
        """Gaussian probability density of ``v``, rounded to 6 decimals.

        Kept as an inspection helper; ``predict`` does not use it because the
        rounding discards the information needed to rank unlikely classes.

        Args:
            v: value at which the density is evaluated.
            miu: mean of the distribution.
            sigma: standard deviation of the distribution (must be non-zero).
        """
        part1 = 1 / (sigma * np.sqrt(2 * np.pi))
        part2 = np.exp(-1 * (v - miu) ** 2 / (2 * sigma ** 2))
        return round(part1 * part2, 6)

### 使用

In [2]:
from sklearn import datasets

# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()
# Feature matrix: one row per sample, four features per row.
X = iris.data
# Label vector: the class of the sample at the same index in X.
y = iris.target

X 为训练数据，每一行表示一个样本，每个样本里有 4 个特征

In [3]:
X[:10]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3.0, 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5.0, 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5.0, 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1]])

y 为标签数据，每个值表示对应索引的具体分类

In [4]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

创建模型并用训练数据拟合模型

In [5]:
# Instantiate the classifier defined above and fit it on the full dataset.
gnb = GaussianNB()
gnb.fit(X, y)

预测数据。一般情况下会把数据划分为训练集和测试集，这里为了简单，直接用训练集来测试预测效果

In [6]:
# Predict on the same samples the model was fitted on (no held-out split here).
res = gnb.predict(X)
print(res)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 2 2 2
 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


### 步骤分解

查询标签数据中指定分类的索引

In [7]:
# Indices of the samples whose label is 1; np.where returns a tuple of index arrays.
pos = np.where(y == 1)
pos

(array([50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
        67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
        84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]),)

得到该分类的特征数据

In [8]:
# Rows of X belonging to the selected class; show the first ten.
features_data = X[pos]
features_data[:10]

array([[7.0, 3.2, 4.7, 1.4],
       [6.4, 3.2, 4.5, 1.5],
       [6.9, 3.1, 4.9, 1.5],
       [5.5, 2.3, 4.0, 1.3],
       [6.5, 2.8, 4.6, 1.5],
       [5.7, 2.8, 4.5, 1.3],
       [6.3, 3.3, 4.7, 1.6],
       [4.9, 2.4, 3.3, 1.0],
       [6.6, 2.9, 4.6, 1.3],
       [5.2, 2.7, 3.9, 1.4]])

求该分类每个特征的均值

In [9]:
# Column-wise (per-feature) mean over the samples of the selected class.
features_mean = np.mean(features_data, axis=0)
features_mean

array([5.936, 2.7700000000000005, 4.26, 1.3259999999999998])

求该分类每个特征的标准差

In [10]:
# Column-wise (per-feature) standard deviation over the samples of the selected class.
features_std = np.std(features_data, axis=0)
features_std

array([0.5109833656783752, 0.31064449134018135, 0.4651881339845204,
       0.19576516544063702])

得到该分类每个特征的均值，标准差列表

In [11]:
# Pair each feature's mean with its standard deviation.
list(zip(features_mean, features_std))

[(5.936, 0.5109833656783752),
 (2.7700000000000005, 0.31064449134018135),
 (4.26, 0.4651881339845204),
 (1.3259999999999998, 0.19576516544063702)]

In [12]:
# Same (mean, std) pairs, rounded to 6 decimals.
param = [(round(avg, 6), round(std, 6)) for avg, std in zip(features_mean, features_std)]
param

[(5.936, 0.510983), (2.77, 0.310644), (4.26, 0.465188), (1.326, 0.195765)]

上面是得到某一类样本的每个特征的统计数据
下面得到所有分类的每个特征的统计数据

In [13]:
# Build the per-class statistics table: one entry per class (in sorted label
# order), each entry a list of (mean, std) tuples, one tuple per feature.
features_param=[]
for i in sorted(set(y)):
    # Select the rows of X labelled with class i.
    pos = np.where(y == i)
    features_data = X[pos]
    # Per-feature mean and standard deviation within the class.
    features_mean = np.mean(features_data, axis=0)
    features_std = np.std(features_data, axis=0)

    # Round both statistics to 6 decimals before storing them.
    param = [(round(avg, 6), round(std, 6)) for avg, std in zip(features_mean, features_std)]
    features_param.append(param)

每一行表示一个分类，每个分类里有4个特征的统计信息，每个统计信息包括均值和标准差

In [14]:
features_param

[[(5.006, 0.348947), (3.418, 0.377195), (1.464, 0.171767), (0.244, 0.106132)],
 [(5.936, 0.510983), (2.77, 0.310644), (4.26, 0.465188), (1.326, 0.195765)],
 [(6.588, 0.629489), (2.974, 0.319255), (5.552, 0.546348), (2.026, 0.27189)]]

得到样本数量

In [15]:
X.shape[0]

150

定义函数计算正态分布的概率密度

In [16]:
def gauss_pro(v, miu, sigma):
    """Return the normal-distribution density at ``v``, rounded to 6 decimals.

    Args:
        v: point at which the density is evaluated.
        miu: mean of the distribution.
        sigma: standard deviation of the distribution.
    """
    # Normalising factor 1 / (sigma * sqrt(2 * pi)).
    scale = 1 / (sigma * np.sqrt(2 * np.pi))
    # Exponent -(v - miu)^2 / (2 * sigma^2).
    exponent = -1 * (v - miu) ** 2 / (2 * sigma ** 2)
    density = scale * np.exp(exponent)
    return round(density, 6)

准备参数，测试概率密度函数

In [17]:
i = 51 # index of the sample under inspection
k = 0  # index of the feature under inspection
X[i][k] # value of feature k for sample i

6.4

In [18]:
j = 0                       # index of the class under inspection
x_param = features_param[j] # (mean, std) statistics of every feature for class j
x_param

[(5.006, 0.348947), (3.418, 0.377195), (1.464, 0.171767), (0.244, 0.106132)]

In [19]:
# Mean of feature k within this class
x_param[k][0]

5.006

In [20]:
# Standard deviation of feature k within this class
x_param[k][1]

0.348947

In [21]:
# Density of sample i's feature k under class j's Gaussian for that feature
gauss_pro(X[i][k],x_param[k][0],x_param[k][1])

0.000391

样本中每个分类的占比

In [22]:
# Prior of each class: fraction of labels equal to that class, rounded to
# 6 decimals, listed in sorted label order.
y_prior = [round(sum(y == i) / len(y), 6) for i in sorted(set(y))]
y_prior

[0.333333, 0.333333, 0.333333]

计算第 i 个样本在每个分类下的未归一化后验概率（先验概率 × 各特征条件概率密度的乘积）

In [23]:
# For sample i, compute prior * product of per-feature densities for every
# class, printing each intermediate value along the way.
bayes_prob = []
for j in range(len(y_prior)):
    x_param = features_param[j]  # (mean, std) of every feature for class j
    y_param = y_prior[j]         # prior of class j
    xi_conditional_prob = 1      # running product of the feature densities
    print('i=%s, j=%s, x_param=%s, y_param=%s' % (i, j, x_param, y_param))

    for k in range(len(x_param)):        
        # Multiply in the (rounded) density of feature k under class j.
        xi_conditional_prob *= gauss_pro(X[i][k], x_param[k][0], x_param[k][1])
        print('\tk=%s xi_conditional_prob=%s' % (k, xi_conditional_prob))
    # Store the score of class j, rounded to 6 decimals.
    bayes_prob.append(round(y_param * xi_conditional_prob, 6))
    print('prob=', round(y_param * xi_conditional_prob, 6))

i=51, j=0, x_param=[(5.006, 0.348947), (3.418, 0.377195), (1.464, 0.171767), (0.244, 0.106132)], y_param=0.333333
	k=0 xi_conditional_prob=0.000391
	k=1 xi_conditional_prob=0.00034993561600000004
	k=2 xi_conditional_prob=0.0
	k=3 xi_conditional_prob=0.0
prob= 0.0
i=51, j=1, x_param=[(5.936, 0.510983), (2.77, 0.310644), (4.26, 0.465188), (1.326, 0.195765)], y_param=0.333333
	k=0 xi_conditional_prob=0.516955
	k=1 xi_conditional_prob=0.254701143725
	k=2 xi_conditional_prob=0.1912112802263818
	k=3 xi_conditional_prob=0.2625074654392719
prob= 0.087502
i=51, j=2, x_param=[(6.588, 0.629489), (2.974, 0.319255), (5.552, 0.546348), (2.026, 0.27189)], y_param=0.333333
	k=0 xi_conditional_prob=0.606113
	k=1 xi_conditional_prob=0.589534597224
	k=2 xi_conditional_prob=0.06743037769588389
	k=3 xi_conditional_prob=0.015228409068460723
prob= 0.005076


得到该样本在每个分类下的概率值

In [24]:
bayes_prob

[0.0, 0.087502, 0.005076]

得到最大概率的分类

In [25]:
# Position of the first class whose score equals the maximum score.
np.where(bayes_prob == np.max(bayes_prob))[0][0]

1