# 4.朴素贝叶斯
## 调包

In [1]:
import numpy as np

## 4.1. 原理

1.目标：求出样本$W$关于各个类$c_{i}$的后验概率$p(c_{i}|W)$，并将后验概率最大的类作为$W$的预测类别

2.用贝叶斯公式求该概率（分母$p(W)$对所有类都相同，因此比较各类的后验概率时只需比较分子$p(W|c_{i})p(c_{i})$）：
$$p(c_{i}|W)=\frac{p(W|c_{i})p(c_{i})}{p(W)}$$

3.朴素假设：在给定类别$c_{i}$的条件下，样本的各个特征$w_{j}$之间相互独立
$$p(W|c_{i})=p(w_{0},w_{1},w_{2},...,w_{n}|c_{i})=p(w_{0}|c_{i})\cdot p(w_{1}|c_{i})\cdot p(w_{2}|c_{i})\cdot ...\cdot p(w_{n}|c_{i})$$

## 4.2.简单的朴素贝叶斯应用
### 4.2.1.读取数据

In [3]:
# Toy training set: 16 samples with two categorical features each.
# Each row mixes an int with a str, so NumPy stores the whole array with a
# string dtype; feature 0 therefore holds the strings '1', '2', '3'.
firstFeature = [1] * 5 + [2] * 6 + [3] * 5
secondFeature = list("SLMMS" "LSSLLM" "MLSMM")
dataSet = np.array([[a, b] for a, b in zip(firstFeature, secondFeature)])

# Class labels (+1 / -1), kept as a single-row 2-D array of shape (1, 16).
labelRow = [-1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, 1]
labels = np.array([labelRow])

### 4.2.2.计算$p(c_{i})$

In [34]:
# Estimate the class priors p(c_i) as relative label frequencies.
#
# categorys : set of the distinct label values found in `labels`
# pc        : dict mapping str(label value) -> prior probability of that class
#
# The builtin int() is used instead of np.int, which was deprecated in
# NumPy 1.20 and removed in NumPy 1.24 (it was only an alias of int).

# Collect the distinct label values; ravel() makes this independent of
# whether `labels` is stored as a row vector or a flat array.
categorys = set(labels.ravel())

pc = {}
# Prior of each class = (number of samples with that label) / (number of samples)
for category in categorys:
    pc[str(category)] = np.count_nonzero(labels == int(category)) / labels.size
    print("p(c = {}) = {}".format(int(category), pc[str(category)]))

p(c = 1) = 0.625
p(c = -1) = 0.375


### 4.2.3.计算$p(w_{j}|c_{i})$

In [22]:
# Estimate the conditional probabilities p(w_j | c_i) by counting.
#
# features : dict "feature<i>" -> set of distinct values seen in column i
# prob     : dict "p(<value>|<class>)" -> fraction of samples of that class
#            whose feature takes that value
#
# NOTE(review): the keys of `prob` do not include the feature index, so two
# features that share a value would overwrite each other's entry. The key
# format is left unchanged because the prediction cell builds the same keys.
numOfData = dataSet.shape[0]
numOfFeat = dataSet.shape[1]

features = {}
# Collect the distinct values of every feature column. The whole cell value is
# taken as one element; calling set() on a single string cell would split a
# multi-character value such as '10' into its characters.
for i in range(numOfFeat):
    features["feature"+str(i)] = set(dataSet[:, i].tolist())

prob = {}
# For every class ...
for category in categorys:
    # Boolean mask over the samples of this class. int() replaces np.int,
    # which no longer exists in NumPy >= 1.24.
    classMask = (labels == int(category)).ravel()
    classCount = np.count_nonzero(classMask)
    # ... every feature ...
    for i in range(numOfFeat):
        classColumn = dataSet[classMask, i]
        # ... and every value that feature can take
        for feature in features["feature"+str(i)]:
            prob['p('+str(feature)+'|'+str(category)+')'] = np.count_nonzero(classColumn == feature) / classCount
print(prob)

{'p(1|1)': 0.20000000000000001, 'p(3|1)': 0.40000000000000002, 'p(2|1)': 0.40000000000000002, 'p(S|1)': 0.20000000000000001, 'p(L|1)': 0.40000000000000002, 'p(M|1)': 0.40000000000000002, 'p(1|-1)': 0.5, 'p(3|-1)': 0.16666666666666666, 'p(2|-1)': 0.33333333333333331, 'p(S|-1)': 0.5, 'p(L|-1)': 0.16666666666666666, 'p(M|-1)': 0.33333333333333331}


### 4.2.4.计算$p(W|c_{i})p(c_{i})$并比较概率的相对大小

In [43]:
# Classify one sample typed in by the user.
#
# For every class c the unnormalised posterior p(W|c) * p(c) is computed from
# the tables `prob` and `pc`; the class with the largest score is reported.
# After this cell `resultProb` is a list of (class, score) pairs sorted by
# descending score.

# Read one value per feature. Values stay strings because the keys of `prob`
# are built from the string form of each feature value; surrounding
# whitespace is stripped so that ' S' still matches 'S'.
inputFeat = {}
for i in range(numOfFeat):
    inputFeat['feature'+str(i)] = input('请输入特征 {} = '.format(i)).strip()

resultProb = {}
for category in categorys:
    pWc = 1
    for i in range(numOfFeat):
        value = inputFeat['feature'+str(i)]
        key = 'p('+value+'|'+str(category)+')'
        # A value with no entry in the table would otherwise surface as a bare
        # KeyError on the internal key; report which input caused it instead.
        if key not in prob:
            raise KeyError('特征 {} 的取值 {!r} 未在训练数据中出现'.format(i, value))
        pWc *= prob[key]
    # float() keeps the printed scores plain numbers even when the table
    # entries are NumPy scalars.
    resultProb[str(category)] = float(pWc * pc[str(category)])

# Highest score first; a lambda key avoids importing a module mid-notebook.
resultProb = sorted(resultProb.items(), key=lambda item: item[1], reverse=True)
print(resultProb)
print("最有可能的类为 {}".format(resultProb[0][0]))

请输入特征 0 = 2
请输入特征 1 = S
[('-1', 0.0625), ('1', 0.05000000000000001)]
最有可能的类为 -1


当输入特征为（2， S）时，最有可能的分类为 -1