# 贝叶斯决策基本概念

## 基本原理
对每个样本x, 选择能使后验概率$P(c|x)$最大的类别标记

## 条件概率
![](./img/5_1.png)
![](./img/5_2.png)

# 朴素贝叶斯决策模型
+ 朴素贝叶斯模型采用了“属性条件独立性假设”，即在类别已知的条件下，假设所有属性相互独立

$$
\begin{array}{l}
p\left(\mathrm{y}_{i} \mid x\right)=\frac{p\left(x \mid \mathrm{y}_{i}\right) p\left(\mathrm{y}_{i}\right)}{p(x)} \quad \text { 由于 } p(x) \text { 与类别无关，结合属性条件独立性假设，可转换为 }: \\
p\left(\mathrm{y}_{i} \mid x\right) \propto p\left(x_{1} \mid \mathrm{y}_{i}\right) p\left(x_{2} \mid \mathrm{y}_{i}\right) \ldots p\left(x_{n} \mid \mathrm{y}_{i}\right) p\left(\mathrm{y}_{i}\right)
\end{array}
$$

![](./img/5_3.png)

# 朴素贝叶斯代码实现

In [2]:
import numpy as np

def loaddata():
    """Build the 15-sample toy training set used by the hand-written classifier.

    Returns:
        tuple: ``(X, y)``. ``X`` is a ``(15, 2)`` NumPy array whose rows pair
        a number (1-3) with a size letter (``'S'``, ``'M'`` or ``'L'``);
        because each row mixes an int with a str, NumPy stores every entry
        as a string. ``y`` is the matching array of ``-1`` / ``1`` labels.
    """
    numbers = [1] * 5 + [2] * 5 + [3] * 5
    sizes = list("SMMSS" "SMMLL" "LMMLL")
    # Rows are built as mixed [int, str] lists, exactly like a literal table.
    rows = [[number, size] for number, size in zip(numbers, sizes)]
    labels = [-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1]
    return np.array(rows), np.array(labels)

In [17]:
from collections import Counter
def get_singlefeature_probability(x, y, y_class=1):
    """Estimate P(feature value | class) for a single feature column.

    Args:
        x: 1-D NumPy array holding one feature's value for every sample.
        y: 1-D NumPy array of class labels aligned with ``x``.
        y_class: The class whose conditional distribution is estimated.

    Returns:
        dict: Maps ``"{value}_{y_class}"`` to the fraction of ``y_class``
        samples that take ``value``. Keys appear in first-seen order.
    """
    in_class = y == y_class
    # Keep only the feature values of samples labelled ``y_class``.
    selected = np.compress(in_class, x, axis=0)
    class_total = in_class.sum()
    return {
        f"{value}_{y_class}": count / class_total
        for value, count in Counter(selected).items()
    }

def get_xprobability(X, y):
    """Build the full conditional-probability table for every feature column.

    Args:
        X: 2-D NumPy array of shape ``(samples, features)``.
        y: 1-D NumPy array of class labels, one per row of ``X``.

    Returns:
        dict: Maps ``"{column}_{value}_{label}"`` to the estimate of
        P(feature ``column`` == ``value`` | ``label``) returned by
        ``get_singlefeature_probability``.
    """
    _, n_features = X.shape
    table = {}
    for col in range(n_features):
        column = X[:, col]
        for label in set(y):
            per_class = get_singlefeature_probability(column, y, y_class=label)
            # Prefix each key with the column index so features do not collide.
            for key, prob in per_class.items():
                table[f"{col}_{key}"] = prob
    return table

def get_yprobability(y):
    """Estimate the prior probability of every class label.

    Args:
        y: 1-D NumPy array of class labels.

    Returns:
        dict: Maps ``"{label}"`` (the label formatted as a string) to the
        fraction of samples carrying that label.
    """
    total = y.size
    return {f"{label}": count / total for label, count in Counter(y).items()}
    

def predict(X, x_table=None, y_table=None):
    """Return the class label with the largest naive-Bayes score for a sample.

    The score of a label is ``P(label) * prod_i P(x_i | label)``, taken over
    every feature of the sample (not just the first two). A
    (feature, value, label) combination missing from the likelihood table
    counts as probability 0 instead of raising on ``None``.

    Args:
        X: One sample as a 1-D sequence or array of feature values, in the
            column order used to build the likelihood table.
        x_table: Optional dict mapping ``"{column}_{value}_{label}"`` to a
            conditional probability. Defaults to the module-level
            ``xprobability``.
        y_table: Optional dict mapping ``"{label}"`` to a prior probability.
            Defaults to the module-level ``yprobability``; in that case the
            candidate labels come from the module-level ``y``.

    Returns:
        The winning label in its string-key form (e.g. ``'1'``). On a tie
        the label encountered first wins.
    """
    if x_table is None:
        x_table = xprobability
    if y_table is None:
        y_table = yprobability
        labels = [f"{label}" for label in set(y)]
    else:
        labels = list(y_table)

    scores = {}
    for label in labels:
        likelihood = 1.0
        for i, value in enumerate(X):
            # Unseen combinations have an (unsmoothed) estimate of zero.
            likelihood *= x_table.get(f"{i}_{value}_{label}", 0.0)
        scores[label] = likelihood * y_table[label]

    # max() returns the first maximal key, so ties go to the earliest label.
    return max(scores, key=scores.get)

In [18]:
# Load the toy dataset; a bare expression is displayed as notebook output.
X, y = loaddata()
X, y
# Build the lookup tables; predict() reads xprobability, yprobability and y
# from module scope, so these names must stay as they are.
xprobability = get_xprobability(X, y)
yprobability = get_yprobability(y)
xprobability, yprobability
# Mixing an int with a str makes NumPy store both entries as strings, which
# matches the string keys of the probability table.
X_new = np.array([3, 'M'])
predict(X_new)

(array([['1', 'S'],
        ['1', 'M'],
        ['1', 'M'],
        ['1', 'S'],
        ['1', 'S'],
        ['2', 'S'],
        ['2', 'M'],
        ['2', 'M'],
        ['2', 'L'],
        ['2', 'L'],
        ['3', 'L'],
        ['3', 'M'],
        ['3', 'M'],
        ['3', 'L'],
        ['3', 'L']], dtype='<U11'),
 array([-1, -1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,  1, -1]))

({'0_1_1': 0.2222222222222222,
  '0_2_1': 0.3333333333333333,
  '0_3_1': 0.4444444444444444,
  '0_1_-1': 0.5,
  '0_2_-1': 0.3333333333333333,
  '0_3_-1': 0.16666666666666666,
  '1_M_1': 0.4444444444444444,
  '1_S_1': 0.1111111111111111,
  '1_L_1': 0.4444444444444444,
  '1_S_-1': 0.5,
  '1_M_-1': 0.3333333333333333,
  '1_L_-1': 0.16666666666666666},
 {'-1': 0.4, '1': 0.6})

'1'

# 拉普拉斯修正

为防止某个概率为0， 造成最终的乘积为0的情况，修正方法如下：
$$
\begin{array}{ll}
p(y)=\frac{\left|D_{y}\right|+1}{|D|+N} \quad & \text { 其中 } N \text { 为训练集 } D \text { 中可能的类别数 } \\
P\left(x_{i} \mid y\right)=\frac{\left|D_{y, x_{i}}\right|+1}{\left|D_{y}\right|+N_{i}} & \text { 其中 } N_{i} \text { 表示第 } i \text { 个属性可能的取值数 }
\end{array}
$$

# 朴素贝叶斯如何处理连续数据
$$
\begin{aligned}
&\text { 假定 } p\left(x_{i} \mid y\right) \sim \mathrm{N}\left(\mu_{y, i}, \sigma_{y, i}^{2}\right) \text { 其中 } \mu_{y, i} \text { 和 } \sigma_{y, i}^{2}\\
&\text { 分别是第y类样本在第i个属性上取值的均值和方差 }\\
&p\left(x_{i} \mid y\right)=\frac{1}{\sqrt{2 \pi} \sigma_{y, i}} \exp \left(-\frac{\left(x_{i}-\mu_{y, i}\right)^{2}}{2 \sigma_{y, i}^{2}}\right)
\end{aligned}
$$

# Sklearn

In [1]:
from sklearn import naive_bayes as nb
from sklearn.preprocessing import LabelEncoder

In [20]:
# Reload the toy dataset so X holds its original string entries again.
X, y = loaddata()
X, y

(array([['1', 'S'],
        ['1', 'M'],
        ['1', 'M'],
        ['1', 'S'],
        ['1', 'S'],
        ['2', 'S'],
        ['2', 'M'],
        ['2', 'M'],
        ['2', 'L'],
        ['2', 'L'],
        ['3', 'L'],
        ['3', 'M'],
        ['3', 'M'],
        ['3', 'L'],
        ['3', 'L']], dtype='<U11'),
 array([-1, -1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,  1, -1]))

In [21]:
# Encode the size letters as integer codes (the output below shows
# L -> 0, M -> 1, S -> 2).
lbe = LabelEncoder().fit(X[:, 1])
# X is still a string array, so the codes are written back as strings ...
X[:, 1] = lbe.transform(X[:, 1])
# ... and the whole array is converted to integers in one step.
X = X.astype('int')
X

array([[1, 2],
       [1, 1],
       [1, 1],
       [1, 2],
       [1, 2],
       [2, 2],
       [2, 1],
       [2, 1],
       [2, 0],
       [2, 0],
       [3, 0],
       [3, 1],
       [3, 1],
       [3, 0],
       [3, 0]])

In [22]:
# Fit scikit-learn's multinomial naive Bayes on the integer-encoded features.
# NOTE(review): MultinomialNB models features as counts, while these columns
# are category codes; nb.CategoricalNB may be the more natural fit — confirm
# against the scikit-learn documentation.
model = nb.MultinomialNB()
model.fit(X, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [26]:
# Look up the integer code of 'M', then classify the sample (3, 'M') using
# that code as the second feature.
lbe.transform(['M'])
model.predict([[3, 1]])

array([1], dtype=int64)

array([1])

# 垃圾邮件分类

In [3]:
from sklearn import naive_bayes as nb
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

In [4]:
def createVocabList(dataSet):
    """Collect every distinct token that occurs in any document.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable tokens.

    Returns:
        list: The unique tokens in ascending sorted order.
    """
    # A set comprehension takes the union over all documents in one pass.
    vocabulary = {token for document in dataSet for token in document}
    return sorted(vocabulary)

In [5]:
# Two tiny tokenised documents to demonstrate vocabulary construction.
dataset = [['i','love','you'],
           ['he','love','you']]
vocablist = createVocabList(dataset)
print(vocablist)

['he', 'i', 'love', 'you']


## 词集模型

In [6]:
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a set-of-words (presence/absence) vector.

    Args:
        vocabList: List of vocabulary words; position defines the vector slot.
        inputSet: Iterable of words making up the document.

    Returns:
        list: A list of ``len(vocabList)`` ints, 1 where the vocabulary word
        occurs in the document and 0 elsewhere. A message is printed for
        every document word that is not in the vocabulary.
    """
    # Index the vocabulary once so each word costs a dict lookup rather than
    # two linear scans. setdefault keeps the first position of a repeated
    # word, matching list.index().
    position = {}
    for idx, term in enumerate(vocabList):
        position.setdefault(term, idx)

    # Start with an all-zero vector as long as the vocabulary.
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is None:
            print("词: %s 不在词汇表中!" % word)
        else:
            # Mark the word as present; repeats do not change the value.
            returnVec[idx] = 1
    return returnVec

In [7]:
# In the set-of-words model the repeated 'you' still yields 1, not 2.
print(vocablist)
print(setOfWords2Vec(vocablist,['love','you','you']))

['he', 'i', 'love', 'you']
[0, 0, 1, 1]


## 词袋模型

In [8]:
def bagOfWords2Vec(vocabList, inputSet):
    """Encode a document as a bag-of-words (occurrence count) vector.

    Args:
        vocabList: List of vocabulary words; position defines the vector slot.
        inputSet: Iterable of words making up the document.

    Returns:
        list: A list of ``len(vocabList)`` ints holding how many times each
        vocabulary word occurs in the document. Words outside the vocabulary
        are ignored silently.
    """
    # Index the vocabulary once so each word costs a dict lookup rather than
    # two linear scans. setdefault keeps the first position of a repeated
    # word, matching list.index().
    position = {}
    for idx, term in enumerate(vocabList):
        position.setdefault(term, idx)

    # Start with an all-zero vector as long as the vocabulary.
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is not None:
            # Count every occurrence of the word.
            returnVec[idx] += 1
    return returnVec

In [9]:
# In the bag-of-words model the repeated 'you' is counted twice.
print(vocablist)
print(bagOfWords2Vec(vocablist,['love','you','you']))

['he', 'i', 'love', 'you']
[0, 0, 1, 2]


# 对邮件预处理

In [26]:
def textParse(bigString):
    """Split raw text into lower-cased tokens longer than two characters.

    Args:
        bigString: The text to tokenise.

    Returns:
        list: Lower-cased tokens, obtained by splitting on runs of non-word
        characters and dropping every piece shorter than three characters.
    """
    import re
    pieces = re.split(r'\W+', bigString)
    # Dropping short pieces also discards the empty strings split() can emit.
    long_enough = (piece for piece in pieces if len(piece) >= 3)
    return [piece.lower() for piece in long_enough]

def loaddata(data_dir='data/5_1email', num_files=25):
    """Load the spam/ham e-mail corpus as bag-of-words count vectors.

    Reads ``{data_dir}/spam/{i}.txt`` and ``{data_dir}/ham/{i}.txt`` for
    ``i = 1 .. num_files``, tokenises each with ``textParse``, builds one
    vocabulary over all documents and encodes every document with
    ``bagOfWords2Vec``.

    Note: this definition rebinds the name ``loaddata`` that an earlier cell
    of this notebook uses for the toy dataset.

    Args:
        data_dir: Directory containing the ``spam`` and ``ham`` sub-folders.
        num_files: Number of numbered files to read from each sub-folder.

    Returns:
        tuple: ``(X, classList, vocabList)``. ``X`` is a list of count
        vectors, ``classList`` the parallel labels (1 = spam, 0 = ham) and
        ``vocabList`` the sorted vocabulary. Documents are interleaved as
        spam 1, ham 1, spam 2, ham 2, ...

    Raises:
        OSError: If one of the expected files cannot be opened.
    """
    docList = []
    classList = []

    for i in range(1, num_files + 1):
        for folder, label in (('spam', 1), ('ham', 0)):
            # ``with`` closes each file promptly instead of leaking the handle.
            # NOTE(review): the platform default encoding is used, as before;
            # pass an explicit encoding if the corpus needs one — TODO confirm.
            with open(f"{data_dir}/{folder}/{i}.txt") as fh:
                docList.append(textParse(fh.read()))
            classList.append(label)

    vocabList = createVocabList(docList)
    X = [bagOfWords2Vec(vocabList, doc) for doc in docList]
    return X, classList, vocabList

In [29]:
# loaddata here is the e-mail loader defined in the preceding cell, not the
# toy-dataset loader from the start of the notebook.
X,y,vocaList = loaddata()
# Hold out 20% of the e-mails for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33, test_size=0.20)
print(X_train[:3])
y[:3]

[[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

[1, 0, 1]

In [31]:
# Train multinomial naive Bayes on the word-count vectors, then score the
# predictions for the held-out split.
model = nb.MultinomialNB()
model.fit(X_train, y_train)
y_pre = model.predict(X_test)
accuracy_score(y_test, y_pre)
precision_score(y_test, y_pre)
recall_score(y_test, y_pre)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

1.0

1.0

1.0

In [33]:
# List the names exposed by the sklearn.naive_bayes module.
dir(nb)

['ABCMeta',
 'BaseDiscreteNB',
 'BaseEstimator',
 'BaseNB',
 'BernoulliNB',
 'CategoricalNB',
 'ClassifierMixin',
 'ComplementNB',
 'GaussianNB',
 'LabelBinarizer',
 'MultinomialNB',
 '_ALPHA_MIN',
 '_BaseDiscreteNB',
 '_BaseNB',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_check_partial_fit_first_call',
 '_check_sample_weight',
 'abstractmethod',
 'binarize',
 'check_X_y',
 'check_array',
 'check_is_fitted',
 'check_non_negative',
 'column_or_1d',
 'deprecated',
 'label_binarize',
 'logsumexp',
 'np',
 'safe_sparse_dot',