In [70]:
import numpy as np
import pandas as pd
from sklearn.utils.multiclass import type_of_target
from collections import namedtuple

In [71]:

def train_nb(X, y):
    '''
    拉普拉斯修正的朴素贝叶斯分类训练
    输入：
        X:样本特征
        y:样本标签
    返回：
        p1:好瓜概率
        p1_list:正例中，各属性的条件概率，格式为色泽(is_continuous=False, conditional_pro=青绿  0.363636 乌黑  0.454545 浅白  0.181818)
        p0_list:负例中，各属性的条件概率
    '''
    m, n = X.shape
    p1 = (len(y[y == '是']) + 1) / (m + 2)  # 拉普拉斯平滑

    p1_list = []  # 用于保存正例下各属性的条件概率
    p0_list = []

    X1 = X[y == '是']
    X0 = X[y == '否']

    m1, _ = X1.shape
    m0, _ = X0.shape

    for i in range(n): # 遍历数据集每个特征列
        xi = X.iloc[:, i]
        p_xi = namedtuple(X.columns[i], ['is_continuous', 'conditional_pro'])  # 用于储存每个变量的情况
        is_continuous = type_of_target(xi) == 'continuous'
        xi1 = X1.iloc[:, i]
        xi0 = X0.iloc[:, i]
        if is_continuous:  # 连续值时，conditional_pro 储存的就是 [mean, var] 即均值和方差
            xi1_mean = np.mean(xi1)
            xi1_var = np.var(xi1)
            xi0_mean = np.mean(xi0)
            xi0_var = np.var(xi0)

            p1_list.append(p_xi(is_continuous, [xi1_mean, xi1_var]))
            p0_list.append(p_xi(is_continuous, [xi0_mean, xi0_var]))
        else:  # 离散值时直接计算各类别的条件概率
            unique_value = xi.unique()  # 取值情况
            nvalue = len(unique_value)  # 取值个数

            xi1_value_count = pd.value_counts(xi1).reindex(unique_value).fillna(0) + 1  # 计算正样本中，该属性每个取值的数量，并且加1，即拉普拉斯平滑
            xi0_value_count = pd.value_counts(xi0).reindex(unique_value).fillna(0) + 1

            p1_list.append(p_xi(is_continuous, xi1_value_count / (m1 + nvalue)))
            p0_list.append(p_xi(is_continuous, xi0_value_count / (m0 + nvalue)))
    print(p1_list)
    return p1, p1_list, p0_list



$$
p\left(x_{i} \mid c\right)=\frac{1}{\sqrt{2 \pi} \sigma_{c, i}} \exp \left(-\frac{\left(x_{i}-\mu_{c, i}\right)^{2}}{2 \sigma_{c, i}^{2}}\right)
$$

In [72]:

def predict_nb(x, p1, p1_list, p0_list):
    '''
    预测函数
    输入：
        x:一个样本
    返回：
        预测结果和概率
    '''
    n = len(x)

    x_p1 = p1
    x_p0 = 1 - p1
    for i in range(n):
        p1_xi = p1_list[i]
        p0_xi = p0_list[i]

        if p1_xi.is_continuous:
            mean1, var1 = p1_xi.conditional_pro
            mean0, var0 = p0_xi.conditional_pro
            x_p1 += 1 / (np.sqrt(2 * np.pi) * var1) * np.exp(- (x[i] - mean1) ** 2 / (2 * var1 ** 2))
            x_p0 += 1 / (np.sqrt(2 * np.pi) * var0) * np.exp(- (x[i] - mean0) ** 2 / (2 * var0 ** 2))
        else:
            x_p1 += p1_xi.conditional_pro[x[i]]
            x_p0 += p0_xi.conditional_pro[x[i]]

    if x_p1 > x_p0:
        return "是,概率为："+str(x_p1)
    else:
        return "否，概率为："+str(x_p0)


In [73]:


if __name__ == '__main__':
    data_path = "watermelon3_0_Ch.csv"
    data = pd.read_csv(data_path, index_col=0)

    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    p1, p1_list, p0_list = train_nb(X, y)

    x_test = X.iloc[0, :] 

    print(predict_nb(x_test, p1, p1_list, p0_list))

[色泽(is_continuous=False, conditional_pro=青绿    0.363636
乌黑    0.454545
浅白    0.181818
Name: 色泽, dtype: float64), 根蒂(is_continuous=False, conditional_pro=蜷缩    0.545455
稍蜷    0.363636
硬挺    0.090909
Name: 根蒂, dtype: float64), 敲声(is_continuous=False, conditional_pro=浊响    0.636364
沉闷    0.272727
清脆    0.090909
Name: 敲声, dtype: float64), 纹理(is_continuous=False, conditional_pro=清晰    0.727273
稍糊    0.181818
模糊    0.090909
Name: 纹理, dtype: float64), 脐部(is_continuous=False, conditional_pro=凹陷    0.545455
稍凹    0.363636
平坦    0.090909
Name: 脐部, dtype: float64), 触感(is_continuous=False, conditional_pro=硬滑    0.7
软粘    0.3
Name: 触感, dtype: float64), 密度(is_continuous=True, conditional_pro=[0.5737500000000001, 0.014608437499999998]), 含糖率(is_continuous=True, conditional_pro=[0.27875, 0.008912437500000002])]
是,概率为：3.991866028708143
