In [77]:
import matplotlib.pyplot as plt
import numpy as np
import copy
import pandas as pd
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import type_of_target
from collections import namedtuple
plt.rcParams["font.sans-serif"]=["SimHei"] # set the sans-serif font used in figures
plt.rcParams["axes.unicode_minus"]=False # avoids the "-" minus sign being rendered as a garbled character in figures

In [78]:

def train_nb(X, y, verbose=True):
    '''
    Train a binary naive Bayes classifier with Laplace smoothing.

    Parameters:
        X: pandas DataFrame of sample features, one column per attribute.
        y: label vector usable as a mask on X (e.g. a pandas Series aligned
           with X); samples with y == 1 are positives, y == 0 negatives.
        verbose: when True (the default, matching the historical behaviour)
           print the positive-class prior and the positive-class table.

    Returns:
        p1: Laplace-smoothed prior probability of the positive class.
        p1_list: one namedtuple per column, named after the column, with the
           fields (is_continuous, conditional_pro), fitted on the positives.
           For a continuous column conditional_pro is [mean, variance]; for a
           discrete column it is a Series mapping every value observed in X
           to its smoothed conditional probability.
        p0_list: the same structure fitted on the negatives.
    '''
    m, n = X.shape
    # Laplace smoothing for two classes: (count + 1) / (total + 2).
    p1 = (len(y[y == 1]) + 1) / (m + 2)
    if verbose:
        print(p1)
    p1_list = []  # per-attribute conditional statistics given the positive class
    p0_list = []  # per-attribute conditional statistics given the negative class

    X1 = X[y == 1]
    X0 = X[y == 0]

    m1, _ = X1.shape
    m0, _ = X0.shape

    for i in range(n):  # one pass per feature column
        xi = X.iloc[:, i]
        # Record type named after the column; the column name must therefore
        # be a valid Python identifier.
        p_xi = namedtuple(X.columns[i], ['is_continuous', 'conditional_pro'])
        is_continuous = type_of_target(xi) == 'continuous'
        xi1 = X1.iloc[:, i]
        xi0 = X0.iloc[:, i]
        if is_continuous:
            # Continuous attribute: keep [mean, variance] of each class.
            xi1_mean = np.mean(xi1)
            xi1_var = np.var(xi1)
            xi0_mean = np.mean(xi0)
            xi0_var = np.var(xi0)

            p1_list.append(p_xi(is_continuous, [xi1_mean, xi1_var]))
            p0_list.append(p_xi(is_continuous, [xi0_mean, xi0_var]))
        else:
            # Discrete attribute: smoothed frequency of every value seen in X.
            unique_value = xi.unique()
            nvalue = len(unique_value)

            # Series.value_counts replaces the deprecated top-level
            # pd.value_counts. Reindexing on the full value set gives values
            # absent from a class a count of 0 before the +1 smoothing.
            xi1_value_count = xi1.value_counts().reindex(unique_value).fillna(0) + 1
            xi0_value_count = xi0.value_counts().reindex(unique_value).fillna(0) + 1

            p1_list.append(p_xi(is_continuous, xi1_value_count / (m1 + nvalue)))
            p0_list.append(p_xi(is_continuous, xi0_value_count / (m0 + nvalue)))
    if verbose:
        print(p1_list)
    return p1, p1_list, p0_list



In [79]:

def _log_gaussian_pdf(value, mean, var, min_var=1e-9):
    '''Log of the normal density N(mean, var) at value; var is floored at min_var.'''
    var = max(var, min_var)  # a constant feature would otherwise divide by zero
    return -0.5 * np.log(2 * np.pi * var) - (value - mean) ** 2 / (2 * var)


def predict_nb(x, p1, p1_list, p0_list):
    '''
    Classify a single sample with a fitted naive Bayes model.

    Parameters:
        x: one sample, with features in the same order as p1_list / p0_list
           (a pandas Series row, or any sequence indexable by position).
        p1: prior probability of the positive class.
        p1_list: per-feature records for the positive class, each exposing
           is_continuous and conditional_pro ([mean, variance] for continuous
           features, a value -> probability mapping for discrete ones).
        p0_list: the same records for the negative class.

    Returns:
        1 if the positive class has the strictly larger posterior score,
        otherwise 0. Only the label is returned, not a probability.

    Raises:
        KeyError: if a discrete feature takes a value missing from the
           fitted conditional_pro mapping.
    '''
    n = len(x)
    # Read features strictly by position: an integer key on a label-indexed
    # Series is deprecated positional fallback in pandas.
    values = x.to_numpy() if hasattr(x, 'to_numpy') else x

    # Naive Bayes multiplies the prior by every per-feature likelihood. The
    # product is accumulated as a sum of logs so that many small factors do
    # not underflow.
    log_p1 = np.log(p1)
    log_p0 = np.log(1 - p1)
    for i in range(n):
        p1_xi = p1_list[i]
        p0_xi = p0_list[i]
        value = values[i]

        if p1_xi.is_continuous:
            # conditional_pro holds [mean, variance]; the density must use the
            # variance as sigma**2, not as sigma.
            mean1, var1 = p1_xi.conditional_pro
            mean0, var0 = p0_xi.conditional_pro
            log_p1 += _log_gaussian_pdf(value, mean1, var1)
            log_p0 += _log_gaussian_pdf(value, mean0, var0)
        else:
            log_p1 += np.log(p1_xi.conditional_pro[value])
            log_p0 += np.log(p0_xi.conditional_pro[value])

    if log_p1 > log_p0:
        return 1
    else:
        return 0


In [80]:
# Load the feature table and the label table; the "status" column is the target.
X = pd.read_csv("X.csv", index_col=0)
y = pd.read_csv("y.csv", index_col=0)
y = y["status"]
m, n = X.shape

# Hold out 20% of the samples for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=1
)

# Fit the naive Bayes tables on the training split.
p1, p1_list, p0_list = train_nb(X_train, y_train)

# Predict the held-out rows one at a time.
y_pred = []
for i in range(len(X_test)):
    X_test_one = X_test.iloc[i, :]
    label = predict_nb(X_test_one, p1, p1_list, p0_list)
    y_pred.append(label)

# Fraction of held-out samples whose prediction matches the true label.
print("Accurancy:", np.mean(y_pred == y_test))

0.7125748502994012
[gender(is_continuous=False, conditional_pro=0    0.308333
1    0.691667
Name: gender, dtype: float64), ssc_p(is_continuous=True, conditional_pro=[70.94627118644065, 74.40820474001723]), hsc_p(is_continuous=True, conditional_pro=[69.23838983050848, 78.18790164464238]), degree_p(is_continuous=True, conditional_pro=[68.38669491525422, 42.32035094082161]), workex(is_continuous=False, conditional_pro=0    0.558333
1    0.441667
Name: workex, dtype: float64), etest_p(is_continuous=True, conditional_pro=[72.59788135593222, 181.80710144355078]), specialisation(is_continuous=False, conditional_pro=0    0.65
1    0.35
Name: specialisation, dtype: float64), mba_p(is_continuous=True, conditional_pro=[62.23203389830505, 33.56381959207125]), dummy_Arts(is_continuous=False, conditional_pro=1    0.05
0    0.95
Name: dummy_Arts, dtype: float64), dummy_Commerce(is_continuous=False, conditional_pro=0    0.475
1    0.525
Name: dummy_Commerce, dtype: float64), dummy_Science(is_continuou