In [1]:
#西瓜数据集——Naïve Bayes
import numpy as np
import math

In [2]:
def GaussianProbability(x, miu, sigma):
    pro = math.exp(-(x - miu)**2 / 2 / sigma**2) / (2 * math.pi)**0.5 / sigma
    return pro


def NaiveBayes(training_set, test_set, error_list):
    #训练集分类
    training_set1 = np.array(
        [array[0:dim] for array in training_set if array[dim] == 1]).T
    training_set0 = np.array(
        [array[0:dim] for array in training_set if array[dim] == 0]).T
    training_num0 = training_set0.shape[1]
    training_num1 = training_set1.shape[1]
    priori_pro1 = training_num1 / len(training_set)  #计算先验概率
    priori_pro0 = training_num0 / len(training_set)
    #计算离散属性的类别分布概率矩阵
    distribution_pro0 = np.empty((dim - 2, 3))
    distribution_pro1 = np.empty((dim - 2, 3))
    i = 0
    for array in training_set0[0:dim - 2]:
        distribution_pro0[i, 0] = sum(array == 0) / training_num0
        distribution_pro0[i, 1] = sum(array == 1) / training_num0
        distribution_pro0[i, 2] = sum(array == 2) / training_num0
        i += 1
    i = 0
    for array in training_set1[0:dim - 2]:
        distribution_pro1[i, 0] = sum(array == 0) / training_num1
        distribution_pro1[i, 1] = sum(array == 1) / training_num1
        distribution_pro1[i, 2] = sum(array == 2) / training_num1
        i += 1
    #假设连续属性为高斯分布，计算它的各个参数
    miu_density0 = np.mean(training_set0[dim - 2])
    miu_sugar0 = np.mean(training_set0[dim - 1])
    sigma_density0 = np.var(training_set0[dim - 2])
    sigma_sugar0 = np.var(training_set0[dim - 1])
    miu_density1 = np.mean(training_set1[dim - 2])
    miu_sugar1 = np.mean(training_set1[dim - 1])
    sigma_density1 = np.var(training_set1[dim - 2])
    sigma_sugar1 = np.var(training_set1[dim - 1])
    #验证测试集
    test_num = len(test_set)
    error_num = 0
    for i in range(test_num):
        posterior_pro0 = priori_pro0
        posterior_pro1 = priori_pro1
        for j in range(dim - 2):
            if test_set[i, j] == 0:
                posterior_pro0 *= distribution_pro0[j, 0]
                posterior_pro1 *= distribution_pro1[j, 0]
            elif test_set[i, j] == 1:
                posterior_pro0 *= distribution_pro0[j, 1]
                posterior_pro1 *= distribution_pro1[j, 1]
            else:
                posterior_pro0 *= distribution_pro0[j, 2]
                posterior_pro1 *= distribution_pro1[j, 2]
        posterior_pro0 *= GaussianProbability(test_set[i, dim - 2],
                                              miu_density0, sigma_density0)
        posterior_pro1 *= GaussianProbability(test_set[i, dim - 2],
                                              miu_density1, sigma_density1)
        posterior_pro0 *= GaussianProbability(test_set[i, dim - 1], miu_sugar0,
                                              sigma_sugar0)
        posterior_pro1 *= GaussianProbability(test_set[i, dim - 1], miu_sugar1,
                                              sigma_sugar1)
        if (posterior_pro0 > posterior_pro1):  #测试得不是好瓜
            if test_set[i, dim] == 0:
                print("数据" + str(i) + "验证正确")
            else:
                error_num += 1
                print("数据" + str(i) + "验证错误")
        else:  #测试得是好瓜
            if test_set[i, dim] == 0:
                error_num += 1
                print("数据" + str(i) + "验证错误")
            else:
                print("数据" + str(i) + "验证正确")
    error_rate = error_num / test_num
    error_list.append(error_rate)
    print("第" + str(len(error_list)) + "重测试错误率为" + str(error_rate))
    print()

In [3]:
watermelon = np.genfromtxt("watermelon.csv", delimiter=',')
np.random.shuffle(watermelon)  #打乱
num = watermelon.shape[0]  #数据总数
dim = watermelon.shape[1] - 1  #维数
sample_num = round(num / 5)

In [4]:
errorlist = []
#5重交叉验证
NaiveBayes(np.delete(watermelon, range(0, sample_num), 0),
           watermelon[0:sample_num], errorlist)
NaiveBayes(np.delete(watermelon, range(sample_num, 2 * sample_num), 0),
           watermelon[sample_num:2 * sample_num], errorlist)
NaiveBayes(np.delete(watermelon, range(2 * sample_num, 3 * sample_num), 0),
           watermelon[2 * sample_num:3 * sample_num], errorlist)
NaiveBayes(np.delete(watermelon, range(3 * sample_num, 4 * sample_num), 0),
           watermelon[3 * sample_num:4 * sample_num], errorlist)
NaiveBayes(np.delete(watermelon, range(4 * sample_num, num), 0),
           watermelon[4 * sample_num:], errorlist)
print("平均的错误率为" + str(np.mean(errorlist)))

数据0验证错误
数据1验证错误
数据2验证错误
第1重测试错误率为1.0

数据0验证错误
数据1验证错误
数据2验证正确
第2重测试错误率为0.6666666666666666

数据0验证正确
数据1验证正确
数据2验证错误
第3重测试错误率为0.3333333333333333

数据0验证错误
数据1验证错误
数据2验证正确
第4重测试错误率为0.6666666666666666

数据0验证正确
数据1验证正确
数据2验证正确
数据3验证错误
数据4验证正确
第5重测试错误率为0.2

平均的错误率为0.5733333333333334
