In [1]:
import numpy as np
import math


def iris_type(s):
    it = {b'Iris-setosa': 1, b'Iris-versicolor': 0, b'Iris-virginica': 0}
    return it[s]


#读入数据
iris = np.loadtxt("iris.data", delimiter=',', converters={4: iris_type})
np.random.shuffle(iris)  #打乱
num = iris.shape[0]  #数据总数
dim = iris.shape[1] - 1  #维数
sample_num = round(num / 5)

In [2]:
def LinearDiscriminantAnalysis(training_set, test_set, error_list):
    training_set1 = np.array(
        [array[0:dim] for array in training_set if array[dim] == 1]).T
    training_set0 = np.array(
        [array[0:dim] for array in training_set if array[dim] == 0]).T
    sigma0 = np.cov(training_set0)
    miu0 = training_set0.mean(1)
    sigma1 = np.cov(training_set1)
    miu1 = training_set1.mean(1)
    w = np.dot(np.linalg.inv(sigma1 + sigma0), miu0 - miu1)
    miu0_hat = np.dot(w, miu0)
    miu1_hat = np.dot(w, miu1)
    #验证测试集
    test_num = len(test_set)
    error_num = 0
    for i in range(test_num):
        test_value = np.dot(w, test_set[i, :-1])
        if (test_value - miu0_hat)**2 < (test_value - miu1_hat)**2:  #测试得是0类
            if test_set[i, -1] != 0:
                error_num += 1
                print("数据" + str(i) + "验证错误")
        else:  #测试得是1类
            if test_set[i, -1] == 0:
                error_num += 1
                print("数据" + str(i) + "验证错误")
    error_rate = error_num / test_num
    error_list.append(error_rate)
    print("第" + str(len(error_list)) + "重测试错误率为" + str(error_rate))
    print()

In [3]:
#iris——LDA
errorlist = []
LinearDiscriminantAnalysis(np.delete(iris, range(0, sample_num), 0),
                           iris[0:sample_num], errorlist)
LinearDiscriminantAnalysis(
    np.delete(iris, range(sample_num, 2 * sample_num), 0),
    iris[sample_num:2 * sample_num], errorlist)
LinearDiscriminantAnalysis(
    np.delete(iris, range(2 * sample_num, 3 * sample_num), 0),
    iris[2 * sample_num:3 * sample_num], errorlist)
LinearDiscriminantAnalysis(
    np.delete(iris, range(3 * sample_num, 4 * sample_num), 0),
    iris[3 * sample_num:4 * sample_num], errorlist)
LinearDiscriminantAnalysis(np.delete(iris, range(4 * sample_num, num), 0),
                           iris[4 * sample_num:], errorlist)
print("平均的错误率为" + str(np.mean(errorlist)))

第1重测试错误率为0.0

第2重测试错误率为0.0

第3重测试错误率为0.0

第4重测试错误率为0.0

第5重测试错误率为0.0

平均的错误率为0.0


In [4]:
def GaussianProbability(x, miu, sigma):
    pro = math.exp(-(x - miu)**2 / 2 / sigma**2) / (2 * math.pi)**0.5 / sigma
    return pro


def NaiveBayes(training_set, test_set, error_list):
    training_set1 = np.array(
        [array[0:dim] for array in training_set if array[dim] == 1]).T
    training_set0 = np.array(
        [array[0:dim] for array in training_set if array[dim] == 0]).T
    training_num0 = training_set0.shape[1]
    training_num1 = training_set1.shape[1]
    priori_pro1 = training_num1 / len(training_set)  #计算先验概率
    priori_pro0 = training_num0 / len(training_set)
    #假设连续属性为高斯分布，计算它的各个参数
    miu = np.empty((dim, 2))
    sigma = np.empty((dim, 2))
    for i in range(dim):
        miu[i, 0] = np.mean(training_set0[i])
        miu[i, 1] = np.mean(training_set1[i])
        sigma[i, 0] = np.var(training_set0[i])
        sigma[i, 1] = np.var(training_set1[i])
    #验证测试集
    test_num = len(test_set)
    error_num = 0
    for i in range(test_num):
        posterior_pro0 = priori_pro0
        posterior_pro1 = priori_pro1
        for j in range(dim):
            posterior_pro0 *= GaussianProbability(test_set[i, j], miu[j, 0],
                                                  sigma[j, 0])
            posterior_pro1 *= GaussianProbability(test_set[i, j], miu[j, 1],
                                                  sigma[j, 1])
        if (posterior_pro0 > posterior_pro1):  #测试得为0类
            if test_set[i, dim] != 0:
                error_num += 1
                print("数据" + str(i) + "验证错误")
        else:  #测试得为1类
            if test_set[i, dim] == 0:
                error_num += 1
                print("数据" + str(i) + "验证错误")
    error_rate = error_num / test_num
    error_list.append(error_rate)
    print("第" + str(len(error_list)) + "重测试错误率为" + str(error_rate))
    print()

In [5]:
#iris——Naïve Bayes
errorlist = []
NaiveBayes(np.delete(iris, range(0, sample_num), 0), iris[0:sample_num],
           errorlist)
NaiveBayes(np.delete(iris, range(sample_num, 2 * sample_num), 0),
           iris[sample_num:2 * sample_num], errorlist)
NaiveBayes(np.delete(iris, range(2 * sample_num, 3 * sample_num), 0),
           iris[2 * sample_num:3 * sample_num], errorlist)
NaiveBayes(np.delete(iris, range(3 * sample_num, 4 * sample_num), 0),
           iris[3 * sample_num:4 * sample_num], errorlist)
NaiveBayes(np.delete(iris, range(4 * sample_num, num), 0),
           iris[4 * sample_num:], errorlist)
print("平均的错误率为" + str(np.mean(errorlist)))

数据6验证错误
数据7验证错误
数据21验证错误
数据28验证错误
第1重测试错误率为0.13333333333333333

数据1验证错误
数据4验证错误
数据8验证错误
数据17验证错误
数据22验证错误
第2重测试错误率为0.16666666666666666

数据4验证错误
第3重测试错误率为0.03333333333333333

第4重测试错误率为0.0

数据1验证错误
数据4验证错误
数据15验证错误
数据22验证错误
数据28验证错误
数据29验证错误
第5重测试错误率为0.2

平均的错误率为0.10666666666666666


In [6]:
from sklearn import svm


def GetAccuracy(prediction, reality):
    print("预测结果为：", prediction)
    print("实际结果为：", reality)
    error = 0
    for i in range(len(prediction)):
        if prediction[i] != reality[i]:
            error += 1
    return error / len(prediction)


def SVM_iris(training_set, test_set, error_list):
    clf = svm.SVC()
    clf.fit(training_set[:, :-1], training_set[:, -1])
    result = clf.predict(test_set[:, :-1])
    error_rate = GetAccuracy(result, test_set[:, -1])
    error_list.append(error_rate)
    print("第" + str(len(error_list)) + "重测试错误率为" + str(error_rate))
    print()

In [7]:
#iris——SVM
errorlist = []
SVM_iris(np.delete(iris, range(0, sample_num), 0), iris[0:sample_num],
         errorlist)
SVM_iris(np.delete(iris, range(sample_num, 2 * sample_num), 0),
         iris[sample_num:2 * sample_num], errorlist)
SVM_iris(np.delete(iris, range(2 * sample_num, 3 * sample_num), 0),
         iris[2 * sample_num:3 * sample_num], errorlist)
SVM_iris(np.delete(iris, range(3 * sample_num, 4 * sample_num), 0),
         iris[3 * sample_num:4 * sample_num], errorlist)
SVM_iris(np.delete(iris, range(4 * sample_num, num), 0), iris[4 * sample_num:],
         errorlist)
print("平均的错误率为" + str(np.mean(errorlist)))

预测结果为： [1. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0.
 1. 0. 0. 1. 1. 0.]
实际结果为： [1. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0.
 1. 0. 0. 1. 1. 0.]
第1重测试错误率为0.0

预测结果为： [0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0.]
实际结果为： [0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0.]
第2重测试错误率为0.0

预测结果为： [0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0.
 0. 0. 0. 0. 0. 1.]
实际结果为： [0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0.
 0. 0. 0. 0. 0. 1.]
第3重测试错误率为0.0

预测结果为： [0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0.
 1. 1. 1. 0. 0. 1.]
实际结果为： [0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0.
 1. 1. 1. 0. 0. 1.]
第4重测试错误率为0.0

预测结果为： [0. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 1. 1. 0.
 0. 0. 0. 0. 1. 1.]
实际结果为： [0. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 