# 代码运行须知

本次项目代码全部在同一个文件中，包含数据集读取、预处理，模型搭建、模型参数调整等，在每段代码前都已标注本段代码的作用。

如需具体测试各种数据预处理类型和模型优化的话，请根据每段代码前的注释将对应的代码段**解除注释**，同时保证其他代码段处于**被注释**状态即可。

基本的模型(KNN, LogisticRegression)建议一次运行一个，也可以同时运行多个并输出，但此时不保证输出结果的美观。

如需更换数据集，只需在Cell 2(第二个方块)中将 pd.read_csv()中的文件名称修改即可。

以上

In [710]:
import numpy as np


# KNN model definition
class KNN():
    """Brute-force k-nearest-neighbours classifier.

    The training set is stored as given; every prediction computes the
    distance from the query sample to each training sample.
    """

    def __init__(self, x_train, y_train, metric_type):
        """Store the training data and the distance metric.

        Args:
            x_train: training samples, indexable by row (e.g. a 2-D ndarray).
            y_train: labels aligned with ``x_train``.
            metric_type: norm used by ``Neighbors``; one of ``"L1"``,
                ``"L2"`` or ``"L-inf"``.
        """

        self.metric_type = metric_type
        self.x_train = x_train
        self.y_train = y_train

    def distance(self, metric_type, p1, p2):
        """Return the distance between two points under the given norm.

        Args:
            metric_type: ``"L1"``, ``"L2"`` or ``"L-inf"``.
            p1, p2: points supporting element-wise subtraction.

        Raises:
            ValueError: if ``metric_type`` is not a supported norm.
        """

        diff = p1 - p2
        if metric_type == "L1":
            return np.sum(abs(diff))
        if metric_type == "L2":
            return np.sqrt(np.sum(diff ** 2))
        if metric_type == "L-inf":
            return np.max(abs(diff))
        # An unknown metric used to fall through and fail with an
        # UnboundLocalError on the return statement; fail explicitly instead.
        raise ValueError(
            "Unsupported metric_type %r; expected 'L1', 'L2' or 'L-inf'"
            % (metric_type,)
        )

    def Neighbors(self, K, sample):
        """Return the labels of the K training samples closest to ``sample``.

        Labels are ordered from nearest to farthest. Equal distances keep
        training-set order (stable sort).

        Raises:
            ValueError: if ``K`` is smaller than 1 or larger than the number
                of training samples.
        """

        n_train = len(self.x_train)
        if not 1 <= K <= n_train:
            raise ValueError(
                "K must be between 1 and the number of training samples "
                "(%d), got %r" % (n_train, K)
            )

        distances = np.array(
            [self.distance(self.metric_type, sample, row) for row in self.x_train]
        )
        # A stable argsort reproduces the tie order of a stable sort on
        # (index, distance) pairs without building those pairs.
        nearest = np.argsort(distances, kind="stable")[:K]
        return [self.y_train[int(idx)] for idx in nearest]

    def Majority(self, neighbors):
        """Return the most frequent label among ``neighbors``.

        ``np.unique`` returns the labels in ascending order and ``argmax``
        picks the first maximum, so a tie goes to the smallest label.
        """

        labels, counts = np.unique(neighbors, return_counts=True)
        return labels[np.argmax(counts)]

    def predict(self, K, x_test):
        """Predict a label for every sample in ``x_test``.

        Args:
            K: number of neighbours that vote.
            x_test: iterable of query samples.

        Returns:
            ndarray with one predicted label per query sample.
        """

        return np.array(
            [self.Majority(self.Neighbors(K, sample)) for sample in x_test]
        )

In [711]:
import pandas as pd
import sklearn.model_selection
from sklearn.preprocessing import StandardScaler


## Load the dataset
# Alternative input file:
# csv = pd.read_csv("./origin_breast_cancer_data.csv")
# Read the CSV and discard the "id" column in one step.
csv = pd.read_csv("./breast_cancer_data_357B_100M.csv").drop(columns=["id"], axis=1)

# Encode the diagnosis labels as integers: 'B' -> 0, 'M' -> 1.
y = csv["diagnosis"].replace('B', 0).replace("M", 1)

# Alternative: use every remaining column as a feature.
# x = csv.drop(columns=["diagnosis"], axis=1)



# Pruned feature set for the original dataset
# x = csv[["texture_mean", "concave points_mean",
#         "area_se", "radius_worst",
#         "texture_worst", "perimeter_worst",
#         "area_worst", "smoothness_worst",
#         "concavity_worst", "concave points_worst"]]


# Pruned feature set for the original dataset plus the extra 8NN feature
# (the same list as above with "area_mean" added).
x = csv[[
    "texture_mean",
    "area_mean",
    "concave points_mean",
    "area_se",
    "radius_worst",
    "texture_worst",
    "perimeter_worst",
    "area_worst",
    "smoothness_worst",
    "concavity_worst",
    "concave points_worst",
]]


# Pruned feature set for the imbalanced dataset
# x = csv[["texture_mean",  "fractal_dimension_se",
#         "symmetry_se", "texture_worst",
#         "perimeter_worst", "area_worst",
#         "smoothness_worst", "concave points_worst"]]

# Scaler instance; NOTE(review): it is only created here and is not applied
# to x in the cells visible in this file.
stand = StandardScaler()


以下为各种预处理数据的手段：
1. 对数据进行统计学分析并*剔除*不符合$3\sigma$原则的项(整行消除)

2. 对数据进行统计学分析并*替换*不符合$3\sigma$原则的项，替换公式为$x = |\bar{x} + rand \cdot \sigma_x|$，其中$\bar{x}$与$\sigma_x$分别为该列的均值和标准差，$rand \sim N(0,1)$(添加绝对值是因为所用到的样本数值均为正数)

In [712]:
# Switch both the features and the labels from pandas objects to ndarrays.
x, y = x.to_numpy(), y.to_numpy()



'''
    删除偏离项
'''
# Toggle: delete outliers. For each column, every row whose value lies
# outside mean +/- 3 * std is removed from both x and y.

# i = 0
# while i < x.shape[1]:

#     ave = np.average(x[:,i])
#     var = 0
#     j = 0

#     for k in range(x.shape[0]):
#         var += (x[k, i] - ave) ** 2
#     std = np.sqrt(var / (1.0 * x.shape[0]))
#     # print("ave for: ", ave)
#     # print("var for: ", std)

#     while j < x.shape[0]:
#         if x[j,i] < (ave - 3 * std) or x[j,i] > (ave + 3 * std):
#             # print(" deleted element: " , i, " column ", x[j,i])
#             x = np.delete(x, j, 0)
#             y = np.delete(y, j, 0)
#             j = j - 1
#             # print(x.shape[0])
#         j = j + 1
#     i += 1
#     # print("i ", i)



'''
    修正偏离项
'''
# Toggle: repair outliers. For each column, every value outside
# mean +/- 3 * std is overwritten with a randomly drawn replacement.

# i = 0
# while i < x.shape[1]:

#     ave = np.average(x[:,i])
#     var = 0
#     j = 0

#     for k in range(x.shape[0]):
#         var += (x[k, i] - ave) ** 2
#     std = np.sqrt(var / (1.0 * x.shape[0]))
#     # print("ave for: ", ave)
#     # print("var for: ", std)

#     while j < x.shape[0]:
#         if x[j,i] < (ave - 3 * std) or x[j,i] > (ave + 3 * std):
#             # print("  modified element: " , i, " column ", x[j,i])
#             ran = np.random.normal()
#             # NOTE(review): `var` is the raw sum of squared deviations
#             # accumulated above, not `std`; the replacement formula in the
#             # accompanying notes appears to intend the standard deviation.
#             # Confirm before enabling this toggle.
#             x[j,i] = np.abs(ave + var * ran)
#             # print("corrected x value: ", x[j,i])
#         j = j + 1
#     i += 1
#     # print("i ", i)
#     # print("----------------------")
#     # print()

In [713]:
from sklearn.linear_model import LogisticRegression   ## 逻辑回归模型
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
import warnings


# Toggle: suppress all warnings and use the default logistic-regression model.
# warnings.filterwarnings("ignore")
# Logistic = LogisticRegression()


'''
    更改分类权重
'''
# Class-weight variants. The active model uses scikit-learn's "balanced"
# class-weight mode; the commented alternative sets explicit weights of
# 0.2 for class 0 and 0.8 for class 1.
Logistic = LogisticRegression(class_weight="balanced")
# Logistic = LogisticRegression(class_weight={0:0.2, 1:0.8})


'''
    正则化回归模型
'''
# Regularised variants: liblinear solver with an L1 or an L2 penalty.
# Logistic = LogisticRegression(solver="liblinear",penalty="l1")
# Logistic = LogisticRegression(solver="liblinear",penalty="l2")


In [714]:
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
# import time

# Experiment constants.
step = 2000  # number of repeated random train/test splits
k_range = 8  # number of K values covered by the KNN sweep
k_vals = ['L1', 'L2', 'L-inf']  # norm types



'''
    KNN结果初始化
'''
# Toggle: accumulators for the KNN sweep, one array per norm type with one
# slot per K (the sweep below evaluates K = 4 .. k_range + 3).
# ave_accurate = [np.zeros(k_range) for i in range(3)]
# ave_recall = [np.zeros(k_range) for i in range(3)]
# ave_precision = [np.zeros(k_range) for i in range(3)]
# ave_F1 = [np.zeros(k_range) for i in range(3)]

# worst_accurate = [np.ones(k_range) for i in range(3)]
# worst_recall = [np.ones(k_range) for i in range(3)]
# worst_precision = [np.ones(k_range) for i in range(3)]
# worst_F1 = [np.ones(k_range) for i in range(3)]


'''
    8NN结果初始化
'''
# Scalar accumulators for the single 8NN run: running sums start at 0,
# worst-case trackers start at the best possible score, 1.
ave_accurate = ave_recall = ave_precision = ave_F1 = 0
worst_accurate = worst_recall = worst_precision = worst_F1 = 1



'''
    逻辑回归结果初始化
'''
# Same layout for logistic regression: sums at 0, worst-case trackers at 1.
aveLogScore = aveLogRecall = aveLogF1 = aveLogPrecision = 0
wLS = wLR = wLF = wLP = 1


# importance = np.zeros(30)


'''
    模型训练与结果预测
'''  
# begin = time.time()
for i in range(step):

    # Draw a fresh random split (20% test, 80% train) on every repetition
    # and convert each part to an ndarray for the models below.
    train_x, test_x, train_y, test_y = map(
        np.array,
        sklearn.model_selection.train_test_split(x, y, test_size=0.2),
    )




    '''
        逻辑回归模型训练与预测
    '''
    # Logistic regression: fit on the training part, predict the test part.
    Logistic.fit(train_x, train_y)
    Logy_pred = Logistic.predict(test_x)

    # Score this repetition.
    LogScore = accuracy_score(test_y, Logy_pred)
    LogRecall = recall_score(test_y, Logy_pred)
    LogF1 = f1_score(test_y, Logy_pred)
    LogPrecision = precision_score(test_y, Logy_pred)

    # Running sums; they are divided by `step` when the report is printed.
    aveLogScore += LogScore
    aveLogF1 += LogF1
    aveLogRecall += LogRecall
    aveLogPrecision += LogPrecision

    # Keep the lowest value seen so far for each metric.
    wLS = min(wLS, LogScore)
    wLR = min(wLR, LogRecall)
    wLF = min(wLF, LogF1)
    wLP = min(wLP, LogPrecision)



    '''
        KNN模型训练与预测
    '''
    # Toggle: KNN sweep over K = 4 .. k_range + 3 for every norm type.
    # for k in range(1, k_range + 1):
    #     for j in range(len(k_vals)):
    #         knn = KNN(train_x, train_y, k_vals[j])
    #         KNNy_pred = knn.predict(k + 3, test_x)

    #         result = np.zeros(4) # counts in the order TP, TN, FP, FN
    #         for l in range(len(test_y)):
    #             if test_y[l] == 1 and KNNy_pred[l] == 1: # predicted positive, correct: TP
    #                 result[0] += 1
    #             elif test_y[l] == 0 and KNNy_pred[l] == 0: # predicted negative, correct: TN
    #                 result[1] += 1
    #             elif test_y[l] == 0 and KNNy_pred[l] == 1: # predicted positive, wrong: FP
    #                 result[2] += 1
    #             elif test_y[l] == 1 and KNNy_pred[l] == 0: # predicted negative, wrong: FN
    #                 result[3] += 1

    #         recall = (1.0 * result[0]) / (result[0] + result[3])
    #         precision = (1.0 * result[0]) / (result[0] + result[2])
    #         F1 = 2.0 * (recall * precision) / (recall + precision)
    #         KNNScore = np.sum(test_y == KNNy_pred) / len(test_y)

    #         # print("Accuracy for LogisticRegresion is: ", LogScore)
    #         ave_accurate[j][k-1] += KNNScore
    #         ave_recall[j][k-1] += recall
    #         ave_precision[j][k-1] += precision
    #         ave_F1[j][k-1] += F1

    #         if KNNScore < worst_accurate[j][k-1]:
    #             worst_accurate[j][k-1] = KNNScore

    #         if recall < worst_recall[j][k-1]:
    #             worst_recall[j][k-1] = recall

    #         if precision < worst_precision[j][k-1]:
    #             worst_precision[j][k-1] = precision

    #         if F1 < worst_F1[j][k-1]:
    #             worst_F1[j][k-1] = F1




    '''
        8NN模型训练与预测
    '''
    # Toggle: a single KNN run with K = 8 and the L2 norm.
    # knn = KNN(train_x, train_y, "L2")
    # KNNy_pred = knn.predict(8, test_x)

    # result = np.zeros(4) # counts in the order TP, TN, FP, FN
    # for l in range(len(test_y)):
    #     if test_y[l] == 1 and KNNy_pred[l] == 1: # predicted positive, correct: TP
    #         result[0] += 1
    #     elif test_y[l] == 0 and KNNy_pred[l] == 0: # predicted negative, correct: TN
    #         result[1] += 1
    #     elif test_y[l] == 0 and KNNy_pred[l] == 1: # predicted positive, wrong: FP
    #         result[2] += 1
    #     elif test_y[l] == 1 and KNNy_pred[l] == 0: # predicted negative, wrong: FN
    #         result[3] += 1

    # recall = (1.0 * result[0]) / (result[0] + result[3])
    # precision = (1.0 * result[0]) / (result[0] + result[2])
    # F1 = 2.0 * (recall * precision) / (recall + precision)
    # KNNScore = np.sum(test_y == KNNy_pred) / len(test_y)

    # # print("Accuracy for LogisticRegresion is: ", LogScore)
    # ave_accurate += KNNScore
    # ave_recall += recall
    # ave_precision += precision
    # ave_F1 += F1

    # if KNNScore < worst_accurate:
    #     worst_accurate = KNNScore

    # if recall < worst_recall:
    #     worst_recall = recall

    # if precision < worst_precision:
    #     worst_precision = precision

    # if F1 < worst_F1:
    #     worst_F1 = F1




    '''
        决策树辅助测试
    '''
#     model = DecisionTreeClassifier()
#     model.fit(train_x,train_y) 
#     y_pre = model.predict(test_x)
#     mScore = accuracy_score(y_true=test_y, y_pred=y_pre)
#     importance += model.feature_importances_


# importance = importance / step
# print("Importance for each attributes:")
# print(importance)
# for i in range(len(importance)):
#     if importance[i]> 0.005:
#         print("The most important attribute is: ", i + 1)



'''
    逻辑回归结果输出
'''
# Report the logistic-regression metrics: the running sums averaged over
# `step` repetitions, followed by the worst value seen for each metric.
# The first label previously read "LLR"; it now matches the "LR" labels
# used on every other line.
print("Average accuracy for LR is: ", round(aveLogScore / step, 4))
print("Average recall for LR is: ", round(aveLogRecall / step, 4))
print("Average F1 score for LR  is: ", round(aveLogF1 / step, 4))
print("Average Precision for LR is: ", round(aveLogPrecision / step, 4))

print("Worst accuracy for LR is: ", round(wLS, 4))
print("Worst recall for LR is: ", round(wLR, 4))
print("Worst F1 score for LR is: ", round(wLF, 4))
print("Worst Precision for LR is: ", round(wLP, 4))



'''
    8NN结果输出
'''  
# print("Average accuracy for 8NN is: ", round(ave_accurate / step, 4))            
# print("Average recall for 8NN is: ", round(ave_recall / step, 4))
# print("Average F1 score for 8NN is: ", round(ave_F1 / step, 4))
# print("Average Precision for 8NN is: ", round(ave_precision / step, 4))

# print("Worst accuracy for 8NN is: ", round(worst_accurate, 4))            
# print("Worst recall for 8NN is: ", round(worst_recall, 4))
# print("Worst F1 score for 8NN is: ", round(worst_F1, 4))
# print("Worst Precision for 8NN is: ", round(worst_precision, 4))



'''
    KNN结果输出
'''
# for i in range(3):

    # plt.figure()
    # str = "Average accuracy for " + k_vals[i]
    # plt.title('%s'%str)
    # accur = ave_accurate[i] / (1.0 * step)
    # axis = np.arange(4, k_range + 4)
    # plt.plot(axis,accur)
    # plt.show()

    # plt.figure()
    # str = "Worst accuracy for " + k_vals[i]
    # plt.title('%s'%str)
    # accur = worst_accurate[i]
    # axis = np.arange(4, k_range + 4)
    # plt.plot(axis,accur)
    # plt.show()



    # plt.figure()
    # str = "Average recall for " + k_vals[i]
    # plt.title('%s'%str)
    # accur = ave_recall[i] / (1.0 * step)
    # axis = np.arange(4, k_range + 4)
    # plt.plot(axis,accur)
    # plt.show()

    # plt.figure()
    # str = "Worst recall for " + k_vals[i]
    # plt.title('%s'%str)
    # accur = worst_recall[i]
    # axis = np.arange(4, k_range + 4)
    # plt.plot(axis,accur)
    # plt.show()



    # plt.figure()
    # str = "Average precision for " + k_vals[i]
    # plt.title('%s'%str)
    # accur = ave_precision[i] / (1.0 * step)
    # axis = np.arange(4, k_range + 4)
    # plt.plot(axis,accur)
    # plt.show()

    # plt.figure()
    # str = "Worst precision for " + k_vals[i]
    # plt.title('%s'%str)
    # accur = worst_precision[i]
    # axis = np.arange(4, k_range + 4)
    # plt.plot(axis,accur)
    # plt.show()



    # plt.figure()
    # str = "Average F1 score for " + k_vals[i]
    # plt.title('%s'%str)
    # accur = ave_F1[i] / (1.0 * step)
    # axis = np.arange(4, k_range + 4)
    # plt.plot(axis,accur)
    # plt.show()    

    # plt.figure()
    # str = "Worst F1 score for " + k_vals[i]
    # plt.title('%s'%str)
    # accur = worst_F1[i]
    # axis = np.arange(4, k_range + 4)
    # plt.plot(axis,accur)
    # plt.show()    

Average accuracy for LLR is:  0.9526
Average recall for LR is:  0.932
Average F1 score for LR  is:  0.894
Average Precision for LR is:  0.8633
Worst accuracy for LR is:  0.8587
Worst recall for LR is:  0.6316
Worst F1 score for LR is:  0.6667
Worst Precision for LR is:  0.5714
