# 1.导入相关库

In [5]:
import tensorflow as tf
import numpy as np
import pandas as pd
import operator
from sklearn.model_selection import RepeatedKFold

# 2.下载数据集

In [2]:
# Location of the raw Iris measurements file. load_data() uses the last path
# segment of this URL as the local cache file name.
data_URL = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'

# Column names assigned to the CSV when it is parsed; the last entry is the
# column that the notebook later pops off as the class label.
CSV_COLUMN_NAMES = [
    'SepalLength',
    'SepalWidth',
    'PetalLength',
    'PetalWidth',
    'Species',
]

def load_data(label_name):
    """Fetch the Iris CSV and split it into a feature table and a label column.

    Args:
        label_name: Name of the column (one of ``CSV_COLUMN_NAMES``) to
            separate out as the label.

    Returns:
        A ``(data_features, data_label)`` tuple: the DataFrame with the label
        column removed, and the Series that was popped out of it.

    Raises:
        KeyError: If ``label_name`` is not a column of the parsed table.
    """
    # Keras fetches `origin` and hands back a local path; the file name is the
    # last segment of the URL.
    data_path = tf.keras.utils.get_file(fname = data_URL.split('/')[-1], origin = data_URL)
    # Column names are supplied explicitly, so no row is treated as a header.
    # `header=None` is the supported spelling for that; a negative value such
    # as -1 is rejected with a ValueError by current pandas releases.
    data = pd.read_csv(filepath_or_buffer = data_path,
                       names = CSV_COLUMN_NAMES,
                       header = None)
    # pop() removes the label column from `data` in place and returns it, so
    # `data` holds only the feature columns afterwards.
    data_label = data.pop(label_name)
    return (data, data_label)

# Load the table once, using the 'Species' column as the class label.
data_feature, data_label = load_data('Species')
# Convert both parts to NumPy arrays; the later cells slice and feed these.
feature, label = np.array(data_feature), np.array(data_label)

# 3.使用TensorFlow设计K近邻模型

In [6]:
#设计KNN模型
def KNN_classifier(X_train, X_test, Y_train, Y_test, K, dims, dist_metric = 'L1'):
    """Evaluate a K-nearest-neighbour classifier and return its accuracy.

    Each test row is compared against every training row, the K closest
    training rows vote with their labels, and the most frequent label wins.
    On a tie the label that was met first (i.e. belonging to the nearer
    neighbour) is chosen.

    Args:
        X_train: 2-D array of training features, one row per sample.
        X_test: 2-D array of test features, one row per sample.
        Y_train: Sequence of training labels, indexable by row number;
            labels must be hashable.
        Y_test: Sequence of true labels for the test rows.
        K: Number of neighbours that vote. ``None`` means "use every
            training sample". Values larger than the training set are
            clamped to its size.
        dims: Unused; kept only so existing callers keep working.
        dist_metric: ``'L1'`` for sum(|a-b|) or ``'L2'`` for
            sqrt(sum((a-b)^2)).

    Returns:
        Fraction of test rows whose predicted label equals the true label,
        as a float in [0, 1].

    Raises:
        ValueError: If ``dist_metric`` is not 'L1'/'L2', if the training or
            test set is empty, or if ``K`` is smaller than 1.
    """
    # Validate everything before any graph is built so bad input fails fast.
    if dist_metric not in ('L1', 'L2'):
        raise ValueError("dist_metric must be 'L1' or 'L2', got %r" % (dist_metric,))
    n_train = len(X_train)
    n_test = len(X_test)
    if n_train == 0:
        raise ValueError('X_train must contain at least one sample')
    if n_test == 0:
        raise ValueError('X_test must contain at least one sample')
    if K is None:
        K = n_train
    elif K < 1:
        raise ValueError('K must be at least 1, got %r' % (K,))
    # top_k cannot return more entries than there are training rows.
    K = min(int(K), n_train)

    # Placeholders and sessions only exist under the v1 compatibility
    # namespace in TensorFlow 2; it is also available in late 1.x releases.
    tf1 = tf.compat.v1

    # Build the ops in a private graph: this function is called repeatedly
    # (once per fold and per K), and adding nodes to the global default graph
    # on every call would make it grow without bound.
    graph = tf.Graph()
    with graph.as_default():
        xtrain = tf1.placeholder(tf.float32)
        xtest = tf1.placeholder(tf.float32)
        # The single test row broadcasts against every training row.
        diff = tf.subtract(xtrain, xtest)
        if dist_metric == 'L1':
            dist = tf.reduce_sum(tf.abs(diff), axis=1)
        else:
            dist = tf.sqrt(tf.reduce_sum(tf.square(diff), axis=1))
        # top_k returns the largest entries, so negate the distances to get
        # the indices of the K smallest ones, nearest first.
        _, index = tf.nn.top_k(-dist, k = K)

    correct = 0
    with tf1.Session(graph = graph) as sess:
        # The graph scores one test row at a time, so loop over the rows.
        for i in range(n_test):
            idx = sess.run(index, feed_dict = {xtrain: X_train, xtest: X_test[i]})
            # Count how often each label occurs among the K neighbours.
            votes = {}
            for j in idx:
                neighbour_label = Y_train[j]
                votes[neighbour_label] = votes.get(neighbour_label, 0) + 1
            # max() keeps the first entry among equal counts; dicts preserve
            # insertion order, so ties resolve towards the nearer neighbour.
            predicted_label = max(votes.items(), key = operator.itemgetter(1))[0]
            if predicted_label == Y_test[i]:
                correct += 1
    return correct / n_test

# 4.训练模型

In [7]:
# Split all samples into a training part (first 80% of rows) and a test part
# (remaining rows), then score a single K on that split.
# NOTE(review): the rows are taken in file order without shuffling. If the
# source file is grouped by species (believed to be the case for the UCI Iris
# file - confirm), the test part contains a single class and the accuracy is
# not representative; consider shuffling with a fixed seed before splitting.
ratio = 0.8
num_example = feature.shape[0]
# Built-in int() truncates exactly like the old `np.int` alias, which was
# removed from NumPy and now raises AttributeError.
s = int(num_example * ratio)
X_train = feature[:s]
Y_train = label[:s]
X_test = feature[s:]
Y_test = label[s:]
# Keep K in one place so the call and the printed report cannot disagree.
k_demo = 7
accuracy = KNN_classifier(X_train, X_test, Y_train, Y_test, K = k_demo, dims = 5, dist_metric = 'L1')
print('K = %d,accuracy = %.2f' %(k_demo,accuracy))

K = 7,accuracy = 0.77


# 5.验证模型

In [15]:
# Tune K with k-fold cross-validation: every fold serves once as the
# validation set while the remaining folds form the training set.
folds = 5

# array_split produces the same equal-sized folds as split when the row count
# is divisible by `folds`, and still works when it is not.
X_folds = np.array_split(feature, folds)
y_folds = np.array_split(label, folds)

K_choices = [1, 2, 4, 5, 7, 8, 9, 10, 15, 20]
# One list of per-fold accuracies for every candidate K.
accuracy_of_K = {k: [] for k in K_choices}

for i in range(folds):
    # Training data = all folds except fold i; validation data = fold i.
    X_train = np.vstack(X_folds[:i] + X_folds[i+1:])
    X_val = X_folds[i]
    y_train = np.hstack(y_folds[:i] + y_folds[i+1:])
    y_val = y_folds[i]
    for k in K_choices:
        accuracy = KNN_classifier(X_train, X_val, y_train, y_val, K = k, dims = 5, dist_metric = 'L1')
        accuracy_of_K[k].append(accuracy)

# Report the mean accuracy per K in ascending order of K. The same ordered
# list is reused below so that results[i] always belongs to K_sorted[i].
K_sorted = sorted(K_choices)
results = []
for k in K_sorted:
    accuracy_of_K[k] = np.array(accuracy_of_K[k])
    # Averaging over however many folds were run, not a hard-coded count.
    accuracy = np.mean(accuracy_of_K[k])
    results.append(accuracy)
    print('K = %d,accuracy = %.1f%%' %(k,accuracy*100))

# Collect every K that reaches the best mean accuracy (there may be ties).
acc = max(results)
K_best = [k for k, mean_accuracy in zip(K_sorted, results) if mean_accuracy == acc]

for k in K_best:
    print('最优K值为：', k)

K = 1,accuracy = 90.7%
K = 2,accuracy = 92.7%
K = 4,accuracy = 93.3%
K = 5,accuracy = 92.0%
K = 7,accuracy = 92.0%
K = 8,accuracy = 92.7%
K = 9,accuracy = 92.0%
K = 10,accuracy = 92.0%
K = 15,accuracy = 90.7%
K = 20,accuracy = 88.7%
最优K值为： 4
