In [1]:
import numpy as np
from collections import Counter

### 封装函数：计算闵可夫斯基（Minkowski）距离

In [2]:
def distance(a, b, p=2):
    """Return the Minkowski distance of order ``p`` between ``a`` and ``b``.

    Args:
        a: First point; an ndarray or any sequence ``np.asarray`` accepts.
        b: Second point, broadcast-compatible with ``a``.
        p: Order of the distance. ``p=1`` is Manhattan, ``p=2`` (default)
            is Euclidean, and ``p=inf`` is Chebyshev (largest absolute
            coordinate difference).

    Returns:
        The distance as a NumPy scalar.
    """
    # asarray lets plain lists/tuples be passed as well as ndarrays.
    diff = np.abs(np.asarray(a) - np.asarray(b))
    if p == np.inf:
        # The general formula degenerates for p = inf (x ** inf, then ** 0),
        # so use the limit directly. initial=0 keeps empty input at 0, the
        # same value the finite-p branch yields for an empty sum.
        return np.max(diff, initial=0)
    return np.sum(diff ** p) ** (1 / p)

### 封装函数：kNN 分类算法

In [3]:
def kNN_classify(X_train, y_train, X_predict, k=5, p=2):
    """Classify every row of ``X_predict`` with a k-nearest-neighbours vote.

    Args:
        X_train: 2-D array of training samples (one row per sample).
        y_train: Array of training labels, one per row of ``X_train``.
        X_predict: 2-D array of samples to classify; must have the same
            number of columns as ``X_train``.
        k: Number of neighbours that vote (``0 < k <= len(y_train)``).
        p: Order of the Minkowski distance (``p > 0``).

    Returns:
        An ndarray holding one predicted label per row of ``X_predict``.

    Raises:
        AssertionError: If any of the argument checks below fails.
    """
    assert k > 0, 'k需要大于0'
    n_train = y_train.shape[0]
    assert k <= n_train, 'k需要小于或等于总的样本数'
    assert p > 0, 'p需要大于0'
    assert X_train.shape[0] == n_train, 'X_train中样本数量需要与y_train的数量相同'
    assert X_train.shape[1] == X_predict.shape[1], '预测的特征数量需要等于样本的特征数量'

    # Predict each query row independently and collect the labels in order.
    predictions = []
    for sample in X_predict:
        predictions.append(_predict(X_train, y_train, sample, k, p))
    return np.array(predictions)


def _predict(X_train, y_train, x, k, p):
    """Predict the label of one sample by majority vote of its k nearest neighbours.

    Args:
        X_train: Training samples, iterated row by row.
        y_train: Array of training labels, indexable by an index array.
        x: The single sample to classify.
        k: Number of nearest neighbours that vote.
        p: Order of the Minkowski distance passed to ``distance``.

    Returns:
        The most frequent label among the k nearest training samples. When
        several labels share the top count, the one seen first in
        nearest-first order wins (``Counter.most_common`` ordering).
    """
    # Distance from the query point to every training sample.
    distances = [distance(item, x, p=p) for item in X_train]
    # Indices of the k closest samples. A stable sort keeps equidistant
    # samples in training order, so the neighbours chosen at the k-th
    # boundary (and hence the vote) are reproducible.
    nearest = np.argsort(distances, kind='stable')[:k]
    k_labels = y_train[nearest]

    # Majority vote over the neighbours' labels.
    return Counter(k_labels).most_common(1)[0][0]


导入留出法（hold-out）划分函数，用于将数据划分为训练数据集和测试数据集

In [4]:
from ML.model_selection import train_test_split

使用鸢尾花数据

In [5]:
# Load the iris features and labels; np.loadtxt parses both files as float arrays by default.
X, y = np.loadtxt('data/iris/x.txt'), np.loadtxt('data/iris/y.txt')
# NOTE(review): presumably a 25% hold-out with a fixed shuffle seed — confirm against ML.model_selection.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, seed=1
)

### 预测

In [6]:
# Classify the held-out samples; k and p are spelled out with the same values as kNN_classify's defaults.
y_predict = kNN_classify(X_train, y_train, X_test, k=5, p=2)
y_predict

array([0., 1., 1., 0., 2., 1., 2., 0., 0., 2., 1., 0., 2., 1., 1., 0., 1.,
       1., 0., 0., 1., 1., 1., 0., 2., 1., 0., 0., 1., 2., 1., 2., 1., 2.,
       2., 0., 1.])

计算分类准确率

In [7]:
# Accuracy is the fraction of test samples whose predicted label equals the true label.
# The mean of the boolean match array gives that fraction directly; y_predict is
# already an ndarray, so no extra np.array() copy or Python-level sum is needed.
np.mean(y_predict == y_test)

1.0