# KNN实现

In [1]:
import numpy as np
import pandas as pd

In [20]:
from sklearn.datasets  import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [8]:
# Load the iris dataset via sklearn.datasets.load_iris.
iris = load_iris()

In [9]:
# Wrap the feature matrix in a DataFrame with one column per feature name.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

In [13]:
# Add a 'class' column holding the species name that corresponds to each
# integer target code (code i -> iris.target_names[i]).
df['class'] = pd.Series(iris.target, index=df.index).map(
    dict(enumerate(iris.target_names))
)
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [14]:
# Display summary statistics of the DataFrame's numeric columns.
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [17]:
# x is the feature matrix; y is the target vector reshaped into a single
# column (shape (n_samples, 1)).
x, y = iris.data, iris.target.reshape(-1, 1)

In [60]:
# Split the dataset: 30% held out for testing, stratified on the labels,
# with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=35,
    stratify=y,
)

In [27]:
# 距离计算
def l1_distance(a, b):
    """Return the Manhattan (L1) distance between ``a`` and ``b``.

    The difference ``a - b`` is formed first (NumPy broadcasting applies),
    then the absolute differences are summed along axis 1, so the
    difference must have at least two dimensions. The result holds one
    distance per row.
    """
    abs_diff = np.abs(a - b)
    return abs_diff.sum(axis=1)


def l2_distance(a, b):
    """Return the Euclidean (L2) distance between ``a`` and ``b``.

    The difference ``a - b`` is formed first (NumPy broadcasting applies),
    squared element-wise and summed along axis 1; the square root of each
    row sum is returned, giving one distance per row. The difference must
    therefore have at least two dimensions.
    """
    # ``axis`` belongs to np.sum: each row has to be reduced on its own
    # before the square root is taken.
    squared_diff = (a - b) ** 2
    return np.sqrt(np.sum(squared_diff, axis=1))

In [73]:
# 分类器实现
class kNN(object):
    """Brute-force k-nearest-neighbours classifier.

    Parameters
    ----------
    n_nieghbors : int, default 1
        Number of nearest training samples that vote on each prediction.
        (The spelling is kept as-is because callers pass it by keyword.)
    dist_func : callable, default l1_distance
        Called as ``dist_func(x_train, sample)``; must return one distance
        per training sample.

    Raises
    ------
    ValueError
        If ``n_nieghbors`` is smaller than 1.
    """

    def __init__(self, n_nieghbors=1, dist_func=l1_distance):
        # A non-positive k would either leave no voters or, for negative
        # values, slice the sorted neighbours from the wrong end.
        if n_nieghbors < 1:
            raise ValueError(
                "n_nieghbors must be at least 1, got %r" % (n_nieghbors,)
            )
        self.n_nieghbors = n_nieghbors
        self.dist_func = dist_func

    def fit(self, x, y):
        """Store the training samples ``x`` and their labels ``y``.

        No computation happens here; the stored arrays are searched in
        ``predict``. ``y`` must be a NumPy array (its ``dtype`` is read
        and it is indexed with an index array in ``predict``).
        """
        self.x_train = x
        self.y_train = y

    def predict(self, x):
        """Predict a label for every row of ``x``.

        Returns an array of shape ``(x.shape[0], 1)`` with the dtype of
        the training labels. Each entry is the most frequent label among
        the ``n_nieghbors`` closest training samples; when several labels
        are equally frequent the smallest one is chosen.
        """
        y_pred = np.zeros((x.shape[0], 1), dtype=self.y_train.dtype)

        for i, sample in enumerate(x):
            # Distance from this sample to every training sample.
            distances = self.dist_func(self.x_train, sample)

            # Training-sample indices ordered from nearest to farthest.
            nn_index = np.argsort(distances)

            # Labels of the k nearest training samples, flattened to 1-D.
            nn_y = self.y_train[nn_index[:self.n_nieghbors]].ravel()

            # Majority vote. np.unique returns the labels sorted, so the
            # first maximum is the smallest label among ties; unlike
            # np.bincount it is not limited to non-negative integers.
            labels, counts = np.unique(nn_y, return_counts=True)
            y_pred[i] = labels[np.argmax(counts)]

        return y_pred

In [80]:
# Build a classifier that votes among the 5 nearest neighbours.
knn = kNN(n_nieghbors=5)

# Store the training split inside the classifier.
knn.fit(x_train, y_train)

# Predict a label for every sample in the test split.
y_pred = knn.predict(x_test)

# Compare the predictions with the test labels using accuracy_score.
accuracy = accuracy_score(y_test, y_pred)

print(accuracy)

0.9777777777777777
