# 超参数(在运行kNN算法之前传入的参数)

+ 超参数：在算法运行前需要决定的参数
+ 模型参数：算法过程中学习的参数

> kNN算法没有模型参数，kNN算法中的k是典型的超参数

如何寻找好的超参数：
+ 领域知识
+ 经验数值
+ 实验搜索

In [2]:
import numpy as np
from sklearn import datasets

In [3]:
# Load the handwritten-digits dataset from sklearn.datasets; the labels are
# the digits 0-9.
digits = datasets.load_digits()
X, y = digits.data, digits.target

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set.
# random_state pins the shuffle so that the split -- and therefore every score
# and "best" hyperparameter computed from it -- is identical on each run.
# Without a seed the results change every time the kernel is restarted, so
# numbers recorded from earlier unseeded runs will not match exactly.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [5]:
from sklearn.neighbors import KNeighborsClassifier

In [6]:
# Baseline: a 3-nearest-neighbours classifier trained on the training split.
# fit() returns the estimator itself, so construction and training are chained.
knn_clf = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
# Accuracy on the held-out test split (displayed as the cell's result).
knn_clf.score(X_test, y_test)

0.9916666666666667

# 寻找最好的k

In [7]:
# Best score seen so far: the search keeps trying values of k and records the
# highest score it obtains in best_score.
best_score = 0.0

In [8]:
# The number of neighbours (k) that gave the best score; -1 is a placeholder
# until the search assigns a real k.
best_k = -1

In [9]:
# Brute-force search for a good k with a plain for loop (a later section shows
# that sklearn provides a ready-made way to obtain the best parameter values).
#
# The running best is reset here so the cell is self-contained: other cells
# also assign best_score / best_k, and re-running this loop after them would
# otherwise compare against stale values and report a wrong result.
best_score = 0.0
best_k = -1
for k in range(1, 11):
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(X_train, y_train)
    score = knn_clf.score(X_test, y_test)
    if score > best_score:  # strict ">" keeps the smallest k on ties
        best_k = k
        best_score = score

print("best_k = ", best_k)
# If best_k lands on either end of the searched range, widen the range of k
# and search again.
print("best_score = ", best_score)

best_k =  3
best_score =  0.991666666667


# 另一个超参数：距离的权重
> k近邻算法的权重参数的含义：weights="distance" 时，距离越近的邻居投票权重越大（权重取距离的倒数），类似加权图中的边的权重值；weights="uniform" 时所有邻居权重相同
![k近邻算法的权重参数的含义](images/k近邻算法的权重参数的含义.jpg)



In [11]:
# Search jointly over the voting scheme (the `weights` argument) and the
# number of neighbours k, remembering the combination with the highest score
# on the test split.
best_method, best_score, best_k = "", 0.0, -1
for method in ("uniform", "distance"):  # how neighbours' votes are weighted
    for k in range(1, 11):  # number of neighbours to consult
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights=method).fit(
            X_train, y_train
        )
        score = knn_clf.score(X_test, y_test)
        if score > best_score:
            best_method, best_k, best_score = method, k, score
print("best_method = " + best_method)
print("best_k = ", best_k)
# If best_k lands on either end of the searched range, widen the range of k
# and search again.
print("best_score = ", best_score)

best_method = uniform
best_k =  3
best_score =  0.991666666667


# 另一个超参数：闵可夫斯基距离的参数p

> 曼哈顿距离和欧氏距离（欧几里得距离）的通用性提取---->闵可夫斯基距离（Minkowski distance）
![明科夫斯基距离1](images/明科夫斯基距离1.jpg)

> 曼哈顿距离和欧氏距离（欧几里得距离）的通用性提取---->闵可夫斯基距离：p=1 时即曼哈顿距离，p=2 时即欧氏距离
![明科夫斯基距离2](images/明科夫斯基距离2.jpg)


In [13]:
%%time
# Search jointly over k and p, where p is the power parameter of the Minkowski
# distance (p=1 Manhattan, p=2 Euclidean). Voting is fixed to
# weights="distance" for every candidate.
best_p = -1  # best Minkowski p so far; integer placeholder, consistent with best_k
best_score = 0.0
best_k = -1
for k in range(1, 11):
    for p in range(1, 6):
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights="distance", p=p)
        knn_clf.fit(X_train, y_train)
        score = knn_clf.score(X_test, y_test)
        if score > best_score:  # strict ">" keeps the first (k, p) on ties
            best_p = p
            best_k = k
            best_score = score
print("best_p = ", best_p)
print("best_k = ", best_k)
# If best_k lands on either end of the searched range, widen the range of k
# and search again.
print("best_score = ", best_score)

best_p =  2
best_k =  3
best_score =  0.991666666667
CPU times: user 20.4 s, sys: 352 ms, total: 20.8 s
Wall time: 22.6 s
