# 06 网格搜索和更多kNN中的超参数

#### 更多距离的定义
1. 向量空间余弦相似度(Cosine Similarity)
2. 调整余弦相似度(Adjust Cosine Similarity)
3. 皮尔森相关系数(Pearson Correlation Coefficients)
4. Jaccard 相关系数

In [2]:
import numpy as np
import sklearn.datasets as datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [4]:
digits = datasets.load_digits()
X = digits.data
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

In [6]:
sk_knn_clf = KNeighborsClassifier(n_neighbors=4, weights='uniform')
sk_knn_clf.fit(X_train, y_train)
sk_knn_clf.score(X_test, y_test)

0.9916666666666667

### 1. 网格搜索练习

In [7]:
from sklearn.model_selection import GridSearchCV

In [9]:
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': np.arange(1, 11)
    },
    {
        'weights': ['distance'],
        'n_neighbors': np.arange(1, 11),
        'p': np.logspace(1, 32, 7)
    }
]

In [10]:
knn_clf = KNeighborsClassifier()
grid_search = GridSearchCV(knn_clf, param_grid)

In [11]:
%%time
grid_search.fit(X_train, y_train)

CPU times: user 8min 39s, sys: 83 ms, total: 8min 39s
Wall time: 8min 40s


GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'weights': ['uniform'], 'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])}, {'weights': ['distance'], 'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]), 'p': array([  1.00000e+01,   1.46780e+06,   2.15443e+11,   3.16228e+16,
         4.64159e+21,   6.81292e+26,   1.00000e+32])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [12]:
grid_search.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

In [13]:
grid_search.best_score_

0.98469032707028536

参数:
- n_jobs: 参与并行计算的核数
- verbose: 打印输出的信息量