**оценка классификатора и поиск хороших гиперпараметров для модели**


Евгений Борисов <esborisov@sevsu.ru>


# загружаем датасет

In [1]:
from sklearn.datasets import load_wine

# load the wine dataset and keep the feature matrix and the class labels
wine = load_wine()
X = wine.data
target = wine.target
display(X.shape)

(178, 13)

In [2]:
# split the dataset into training and test halves
from sklearn.model_selection import train_test_split

# A fixed random_state makes the shuffle, and therefore every metric computed
# from this split, reproducible between runs; without it the default-model
# report and the tuned-model report change on every execution.
# NOTE: outputs recorded from an unseeded run will differ after re-running.
X_train, X_test, y_train, y_test = train_test_split(
    X, target, test_size=.5, random_state=42,
)
display((X_train.shape, X_test.shape))

# the full arrays are not used after the split
del X
del target

((89, 13), (89, 13))

# загружаем и обучаем модель классификатора

In [3]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with default hyperparameters, fitted on the training half
model = KNeighborsClassifier()
model.fit(X_train, y_train)

# тестируем классификатор

In [4]:
from sklearn.metrics import classification_report

# predictions of the default model on the test half
predicted = model.predict(X_test)

# table of classification quality metrics on the test set
print(classification_report(y_true=y_test, y_pred=predicted))

              precision    recall  f1-score   support

           0       0.75      1.00      0.86        30
           1       0.61      0.59      0.60        32
           2       0.50      0.33      0.40        27

    accuracy                           0.65        89
   macro avg       0.62      0.64      0.62        89
weighted avg       0.62      0.65      0.63        89



# поиск хороших гиперпараметров модели

In [5]:
from sklearn.model_selection import GridSearchCV

In [6]:
# show the estimator's documentation, including its constructor parameters
help(KNeighborsClassifier)

Help on class KNeighborsClassifier in module sklearn.neighbors._classification:

class KNeighborsClassifier(sklearn.neighbors._base.KNeighborsMixin, sklearn.base.ClassifierMixin, sklearn.neighbors._base.NeighborsBase)
 |  KNeighborsClassifier(n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None)
 |  
 |  Classifier implementing the k-nearest neighbors vote.
 |  
 |  Read more in the :ref:`User Guide <classification>`.
 |  
 |  Parameters
 |  ----------
 |  n_neighbors : int, default=5
 |      Number of neighbors to use by default for :meth:`kneighbors` queries.
 |  
 |  weights : {'uniform', 'distance'}, callable or None, default='uniform'
 |      Weight function used in prediction.  Possible values:
 |  
 |      - 'uniform' : uniform weights.  All points in each neighborhood
 |        are weighted equally.
 |      - 'distance' : weight points by the inverse of their distance.
 |        in this case, closer neighb

In [7]:
from scipy.spatial import distance
# dir(distance)  # disabled: would list every name in the module
# show the distance metric names known to scipy
# NOTE(review): _METRICS_NAMES is underscore-prefixed (private), so it may change between scipy versions
display( distance._METRICS_NAMES )

['braycurtis',
 'canberra',
 'chebyshev',
 'cityblock',
 'correlation',
 'cosine',
 'dice',
 'euclidean',
 'hamming',
 'jaccard',
 'jensenshannon',
 'kulczynski1',
 'mahalanobis',
 'minkowski',
 'rogerstanimoto',
 'russellrao',
 'seuclidean',
 'sokalmichener',
 'sokalsneath',
 'sqeuclidean',
 'yule']

In [8]:
# Candidate distance metrics for the grid search: the scipy metric names
# minus a handful that are deliberately left out.
# NOTE(review): the reason these five are skipped is not recorded here;
# presumably they need extra parameters or fail with this estimator - confirm.
skipped = {'jensenshannon', 'kulczynski1', 'mahalanobis', 'seuclidean', 'yule'}

metrics = [
    name
    for name in (
        'braycurtis', 'canberra', 'chebyshev', 'cityblock',
        'correlation', 'cosine', 'dice', 'euclidean',
        'hamming', 'jaccard', 'jensenshannon', 'kulczynski1',
        'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
        'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
        'yule',
    )
    if name not in skipped
]

In [9]:
# search space: every neighbour count from 1 to 9 combined with every candidate metric
param_grid = dict(
    n_neighbors=range(1, 10),
    metric=metrics,
)

In [10]:
# run the search for good hyperparameter combinations on the training half
grid = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid)
grid.fit(X_train, y_train)

In [11]:
import pandas as pd

# one row per hyperparameter combination tried, with timings and test scores
cv_table = pd.DataFrame(grid.cv_results_)
display(cv_table)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000556,0.000075,0.001859,0.000157,braycurtis,1,"{'metric': 'braycurtis', 'n_neighbors': 1}",0.833333,0.833333,0.888889,0.722222,0.823529,0.820261,0.054174,12
1,0.000513,0.000034,0.001898,0.000130,braycurtis,2,"{'metric': 'braycurtis', 'n_neighbors': 2}",0.666667,0.833333,0.833333,0.611111,0.764706,0.741830,0.089449,63
2,0.000490,0.000021,0.001839,0.000095,braycurtis,3,"{'metric': 'braycurtis', 'n_neighbors': 3}",0.722222,0.833333,0.833333,0.722222,0.705882,0.763399,0.057412,55
3,0.000501,0.000030,0.001830,0.000068,braycurtis,4,"{'metric': 'braycurtis', 'n_neighbors': 4}",0.666667,0.833333,0.888889,0.666667,0.823529,0.775817,0.091867,32
4,0.000485,0.000010,0.001811,0.000070,braycurtis,5,"{'metric': 'braycurtis', 'n_neighbors': 5}",0.611111,0.888889,0.888889,0.722222,0.764706,0.775163,0.105539,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,0.000354,0.000005,0.001739,0.000282,sqeuclidean,5,"{'metric': 'sqeuclidean', 'n_neighbors': 5}",0.666667,0.833333,0.777778,0.666667,0.764706,0.741830,0.065555,65
140,0.000374,0.000042,0.001977,0.000778,sqeuclidean,6,"{'metric': 'sqeuclidean', 'n_neighbors': 6}",0.666667,0.888889,0.833333,0.722222,0.764706,0.775163,0.078741,33
141,0.000363,0.000019,0.001807,0.000465,sqeuclidean,7,"{'metric': 'sqeuclidean', 'n_neighbors': 7}",0.666667,0.888889,0.833333,0.722222,0.764706,0.775163,0.078741,33
142,0.000358,0.000011,0.002168,0.000784,sqeuclidean,8,"{'metric': 'sqeuclidean', 'n_neighbors': 8}",0.666667,0.833333,0.833333,0.777778,0.764706,0.775163,0.061082,33


In [12]:
# show the best score found by the search, then the parameters that produced it
for best in (grid.best_score_, grid.best_params_):
    display(best)

0.977124183006536

{'metric': 'canberra', 'n_neighbors': 3}

In [13]:
# keep the estimator that the grid search selected as the best one
model_ = grid.best_estimator_

In [14]:
# check the result (compare with the report above, produced with default parameters)
predicted_ = model_.predict(X_test)

# table of classification quality metrics on the test set
report_ = classification_report(y_true=y_test, y_pred=predicted_)
print(report_)

              precision    recall  f1-score   support

           0       0.94      1.00      0.97        30
           1       1.00      0.94      0.97        32
           2       1.00      1.00      1.00        27

    accuracy                           0.98        89
   macro avg       0.98      0.98      0.98        89
weighted avg       0.98      0.98      0.98        89

