# Sklearn

## sklearn.grid_search

документация: http://scikit-learn.org/stable/modules/grid_search.html

In [1]:
from sklearn import datasets, linear_model, metrics
from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn import model_selection

import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

### Генерация датасета

In [2]:
iris = datasets.load_iris()

In [3]:
train_data, test_data, train_labels, test_labels = model_selection.train_test_split(iris.data, iris.target, 
                                                                                     test_size = 0.3,random_state = 0)

### Задание модели

In [4]:
classifier = linear_model.SGDClassifier(random_state = 0)

### Генерация сетки

In [5]:
classifier.get_params().keys()

dict_keys(['alpha', 'average', 'class_weight', 'early_stopping', 'epsilon', 'eta0', 'fit_intercept', 'l1_ratio', 'learning_rate', 'loss', 'max_iter', 'n_iter', 'n_iter_no_change', 'n_jobs', 'penalty', 'power_t', 'random_state', 'shuffle', 'tol', 'validation_fraction', 'verbose', 'warm_start'])

In [6]:
parameters_grid = {
    'loss' : ['hinge', 'log', 'squared_hinge', 'squared_loss'],
    'penalty' : ['l1', 'l2'],
    'n_iter' : range(5,10),
    'alpha' : np.linspace(0.0001, 0.001, num = 5),
}

In [7]:
cv = model_selection.StratifiedShuffleSplit(n_splits=10, test_size = 0.2, random_state = 0)

### Подбор параметров и оценка качества

#### Grid search

In [8]:
grid_cv = GridSearchCV(classifier, parameters_grid, scoring = 'accuracy')

In [9]:
train_data[0:5]

array([[5. , 2. , 3.5, 1. ],
       [6.5, 3. , 5.5, 1.8],
       [6.7, 3.3, 5.7, 2.5],
       [6. , 2.2, 5. , 1.5],
       [6.7, 2.5, 5.8, 1.8]])

In [10]:
train_labels

array([1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 0, 2, 1, 1, 1, 1,
       2, 0, 0, 2, 1, 0, 0, 1, 0, 2, 1, 0, 1, 2, 1, 0, 2, 2, 2, 2, 0, 0,
       2, 2, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 1, 2, 2, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 2, 1, 2, 1, 0, 2, 0, 2, 0, 0, 2, 0, 2, 1, 1, 1, 2, 2, 1,
       1, 0, 1, 2, 2, 0, 1, 1, 1, 1, 0, 0, 0, 2, 1, 2, 0])

In [11]:
%%time
grid_cv.fit(train_data, train_labels)

Wall time: 1.81 s


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=0, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'loss': ['hinge', 'log', 'squared_hinge', 'squared_loss'], 'penalty': ['l1', 'l2'], 'n_iter': range(5, 10), 'alpha': array([0.0001 , 0.00032, 0.00055, 0.00078, 0.001  ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [12]:
grid_cv.best_estimator_

SGDClassifier(alpha=0.001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=8, n_iter_no_change=5, n_jobs=None, penalty='l1',
       power_t=0.5, random_state=0, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [13]:
print (grid_cv.best_score_)
print (grid_cv.best_params_)

0.9619047619047619
{'alpha': 0.001, 'loss': 'hinge', 'n_iter': 8, 'penalty': 'l1'}


In [14]:
grid_cv.cv_results_

{'mean_fit_time': array([0.00266997, 0.0026594 , 0.00199127, 0.00133014, 0.00101399,
        0.        , 0.00520833, 0.        , 0.00520619, 0.        ,
        0.        , 0.00520817, 0.00317065, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.0052073 , 0.        , 0.00520698, 0.        , 0.00520794,
        0.00216929, 0.        , 0.        , 0.00520905, 0.00520714,
        0.        , 0.00520698, 0.        , 0.        , 0.00520786,
        0.        , 0.        , 0.        , 0.00520762, 0.        ,
        0.        , 0.00521008, 0.        , 0.        , 0.0052069 ,
        0.        , 0.        , 0.        , 0.        , 0.00520809,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.00033228, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.00133522, 0.        ,
        0.00520976, 0.        , 0.00520738, 0.        , 0.        ,
        0.        , 0.0052069 ,

#### Randomized grid search

In [15]:
randomized_grid_cv = RandomizedSearchCV(classifier, parameters_grid, scoring = 'accuracy', cv = cv, n_iter = 20, 
                                                   random_state = 0)

In [16]:
%%time
randomized_grid_cv.fit(train_data, train_labels)

Wall time: 586 ms


RandomizedSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=0, test_size=0.2,
            train_size=None),
          error_score='raise-deprecating',
          estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=0, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=20, n_jobs=None,
          param_distributions={'loss': ['hinge', 'log', 'squared_hinge', 'squared_loss'], 'penalty': ['l1', 'l2'], 'n_iter': range(5, 10), 'alpha': array([0.0001 , 0.00032, 0.00055, 0.00078, 0.001  ])},
          pre_dispatch='2*n_jobs', random_state=0, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=0)

In [17]:
print (randomized_grid_cv.best_score_)
print (randomized_grid_cv.best_params_)

0.8666666666666667
{'penalty': 'l1', 'n_iter': 9, 'loss': 'log', 'alpha': 0.00055}


## Мнение поступившего на курс
Поиск по сетке позволяет обучить алгоритм несколько раз на разных параметрах, предложенных нами, и показать алгоритм, который даёт наилучшее качество. Об оптимизации речи не идёт) Такой поиск не очень быстрый, так как он перебирает все возможные значения. Чтобы уменьшить время поиска лучшего алгоритма, используют случайный поиск по сетке, который перебирает не все варианты ответов.