# 0.3.7 Búsqueda Exhaustiva de hiperparámetros usando GridSearchCV

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

import math
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import warnings
warnings.filterwarnings("ignore")

In [3]:
import sys
sys.path.append("../../../../") 

import utils.paths as path
from utils.paths2 import direcciones

En muchos casos, los modelos contienen diferentes hiperparámetros que controlan su configuración y la estimación de los parámetros. Por ejemplo, en el ejemplo del ajuste del polinomio, el grado n es un hiperparámetro. En este tutorial, se presenta como abordar el problema cuando hay más de un hiperparámetro que debe ser ajustado.

## 0.3.7.1 Parametrización de la búsqueda

In [4]:
#
# Aca se usara una SVM. Dependiendo del tipo de kernel cambian los parámetros
# que pueden ajustarse.
#
# La variable tuned_parameters es una lista de diccionarios que contiene los
# valores que pueden ajustarse.
#
param_grid = [
    # -------------------------------------------------------------------------
    # Primera malla de parámetros
    {
        "kernel": ["rbf"],
        "gamma": [1e-3, 1e-4],
        "C": [1, 10, 100, 1000],
    },
    # -------------------------------------------------------------------------
    # Segunda malla de parámetros
    {
        "kernel": ["linear"],
        "C": [1, 10, 100, 1000],
    },
]

gridSearchCV = GridSearchCV(
    # --------------------------------------------------------------------------
    # This is assumed to implement the scikit-learn estimator interface.
    estimator=SVC(),
    # --------------------------------------------------------------------------
    # Dictionary with parameters names (str) as keys and lists of parameter
    # settings to try as values, or a list of such dictionaries
    param_grid=param_grid,
    # --------------------------------------------------------------------------
    # Determines the cross-validation splitting strategy.
    cv=5,
    # --------------------------------------------------------------------------
    # Strategy to evaluate the performance of the cross-validated model on the
    # test set.
    scoring="accuracy",
    # --------------------------------------------------------------------------
    # Refit an estimator using the best found parameters on the whole dataset.
    refit=True,
    # --------------------------------------------------------------------------
    # If False, the cv_results_ attribute will not include training scores.
    return_train_score=False,
)

## 0.3.7.2 Preparación de los datos

In [5]:
digits = load_digits()

n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)

## 0.3.7.3 Realización de la búsqueda

In [6]:
gridSearchCV.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid=[{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']},
                         {'C': [1, 10, 100, 1000], 'kernel': ['linear']}],
             scoring='accuracy')

## 0.3.7.4 Valores retornados

In [7]:
gridSearchCV.cv_results_

{'mean_fit_time': array([0.03260212, 0.02819662, 0.03659439, 0.01899881, 0.03699703,
        0.01820006, 0.03539886, 0.02020016, 0.02259622, 0.01800022,
        0.01479969, 0.01339917]),
 'std_fit_time': array([0.00049096, 0.00193209, 0.00136052, 0.00062775, 0.0026168 ,
        0.00039988, 0.00049175, 0.00171792, 0.00972897, 0.00141516,
        0.00271499, 0.0007991 ]),
 'mean_score_time': array([0.0140008 , 0.0168005 , 0.01620517, 0.0109971 , 0.01460085,
        0.0105969 , 0.01420317, 0.01099911, 0.00459948, 0.00420113,
        0.00379887, 0.00319982]),
 'std_score_time': array([1.09684607e-03, 1.16693217e-03, 1.47130601e-03, 1.32096803e-05,
        7.92412325e-04, 4.87455554e-04, 3.98445158e-04, 8.98752494e-04,
        8.01577866e-04, 3.99020952e-04, 1.16890759e-03, 4.01020759e-04]),
 'param_C': masked_array(data=[1, 1, 10, 10, 100, 100, 1000, 1000, 1, 10, 100, 1000],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False,

In [8]:
#
# Estimator that was chosen by the search, i.e. estimator which gave highest
# score (or smallest loss if specified) on the left out data.
#
gridSearchCV.best_estimator_

SVC(C=10, gamma=0.001)

In [9]:
gridSearchCV.best_score_

0.9866480446927375

In [10]:
gridSearchCV.best_params_

{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}

## 0.3.7.5 Pronóstico con el mejor modelo

In [11]:
gridSearchCV.predict(X_train)

array([1, 4, 9, 0, 4, 1, 1, 5, 9, 1, 4, 2, 6, 3, 9, 7, 6, 4, 8, 6, 8, 7,
       6, 0, 5, 9, 4, 7, 3, 4, 9, 4, 9, 7, 9, 1, 5, 6, 0, 0, 4, 3, 6, 1,
       0, 9, 4, 8, 7, 5, 9, 8, 4, 5, 0, 1, 6, 0, 5, 5, 0, 4, 3, 2, 8, 7,
       6, 3, 4, 2, 5, 8, 0, 6, 9, 4, 5, 4, 9, 7, 3, 3, 1, 4, 4, 2, 6, 8,
       1, 1, 0, 3, 7, 4, 6, 7, 4, 0, 5, 2, 9, 2, 1, 9, 2, 3, 1, 7, 7, 4,
       5, 6, 5, 6, 7, 8, 1, 4, 3, 4, 4, 3, 5, 3, 3, 4, 7, 9, 8, 0, 6, 1,
       9, 0, 8, 4, 1, 2, 3, 9, 7, 8, 8, 8, 3, 7, 5, 7, 0, 1, 7, 8, 3, 8,
       0, 4, 8, 6, 2, 3, 6, 7, 3, 7, 7, 1, 3, 5, 0, 9, 8, 5, 3, 1, 2, 0,
       3, 6, 0, 3, 4, 1, 2, 3, 1, 0, 5, 8, 9, 3, 9, 6, 6, 8, 9, 0, 7, 8,
       2, 0, 0, 7, 7, 4, 5, 3, 1, 8, 5, 9, 6, 2, 9, 7, 7, 9, 5, 4, 2, 6,
       6, 1, 3, 4, 7, 2, 8, 0, 6, 1, 6, 6, 5, 8, 4, 3, 0, 5, 2, 9, 9, 7,
       8, 0, 5, 0, 6, 3, 3, 5, 1, 5, 1, 7, 9, 6, 4, 5, 0, 1, 8, 7, 8, 8,
       8, 9, 8, 7, 7, 2, 2, 2, 8, 0, 7, 8, 6, 8, 0, 4, 2, 2, 3, 7, 9, 0,
       2, 0, 0, 2, 7, 1, 5, 6, 4, 0, 0, 5, 5, 3, 9,

In [12]:
print('ok_')

ok_
