# S04L11 - Intro Model Selection

- Instructor: Dalcimar Casanova (dalcimar@gmail.com)
- Course website: https://www.dalcimar.com/disciplinas/aprendizado-de-maquina
- Bibliography: based on lectures of Dr. Sebastian Raschka
- Course website: http://pages.stat.wisc.edu/~sraschka/teaching/

### TAREFA L11
- Pedro Gomes
- CD.2021-1.M4.IRP

## Scikit-learn pipelines
- Scikit-learn pipelines are an extremely convenient and powerful concept -- one of the things that sets scikit-learn apart from other machine learning libraries.
- Pipelines basically let us define a series of perprocessing steps together with fitting an estimator.
- Pipelines will automatically take care of pitfalls like estimating feature scaling parameters from the training set and applying those to scale new data (which we discussed earlier in the context of z-score standardization).
- Below is an visualization of how pipelines work.

<img src="https://github.com/rasbt/stat451-machine-learning-fs20/raw/ee813e1c30a5610a2e6475a77c67c1174a63b75c/L05/code/images/sklearn-pipeline.png" width="400">

Below is an example pipeline that combines the feature scaling step, PCA and the kNN classifier.

In [1]:
import numpy as np

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'numpy'

#### - carregamento dos dados

In [None]:
#carrega o pandas e le os dados do csv para um pandas.dataframe
import pandas as pd

df = pd.read_csv('./data/breast_cancer.csv')  # --> https://edisciplinas.usp.br/mod/resource/view.php?id=2173861
df.head()

Unnamed: 0,sample_id,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,...,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,diagnosis
0,842302,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,malignant
1,842517,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,malignant
2,84300903,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,malignant
3,84348301,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,malignant
4,84358402,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,malignant


In [None]:
#removida coluna sample_id
del df['sample_id']
df.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,...,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,malignant
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,malignant
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,malignant
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,malignant
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,malignant


In [None]:
#identificando os valores não numericos da coluna diagnosis
df['diagnosis'].unique()

array(['malignant', 'benign'], dtype=object)

In [None]:
#criando o dicionario e fazendo o map para transformar os valores acima em inteiros
d = {
    'malignant' : 0,
    'benign' : 1
    }

df['diagnosis'] = df['diagnosis'].map(d)
df.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,...,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [None]:
#carregando a coluna diagnosis para o eixo Y
y = df['diagnosis'].values
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [None]:
#carregando as demais colunas, exceto a sample_id (removida) e diagnosis(30)
X = df.iloc[:, 0:29].values
X

array([[ 17.99  ,  10.38  , 122.8   , ...,   0.7119,   0.2654,   0.4601],
       [ 20.57  ,  17.77  , 132.9   , ...,   0.2416,   0.186 ,   0.275 ],
       [ 19.69  ,  21.25  , 130.    , ...,   0.4504,   0.243 ,   0.3613],
       ...,
       [ 16.6   ,  28.08  , 108.3   , ...,   0.3403,   0.1418,   0.2218],
       [ 20.6   ,  29.33  , 140.1   , ...,   0.9387,   0.265 ,   0.4087],
       [  7.76  ,  24.54  ,  47.92  , ...,   0.    ,   0.    ,   0.2871]])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=1, stratify=y)

In [None]:
pipe = Pipeline([
        ('z-score', StandardScaler()),
        ('reduce_dim', PCA(n_components=3)),
        ('classify', KNeighborsClassifier(n_neighbors=2))])

In [None]:
pipe.fit(X_train, y_train) #lado preto

Pipeline(steps=[('z-score', StandardScaler()),
                ('reduce_dim', PCA(n_components=3)),
                ('classify', KNeighborsClassifier(n_neighbors=2))])

In [None]:
from sklearn.metrics import accuracy_score

y_train_pred = pipe.predict(X_train) #lado branco
accuracy_score(y_train, y_train_pred)

0.9758241758241758

In [None]:
from sklearn.metrics import accuracy_score

y_test_pred = pipe.predict(X_test)
accuracy_score(y_test, y_test_pred)

0.9210526315789473

As you can see above, the Pipeline itself follows the scikit-learn estimator API.

## Scikit-learn grid-search

- In machine learning practice, we often need to experiment with an machine learning algorithm's hyperparameters to find a good setting.
- The process of tuning hyperparameters and comparing and selecting the resulting models is also called "model selection" (in contrast to "algorithm selection").
- We will cover topics such as "model selection" and "algorithm selection" in more detail later in this course.
- For now, we are introducing the simplest way of performing model selection: using the "holdout method."
- In the holdout method, we split a dataset into 3 subsets: a training, a validation, and a test datatset.
- To avoid biasing the estimate of the generalization performance, we only want to use the test dataset once, which is why we use the validation dataset for hyperparameter tuning (model selection).
- Here, the validation dataset serves as an estimate of the generalization performance, too, but it becomes more biased than the final estimate on the test data because of its repeated re-use during model selection (think of "multiple hypothesis testing").

<img src="https://github.com/rasbt/stat451-machine-learning-fs20/raw/ee813e1c30a5610a2e6475a77c67c1174a63b75c/L05/code/images/holdout-tuning.png" width="400">

In [None]:
param_grid = {
    'reduce_dim__n_components': [1, 2, 3, 4],
    'classify__n_neighbors': [1, 2, 3, 4, 5]
}

grid = GridSearchCV(pipe, cv=2, n_jobs=1, param_grid=param_grid, scoring='accuracy')

In [None]:
grid.fit(X_train, y_train) #treina, avalia e muda parametro

GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('z-score', StandardScaler()),
                                       ('reduce_dim', PCA(n_components=3)),
                                       ('classify',
                                        KNeighborsClassifier(n_neighbors=2))]),
             n_jobs=1,
             param_grid={'classify__n_neighbors': [1, 2, 3, 4, 5],
                         'reduce_dim__n_components': [1, 2, 3, 4]},
             scoring='accuracy')

In [None]:
print(grid.cv_results_)

{'mean_fit_time': array([0.00347364, 0.00199556, 0.00248981, 0.00199735, 0.00149691,
       0.00099707, 0.00199139, 0.00150979, 0.00199449, 0.0024997 ,
       0.00201213, 0.00297439, 0.00199473, 0.00199413, 0.00149655,
       0.00149655, 0.00200498, 0.001495  , 0.00249672, 0.0029757 ]), 'std_fit_time': array([5.15818596e-04, 0.00000000e+00, 4.94718552e-04, 3.57627869e-07,
       5.00559807e-04, 7.15255737e-07, 4.41074371e-06, 5.07235527e-04,
       3.57627869e-07, 4.72426414e-04, 1.72853470e-05, 9.81450081e-04,
       1.07288361e-06, 9.53674316e-07, 4.98294830e-04, 4.99010086e-04,
       9.41753387e-06, 4.99606133e-04, 4.68492508e-04, 9.81807709e-04]), 'mean_score_time': array([0.01160467, 0.00849092, 0.00698042, 0.00798774, 0.00699067,
       0.00796235, 0.00648522, 0.00696623, 0.00648367, 0.00698662,
       0.00747895, 0.00749373, 0.00746667, 0.00847709, 0.00748014,
       0.00798059, 0.00647604, 0.00696278, 0.00899243, 0.00797629]), 'std_score_time': array([0.00063479, 0.00247896, 0

In [None]:
grid.cv_results_['mean_test_score']

array([0.87480679, 0.91874372, 0.93626826, 0.92966999, 0.85939794,
       0.91871474, 0.92747701, 0.92966033, 0.89236031, 0.92750599,
       0.9208884 , 0.94066388, 0.89676559, 0.92967965, 0.92744803,
       0.93626826, 0.89898756, 0.93628758, 0.92743836, 0.94944547])

In [None]:
# melhor combinacao
print(grid.best_score_)
print(grid.best_params_)

0.9494454749207821
{'classify__n_neighbors': 5, 'reduce_dim__n_components': 4}


In [None]:
clf = grid.best_estimator_ #novo classificador, baseado no melhor valor

In [None]:
y_test_pred = clf.predict(X_test) #ou grid.predict -> que escolheria o melhor
accuracy_score(y_test, y_test_pred)

0.9649122807017544