# L11 - Introdução a seleção de modelos

Importando as bibliotecas

In [112]:
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

Atribuindo a um DataFrame os dados do arquivo .csv

In [113]:
df = pd.read_csv('https://raw.githubusercontent.com/jefferson-oliva/databases/main/breast_cancer.csv')
df.head()

Unnamed: 0,sample_id,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,...,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,diagnosis
0,842302,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,malignant
1,842517,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,malignant
2,84300903,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,malignant
3,84348301,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,malignant
4,84358402,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,malignant


Verificando a quantidade de linhas e colunas do DataFrame

In [114]:
df.shape

(569, 32)

Verificando quais os valores distintos da coluna "diagnosis"

In [115]:
df['diagnosis'].unique()

array(['malignant', 'benign'], dtype=object)

Criando um dicionário onde são armazenados valores númericos para cada valor distinto da coluna "diagnosis" e substituindo os dados da coluna original do DataFrame pelos valores armazenados no dicionário

In [116]:
d = {'benign': 0, 
     'malignant': 1}

df['diagnosis'] = df['diagnosis'].map(d)
df.head()

Unnamed: 0,sample_id,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,...,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,diagnosis
0,842302,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1
1,842517,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1
2,84300903,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1
3,84348301,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,1
4,84358402,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,1


Atribuindo os valores e labels as varáveis X e y

In [117]:
X = df.iloc[:, 1:31].values
y = df['diagnosis'].values

Armazenando em vaviáveis os dados de treino e teste

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=1, stratify=y)

Criando a pipeline

In [119]:
pipe = Pipeline([
        ('z-score', StandardScaler()),
        ('reduce_dim', PCA(n_components=5)),
        ('classify', KNeighborsClassifier(n_neighbors=3))])

Realizando o treinamento

In [120]:
pipe.fit(X_train, y_train)

Verificando a acurácia dos dados de treino

In [121]:
from sklearn.metrics import accuracy_score

y_train_pred = pipe.predict(X_train)
accuracy_score(y_train, y_train_pred)

0.9802197802197802

Verificando a acurácia dos dados de teste

In [122]:
from sklearn.metrics import accuracy_score

y_test_pred = pipe.predict(X_test)
accuracy_score(y_test, y_test_pred)

0.9473684210526315

## Scikit-learn grid-search

Atribundo os parâmetros ao Gridsearch

In [123]:
n_components = np.arange(1, 31)
n_neighbors = np.arange(1, 5)

param_grid = {
    'reduce_dim__n_components': n_components,
    'classify__n_neighbors': n_neighbors
}

grid = GridSearchCV(pipe, cv=2, n_jobs=1, param_grid=param_grid, scoring='accuracy')

Realizando o treinamento

In [124]:
grid.fit(X_train, y_train)

Exibindo os resultados

In [125]:
print(grid.cv_results_)

{'mean_fit_time': array([0.00200248, 0.00199485, 0.00149262, 0.00199449, 0.00199497,
       0.00199294, 0.00199401, 0.00249374, 0.00149632, 0.00199461,
       0.00201249, 0.00249124, 0.0029918 , 0.00299144, 0.00249374,
       0.0014962 , 0.00199521, 0.00249326, 0.00299168, 0.00299108,
       0.00249445, 0.00249279, 0.00199473, 0.00199544, 0.0022465 ,
       0.00249326, 0.0019958 , 0.00199485, 0.00249553, 0.00199521,
       0.00299156, 0.00199461, 0.00197864, 0.00201118, 0.00149632,
       0.00199234, 0.00149345, 0.00300813, 0.00149643, 0.00299358,
       0.00249255, 0.00199342, 0.00275505, 0.00199306, 0.00200152,
       0.00149453, 0.00199604, 0.0024935 , 0.00199521, 0.00249588,
       0.00199473, 0.00149584, 0.00349116, 0.00199461, 0.00299191,
       0.00249374, 0.00249517, 0.00249422, 0.00149584, 0.00149632,
       0.00149632, 0.00199497, 0.00201309, 0.00278461, 0.00251043,
       0.00201452, 0.00200081, 0.00199509, 0.00250697, 0.00199914,
       0.0029912 , 0.00201321, 0.00199556, 0

Exibindo as médias do conjunto de dados de teste

In [126]:
grid.cv_results_['mean_test_score']

array([0.8769418 , 0.89234098, 0.91869542, 0.93408494, 0.94509815,
       0.94506917, 0.94507883, 0.94289551, 0.94947446, 0.95165778,
       0.9494648 , 0.94506917, 0.94726215, 0.94725249, 0.95165778,
       0.95165778, 0.94945514, 0.94945514, 0.94945514, 0.94945514,
       0.94945514, 0.94945514, 0.94945514, 0.94945514, 0.94945514,
       0.94945514, 0.94945514, 0.94945514, 0.94945514, 0.94945514,
       0.85275137, 0.89670763, 0.92967965, 0.92966999, 0.94726215,
       0.9384709 , 0.93407528, 0.94066388, 0.94286653, 0.94726215,
       0.95166744, 0.9494648 , 0.9494648 , 0.94727181, 0.9494648 ,
       0.94727181, 0.9494648 , 0.9494648 , 0.9494648 , 0.94727181,
       0.94727181, 0.94727181, 0.94727181, 0.94727181, 0.94727181,
       0.94727181, 0.94727181, 0.94727181, 0.94727181, 0.94727181,
       0.88354007, 0.91648311, 0.93849988, 0.94289551, 0.96045869,
       0.94505951, 0.94726215, 0.95386042, 0.95387008, 0.95606307,
       0.95606307, 0.95166744, 0.95824639, 0.95166744, 0.95387

Exibindo os melhores parâmetros

In [127]:
print(grid.best_score_)
print(grid.best_params_)

0.9604586907798129
{'classify__n_neighbors': 3, 'reduce_dim__n_components': 5}


Realizando o cálculo com base na melhor estimativa

In [128]:
clf = grid.best_estimator_

Verificando a acurácia dos dados de teste

In [129]:
y_test_pred = clf.predict(X_test)
accuracy_score(y_test, y_test_pred)

0.9473684210526315