In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Column labels for the header-less CSV, in file order
column_names = [
    'Sample Code Number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

# Read the raw file into a pandas DataFrame and attach the labels
data = pd.read_csv('breast_cancer.csv', header=None)
data.columns = column_names
data.head(5)

Unnamed: 0,Sample Code Number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [3]:
# Light pre-processing of the raw frame.
# NOTE: this cell consumes the freshly loaded frame; run it once per load
# (a second run fails on the already-dropped ID column).

# Parse 'Bare Nuclei' as numbers; the '?' placeholders used for missing
# values (and anything else non-numeric) become NaN.
bare_nuclei = pd.to_numeric(data['Bare Nuclei'], errors='coerce')

# Drop the ID column and the rows without a usable 'Bare Nuclei' value.
# The explicit copy makes the assignments below act on an independent frame
# instead of a filtered view (avoids SettingWithCopyWarning).
data = data.drop(columns=['Sample Code Number']).loc[bare_nuclei.notna()].copy()

# Store the parsed values so the column is integer-typed, not strings
data['Bare Nuclei'] = bare_nuclei.dropna().astype(int)

# Re-encode the target: original label 2 -> 0, any other label -> 1
data['Class'] = np.where(data['Class'] == 2, 0, 1)

# Class distribution
data['Class'].value_counts()

0    444
1    239
Name: Class, dtype: int64

In [4]:
# Separate the target column from the feature columns
y = data['Class']
X = data.drop(columns=['Class'])

# Split into train and test sets; no test_size is given, so the library
# default applies, and the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
# Base estimator for the hyper-parameter search: a k-nearest-neighbours
# classifier created with its default settings (the search cells below
# override n_neighbors, leaf_size and algorithm).
knn = KNeighborsClassifier()

# Search space for Grid Search: every combination of these values is tried
param_grid = dict(
    n_neighbors=np.arange(1, 20),   # k = 1 .. 19
    leaf_size=[1, 3, 5, 7, 10],
    algorithm=['auto', 'kd_tree'],
)

# Search space for Random Search: values are sampled from these distributions.
# `stats` is imported explicitly (from scipy import stats) rather than reached
# through `sp.stats`, which only resolves if the submodule was already loaded.
# randint(low, high) draws integers from [low, high), i.e. the upper bound is
# exclusive: n_neighbors covers 1..19 and leaf_size covers 1..9.
param_dist = {'n_neighbors': stats.randint(1, 20),
              'leaf_size': stats.randint(1, 10),
              'algorithm': ['auto', 'kd_tree']}

In [6]:
# OPTION 1: Grid Search over param_grid
model = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)

# OPTION 2: Random Search over param_dist (uncomment to use it instead)
#model = RandomizedSearchCV(knn, param_dist, random_state=0, cv=5)

# Train: KNN over the search space selected above, with cross-validation (cv=5)
model.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'algorithm': ['auto', 'kd_tree'],
                         'leaf_size': [1, 3, 5, 7, 10],
                         'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [7]:
# Report the winning configuration and its mean cross-validated score
print(f"Mejores parametros: {model.best_params_!s}")
print(f"Mejor Score: {model.best_score_!s}\n")

# Full per-candidate cross-validation results as a table
scores = pd.DataFrame(model.cv_results_)
scores

Mejores parametros: {'algorithm': 'auto', 'leaf_size': 1, 'n_neighbors': 16}
Mejor Score: 0.97265625



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_leaf_size,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004575,0.000856,0.011300,0.000735,auto,1,1,"{'algorithm': 'auto', 'leaf_size': 1, 'n_neigh...",0.971154,0.950980,0.960784,0.960784,0.931373,0.955078,0.013445,171
1,0.004065,0.000082,0.011848,0.000511,auto,1,2,"{'algorithm': 'auto', 'leaf_size': 1, 'n_neigh...",0.961538,0.931373,0.931373,0.960784,0.931373,0.943359,0.014611,185
2,0.004264,0.000179,0.012977,0.000649,auto,1,3,"{'algorithm': 'auto', 'leaf_size': 1, 'n_neigh...",0.961538,0.960784,0.980392,0.970588,0.950980,0.964844,0.009926,111
3,0.004069,0.000085,0.012323,0.000671,auto,1,4,"{'algorithm': 'auto', 'leaf_size': 1, 'n_neigh...",0.961538,0.960784,0.970588,0.980392,0.950980,0.964844,0.009926,111
4,0.004043,0.000069,0.012961,0.001139,auto,1,5,"{'algorithm': 'auto', 'leaf_size': 1, 'n_neigh...",0.971154,0.950980,0.970588,0.990196,0.950980,0.966797,0.014679,83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,0.004034,0.000134,0.011543,0.000236,kd_tree,10,15,"{'algorithm': 'kd_tree', 'leaf_size': 10, 'n_n...",0.980769,0.960784,0.980392,0.990196,0.950980,0.972656,0.014431,1
186,0.003961,0.000047,0.010753,0.000341,kd_tree,10,16,"{'algorithm': 'kd_tree', 'leaf_size': 10, 'n_n...",0.980769,0.960784,0.980392,0.990196,0.950980,0.972656,0.014431,1
187,0.003932,0.000137,0.011179,0.001162,kd_tree,10,17,"{'algorithm': 'kd_tree', 'leaf_size': 10, 'n_n...",0.980769,0.960784,0.980392,0.990196,0.950980,0.972656,0.014431,1
188,0.004719,0.000767,0.011366,0.000442,kd_tree,10,18,"{'algorithm': 'kd_tree', 'leaf_size': 10, 'n_n...",0.980769,0.950980,0.970588,0.990196,0.950980,0.968750,0.015731,65


In [8]:
# Predict labels for the held-out test set with the fitted search object
prediction = model.predict(X=X_test)

In [9]:
# Compare test-set predictions with the true labels:
# first the confusion matrix, then the per-class classification report
cm = confusion_matrix(y_true=y_test, y_pred=prediction)
report = classification_report(y_true=y_test, y_pred=prediction)

print(cm)
print(report)

[[102   1]
 [  8  60]]
              precision    recall  f1-score   support

           0       0.93      0.99      0.96       103
           1       0.98      0.88      0.93        68

    accuracy                           0.95       171
   macro avg       0.96      0.94      0.94       171
weighted avg       0.95      0.95      0.95       171

