In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

import warnings
warnings.filterwarnings('ignore')

In [4]:
# Load scikit-learn's bundled breast-cancer dataset; the returned object
# exposes .data, .target and .DESCR, which the following cells inspect.
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()


In [6]:
# Print the dataset's bundled description (instance count, attribute list,
# and how the 30 features were derived).
print(cancer.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [7]:
# Peek at the raw feature matrix (one row per sample); the array repr
# elides the middle rows and columns with "...".
cancer.data

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [8]:
# Integer class label for each sample (the values shown are 0 and 1).
cancer.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [9]:
# Feature matrix and label vector, following the usual scikit-learn X / y naming.
X, y = cancer.data, cancer.target

In [10]:
# Hold out 10% of the samples as a final test set.
# random_state pins the shuffle so the split (and every score derived from it)
# is identical on each Restart & Run All. stratify=y keeps the class ratio of
# the small hold-out set the same as in the full data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

In [11]:
# Baseline model: k-nearest-neighbours classifier with k = 5.

model = KNeighborsClassifier(n_neighbors=5)

In [12]:
from sklearn.model_selection import cross_val_score

# Cross-validate the baseline model on the training split only, using the
# library's default number of folds; one score per fold is returned.
scores = cross_val_score(estimator=model, X=X_train, y=y_train)
scores

array([0.94174757, 0.90291262, 0.94117647, 0.90196078, 0.99019608])

In [13]:
# Average of the per-fold scores: a single summary figure for the model.
scores.mean()

0.9355987055016183

In [14]:
# Same evaluation with 3 folds instead of the default, to see how the
# estimate moves with the fold count.
scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=3)
scores

array([0.94736842, 0.91812865, 0.92352941])

In [15]:
# Mean score across the 3 folds.
scores.mean()

0.9296754959293659

-----

In [16]:
from sklearn.model_selection import cross_validate

In [18]:
# cross_validate reports several metrics at once and adds fit/score timings.
# The result is a dict of per-fold arrays keyed by 'test_<metric>'.
scores = cross_validate(
    estimator=model,
    X=X_train,
    y=y_train,
    cv=3,
    scoring=['accuracy', 'average_precision'],
)
scores

{'fit_time': array([0.00053406, 0.0002811 , 0.000139  ]),
 'score_time': array([0.01371002, 0.01973104, 0.00499201]),
 'test_accuracy': array([0.94736842, 0.91812865, 0.92352941]),
 'test_average_precision': array([0.95969421, 0.95476734, 0.97859553])}

In [19]:
# Tabulate the per-fold results: one row per fold, one column per timing/metric.
pd.DataFrame(data=scores)

Unnamed: 0,fit_time,score_time,test_accuracy,test_average_precision
0,0.000534,0.01371,0.947368,0.959694
1,0.000281,0.019731,0.918129,0.954767
2,0.000139,0.004992,0.923529,0.978596


----------

In [20]:
from sklearn.model_selection import GridSearchCV

In [21]:
# Candidate neighbourhood sizes for the search: the odd values 1, 3, 5, 7, 9.
param_grid = {'n_neighbors': list(range(1, 10, 2))}

In [22]:
# Exhaustive search over param_grid, scoring each candidate with 5-fold CV.
# NOTE(review): the features are used unscaled although their magnitudes differ
# by orders of magnitude (see the cancer.data output); k-NN is distance-based,
# so consider standardising inside a Pipeline.
grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=5,
)

In [23]:
# Run the search on the training split only; the test split stays untouched
# until the final evaluation.
grid_search.fit(X_train , y_train)

In [24]:
# Evaluate the fitted search on the held-out 10% test split, which took no
# part in the cross-validation.
grid_search.score(X_test,y_test)

0.8771929824561403

In [25]:
# Parameter setting the search selected as best.
grid_search.best_params_

{'n_neighbors': 3}

In [26]:
# Full search log as a table: one row per candidate, with timings, per-split
# scores, mean/std test score and rank.
pd.DataFrame(data=grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000348,9.9e-05,0.007406,0.004457,1,{'n_neighbors': 1},0.902913,0.902913,0.931373,0.892157,0.970588,0.919989,0.028452,5
1,0.000144,1e-05,0.004623,0.00337,3,{'n_neighbors': 3},0.92233,0.92233,0.931373,0.911765,0.990196,0.935599,0.027996,1
2,0.000129,2.4e-05,0.002763,0.000239,5,{'n_neighbors': 5},0.941748,0.902913,0.941176,0.901961,0.990196,0.935599,0.032403,1
3,0.000147,1.7e-05,0.004343,0.0034,7,{'n_neighbors': 7},0.941748,0.902913,0.941176,0.892157,0.990196,0.933638,0.034601,4
4,0.000124,1.8e-05,0.002391,8.9e-05,9,{'n_neighbors': 9},0.951456,0.902913,0.941176,0.892157,0.990196,0.93558,0.035268,3
