In [1]:
# data: wisc_bs_data.csv

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('wisc_bc_data.csv')
df

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [4]:
df.shape

(569, 32)

In [6]:
df.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

In [9]:
# separate input and output

x = df.drop(['id', 'diagnosis'], axis = 1)

y = df['diagnosis']

#### feature scaling

In [10]:
from sklearn.preprocessing import MinMaxScaler
sca = MinMaxScaler()
x_scaled = sca.fit_transform(x)

#### Cross Validation

In [11]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y,
                                                    random_state=0,
                                                    test_size=0.25)

#### create the KNN model

In [13]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors= 20)
knn.fit(x_train, y_train)

In [14]:
y_pred = knn.predict(x_test)

In [15]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.951048951048951

### create the parameter grid

In [16]:
params = {
    'n_neighbors': [5, 7, 10, 12, 14, 15, 17],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

### Grid Search

In [17]:
from sklearn.model_selection import GridSearchCV

In [18]:
grid = GridSearchCV(estimator= KNeighborsClassifier(),
                   param_grid= params,
                   scoring= 'accuracy',
                   cv= 5)

In [19]:
grid.fit(x_scaled, y)

In [20]:
grid.best_estimator_

In [21]:
y_pred = grid.predict(x_test)

In [22]:
accuracy_score(y_test, y_pred)

1.0

In [23]:
grid.fit(x_train, y_train)

In [24]:
y_pred = grid.predict(x_test)
accuracy_score(y_test, y_pred)

0.972027972027972

### Randomized grid search

In [25]:
from sklearn.model_selection import RandomizedSearchCV

In [27]:
grid = RandomizedSearchCV(estimator= KNeighborsClassifier(),
                         param_distributions= params,
                         scoring= 'accuracy',
                         cv = 5,
                         n_iter= 10)

In [28]:
grid.fit(x_train, y_train)

In [29]:
y_pred = grid.predict(x_test)

In [30]:
accuracy_score(y_test, y_pred)

0.972027972027972

In [31]:
grid.best_estimator_

### Save the object

In [32]:
# serialize the ML object

In [33]:
import joblib

In [34]:
joblib.dump(grid, 'randomGrid.model')

['randomGrid.model']