In [1]:
# Hyper-parameter tuning
# Dataset: wisc_bc_data.csv
# Location: https://mitu.co.in/dataset

In [2]:
import pandas as pd

### Import the dataset

In [4]:
df = pd.read_csv('wisc_bc_data.csv')

In [5]:
df.shape

(569, 32)

In [6]:
df.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

### Input and output data

In [8]:
x = df.drop(['id','diagnosis'], axis = 1)
y = df['diagnosis']

### Feature scaling

In [10]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_scaled = scaler.fit_transform(x)

### Cross Validation

In [13]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x_scaled, y, random_state=0, test_size=0.25)

In [15]:
x_train.shape

(426, 30)

In [16]:
x_test.shape

(143, 30)

### Create KNN Model

In [17]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=20)
clf.fit(x_train, y_train)

KNeighborsClassifier(n_neighbors=20)

In [18]:
y_pred = clf.predict(x_test)

In [19]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.951048951048951

### Create the parameter grid

In [20]:
params = {
    'n_neighbors': [5,7,10,12,15,18,20],
    'weights': ['uniform','distance'],
    'algorithm': ['auto','kd_tree','ball_tree','brute']
}

### Import the Grid search algorithm

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
grid = GridSearchCV(estimator= KNeighborsClassifier(),
                   param_grid=params,
                   scoring='accuracy',
                   cv = 5)

In [25]:
grid.fit(x_scaled,y)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'algorithm': ['auto', 'kd_tree', 'ball_tree', 'brute'],
                         'n_neighbors': [5, 7, 10, 12, 15, 18, 20],
                         'weights': ['uniform', 'distance']},
             scoring='accuracy')

In [26]:
grid.best_estimator_

KNeighborsClassifier(n_neighbors=10, weights='distance')

In [27]:
y_pred = grid.predict(x_test)

In [28]:
accuracy_score(y_test, y_pred)

1.0

In [29]:
grid.fit(x_train,y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'algorithm': ['auto', 'kd_tree', 'ball_tree', 'brute'],
                         'n_neighbors': [5, 7, 10, 12, 15, 18, 20],
                         'weights': ['uniform', 'distance']},
             scoring='accuracy')

In [30]:
grid.best_estimator_

KNeighborsClassifier(n_neighbors=10)

In [31]:
y_pred = grid.predict(x_test)
accuracy_score(y_test, y_pred)

0.972027972027972

### Import the Randomized search algorithm

In [33]:
from sklearn.model_selection import RandomizedSearchCV

In [35]:
randcv = RandomizedSearchCV(estimator= KNeighborsClassifier(),
                   param_distributions=params,
                   scoring='accuracy',
                   cv = 5)

In [36]:
randcv.fit(x_train, y_train)

RandomizedSearchCV(cv=5, estimator=KNeighborsClassifier(),
                   param_distributions={'algorithm': ['auto', 'kd_tree',
                                                      'ball_tree', 'brute'],
                                        'n_neighbors': [5, 7, 10, 12, 15, 18,
                                                        20],
                                        'weights': ['uniform', 'distance']},
                   scoring='accuracy')

In [37]:
randcv.best_estimator_

KNeighborsClassifier(algorithm='brute')

In [38]:
y_pred = randcv.predict(x_test)
accuracy_score(y_test, y_pred)

0.972027972027972