# import libraries and Load Data

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
data = pd.read_csv('breast-cancer.csv')
data

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [3]:
data.drop('id', axis=1, inplace=True)

In [4]:
data.isnull().sum()

diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    object 
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  5

In [6]:
y = data['diagnosis']
X = data.iloc[:,1:]

# Modeling with orginal data

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=30)

In [8]:
knn = KNeighborsClassifier()
neighbors = range(3,17,2)
params = {
    'n_neighbors' : neighbors,
    'weights' : ['uniform','distance']
}

cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=30)
knn_grid = GridSearchCV(knn, param_grid=params, scoring='accuracy', cv=cv, verbose=1, return_train_score=True)

In [9]:
knn_grid.fit(X_train, y_train)

Fitting 5 folds for each of 14 candidates, totalling 70 fits


In [10]:
knn_grid.best_params_

{'n_neighbors': 9, 'weights': 'uniform'}

In [11]:
knn_grid.score(X_train, y_train)

0.9472361809045227

In [12]:
knn_grid.score(X_test, y_test)

0.9239766081871345

In [13]:
y_pred = knn_grid.predict(X_test)
print(f'\nconfusion_matrix:\n{metrics.confusion_matrix(y_pred, y_test)}')
print(f'\nclassification_report:\n{metrics.classification_report(y_pred, y_test)}')


confusion_matrix:
[[106  10]
 [  3  52]]

classification_report:
              precision    recall  f1-score   support

           B       0.97      0.91      0.94       116
           M       0.84      0.95      0.89        55

    accuracy                           0.92       171
   macro avg       0.91      0.93      0.92       171
weighted avg       0.93      0.92      0.93       171



In [14]:
dt = DecisionTreeClassifier()
depth = range(3,10)
features = range(1,5)

params = {
    'criterion' : ['gini','entropy'],
    'max_depth' : depth,
    'max_features' : features,
    'max_leaf_nodes' : range(8,15),
    'random_state' : [30]
}

cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=30)
dt_grid = GridSearchCV(dt, param_grid=params, scoring='accuracy', cv=cv, verbose=1, return_train_score=True)

In [15]:
dt_grid.fit(X_train, y_train)

Fitting 5 folds for each of 392 candidates, totalling 1960 fits


In [16]:
dt_grid.best_params_

{'criterion': 'entropy',
 'max_depth': 5,
 'max_features': 4,
 'max_leaf_nodes': 12,
 'random_state': 30}

In [17]:
dt_grid.score(X_train, y_train)

0.9773869346733668

In [18]:
dt_grid.score(X_test, y_test)

0.9298245614035088

In [19]:
y_pred = dt_grid.predict(X_test)
print(f'\nconfusion_matrix:\n{metrics.confusion_matrix(y_pred, y_test)}')
print(f'\nclassification_report:\n{metrics.classification_report(y_pred, y_test)}')


confusion_matrix:
[[106   9]
 [  3  53]]

classification_report:
              precision    recall  f1-score   support

           B       0.97      0.92      0.95       115
           M       0.85      0.95      0.90        56

    accuracy                           0.93       171
   macro avg       0.91      0.93      0.92       171
weighted avg       0.93      0.93      0.93       171



# Modeling with scaled data

In [20]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.30,random_state=30)

In [22]:
knn = KNeighborsClassifier()
neighbors = range(3,17,2)
params = {
    'n_neighbors' : neighbors,
    'weights' : ['uniform','distance']
}

cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=30)
knn_grid = GridSearchCV(knn, param_grid=params, scoring='accuracy', cv=cv, verbose=1, return_train_score=True)

In [23]:
knn_grid.fit(X_train, y_train)

Fitting 5 folds for each of 14 candidates, totalling 70 fits


In [24]:
knn_grid.best_params_

{'n_neighbors': 5, 'weights': 'uniform'}

In [25]:
knn_grid.score(X_train, y_train)

0.9723618090452262

In [26]:
knn_grid.score(X_test, y_test)

0.9707602339181286

In [27]:
y_pred = knn_grid.predict(X_test)
print(f'\nconfusion_matrix:\n{metrics.confusion_matrix(y_pred, y_test)}')
print(f'\nclassification_report:\n{metrics.classification_report(y_pred, y_test)}')


confusion_matrix:
[[108   4]
 [  1  58]]

classification_report:
              precision    recall  f1-score   support

           B       0.99      0.96      0.98       112
           M       0.94      0.98      0.96        59

    accuracy                           0.97       171
   macro avg       0.96      0.97      0.97       171
weighted avg       0.97      0.97      0.97       171



Decision tree Modeling

In [28]:
dt = DecisionTreeClassifier()
depth = range(3,10)
features = range(1,5)

params = {
    'criterion' : ['gini','entropy'],
    'max_depth' : depth,
    'max_features' : features,
    'max_leaf_nodes' : range(8,15),
    'random_state' : [30]
}

cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=30)
dt_grid = GridSearchCV(dt, param_grid=params, scoring='accuracy', cv=cv, verbose=1, return_train_score=True)

In [29]:
dt_grid.fit(X_train, y_train)

Fitting 5 folds for each of 392 candidates, totalling 1960 fits


In [30]:
dt_grid.best_params_

{'criterion': 'entropy',
 'max_depth': 5,
 'max_features': 4,
 'max_leaf_nodes': 12,
 'random_state': 30}

In [31]:
dt_grid.score(X_train, y_train)

0.9773869346733668

In [32]:
dt_grid.score(X_test, y_test)

0.9298245614035088

In [33]:
y_pred = dt_grid.predict(X_test)
print(f'\nconfusion_matrix:\n{metrics.confusion_matrix(y_pred, y_test)}')
print(f'\nclassification_report:\n{metrics.classification_report(y_pred, y_test)}')


confusion_matrix:
[[106   9]
 [  3  53]]

classification_report:
              precision    recall  f1-score   support

           B       0.97      0.92      0.95       115
           M       0.85      0.95      0.90        56

    accuracy                           0.93       171
   macro avg       0.91      0.93      0.92       171
weighted avg       0.93      0.93      0.93       171

