In [3]:
import pandas as pd

In [4]:
# Read the CKD dataset (a CSV file resolved relative to the working directory)
# into a DataFrame.
ds = pd.read_csv(filepath_or_buffer="CKD.csv")

In [5]:
# Count the rows per target label to see whether the classes are balanced
# or imbalanced.
class_counts = ds["classification"].value_counts()
class_counts

classification
yes    249
no     150
Name: count, dtype: int64

In [6]:
# One-hot encode the non-numeric columns. drop_first=True drops the first
# level of each encoded column, so k categories become k-1 indicator columns
# (e.g. `classification` becomes the single column `classification_yes`).
ds = pd.get_dummies(data=ds, drop_first=True)

In [7]:
#ds.columns

In [8]:
# Input (predictor) columns, in the order they are fed to the model:
# numeric measurements first, then the one-hot indicator columns.
feature_columns = [
    'age', 'bp', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hrmo', 'pcv',
    'wc', 'rc',
    'sg_b', 'sg_c', 'sg_d', 'sg_e',
    'rbc_normal', 'pc_normal', 'pcc_present', 'ba_present',
    'htn_yes', 'dm_yes', 'cad_yes', 'appet_yes', 'pe_yes', 'ane_yes',
]
independent = ds[feature_columns]

# Output (target) column, kept as a single-column DataFrame.
target_columns = ['classification_yes']
dependent = ds[target_columns]

In [9]:
# Hold out 30% of the rows as the test set; the remaining 70% is used for training.
# random_state is an explicit integer seed so the shuffle is reproducible.
# A boolean such as False is coerced to this same seed (0) but reads as
# "no seeding", so the integer is spelled out.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    independent,
    dependent,
    test_size=0.30,
    random_state=0,
)

In [10]:
# Z-score standardisation: every feature is rescaled to (x - mean) / std,
# i.e. zero mean and unit variance.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Learn the per-feature mean and standard deviation from the training rows only ...
sc.fit(x_train)
# ... then apply those same statistics to both splits.
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Hyper-parameter grid for k-nearest neighbours:
# 30 * 2 * 2 * 4 = 480 candidate combinations.
parm_grid = {
    'n_neighbors': range(1, 31),  # k = 1 .. 30
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}

# Exhaustive grid search using the default cross-validation and scorer.
# refit=True retrains the best candidate on the full training set so that
# Classifier.predict / predict_proba use it; n_jobs=-1 uses all CPU cores.
Classifier = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=parm_grid,
    refit=True,
    verbose=3,
    n_jobs=-1,
)

# y_train is a single-column DataFrame; flatten it to shape (n_samples,)
# so the estimator does not emit a DataConversionWarning
# ("A column-vector y was passed when a 1d array was expected").
Classifier.fit(x_train, y_train.to_numpy().ravel())

Fitting 5 folds for each of 480 candidates, totalling 2400 fits


  return self._fit(X, y)


In [13]:
# Per-candidate cross-validation results from the grid search (fit/score
# times, parameter values, per-split test scores, mean/std test score, rank).
result = Classifier.cv_results_

In [14]:
# Tabulate the grid-search results: one row per parameter combination.
table = pd.DataFrame(result)
# Predict the test-set labels with the refitted best estimator.
y_pred = Classifier.predict(x_test)
# Display the results table as the cell output.
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_metric,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.011907,0.005569,0.008440,0.000691,auto,euclidean,1,uniform,"{'algorithm': 'auto', 'metric': 'euclidean', '...",0.982143,1.000000,0.964286,0.982143,1.000000,0.985714,0.013363,1
1,0.014384,0.006366,0.004890,0.001008,auto,euclidean,1,distance,"{'algorithm': 'auto', 'metric': 'euclidean', '...",0.982143,1.000000,0.964286,0.982143,1.000000,0.985714,0.013363,1
2,0.006922,0.008184,0.007984,0.000068,auto,euclidean,2,uniform,"{'algorithm': 'auto', 'metric': 'euclidean', '...",0.964286,0.964286,0.928571,0.964286,0.981818,0.960649,0.017417,77
3,0.011700,0.012689,0.004162,0.000129,auto,euclidean,2,distance,"{'algorithm': 'auto', 'metric': 'euclidean', '...",0.982143,1.000000,0.964286,0.982143,1.000000,0.985714,0.013363,1
4,0.007728,0.009880,0.008065,0.000300,auto,euclidean,3,uniform,"{'algorithm': 'auto', 'metric': 'euclidean', '...",0.964286,0.982143,0.928571,0.964286,1.000000,0.967857,0.023690,49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,0.014303,0.017505,0.004046,0.000812,brute,manhattan,28,distance,"{'algorithm': 'brute', 'metric': 'manhattan', ...",0.875000,0.928571,0.910714,0.928571,0.945455,0.917662,0.023995,309
476,0.002892,0.001033,0.003591,0.000432,brute,manhattan,29,uniform,"{'algorithm': 'brute', 'metric': 'manhattan', ...",0.839286,0.910714,0.892857,0.928571,0.945455,0.903377,0.036544,381
477,0.002540,0.000434,0.004425,0.001485,brute,manhattan,29,distance,"{'algorithm': 'brute', 'metric': 'manhattan', ...",0.875000,0.928571,0.892857,0.928571,0.945455,0.914091,0.026008,337
478,0.004244,0.001542,0.004332,0.000516,brute,manhattan,30,uniform,"{'algorithm': 'brute', 'metric': 'manhattan', ...",0.839286,0.910714,0.857143,0.910714,0.945455,0.892662,0.038857,409


In [15]:
# Confusion matrix of the test-set predictions
# (rows: true class, columns: predicted class).
from sklearn.metrics import confusion_matrix
matric = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(matric)

[[45  0]
 [ 2 73]]


In [16]:
# Text report with per-class precision, recall, F1-score and support
# for the test-set predictions.
from sklearn.metrics import classification_report
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

       False       0.96      1.00      0.98        45
        True       1.00      0.97      0.99        75

    accuracy                           0.98       120
   macro avg       0.98      0.99      0.98       120
weighted avg       0.98      0.98      0.98       120



In [17]:
# Overall model quality on the test set: F1-score averaged across classes,
# each class weighted by its support (average='weighted').
from sklearn.metrics import f1_score
f1_weighted = f1_score(y_test, y_pred, average='weighted')
# Legacy name kept so any cell that still reads `f1_macro` keeps working.
# NOTE: the value is the WEIGHTED average, not the macro average.
f1_macro = f1_weighted
print("The best value for Parameter {}:".format(Classifier.best_params_), f1_weighted)

The best value for Parameter {'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}: 0.9834018801410106


In [18]:
# Area under the ROC (receiver operating characteristic) curve: how well
# the predicted scores separate class 0 from class 1.
from sklearn.metrics import roc_auc_score
# predict_proba returns one column per class; [:, 1] keeps all rows and takes
# the column at index 1 (the second class in Classifier.classes_).
positive_scores = Classifier.predict_proba(x_test)[:, 1]
roc_auc = roc_auc_score(y_true=y_test, y_score=positive_scores)
print("The best value for Parameter {}:".format(Classifier.best_params_), roc_auc)

The best value for Parameter {'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}: 0.9866666666666667
