In [2]:
import pandas as pd

In [3]:
# Load the CKD dataset; the path is relative to the current working directory.
ds = pd.read_csv(filepath_or_buffer="CKD.csv")

In [4]:
# Count the rows per target label to see whether the classes are balanced.
ds.loc[:, "classification"].value_counts()

classification
yes    249
no     150
Name: count, dtype: int64

In [5]:
# One-hot encode the categorical columns. drop_first=True drops the first
# level of each encoded column, so a two-level column becomes one indicator.
ds = pd.get_dummies(data=ds, drop_first=True)

In [6]:
# ds.columns  # uncomment to list the column names after one-hot encoding

In [14]:
# Input fields: the measurement columns plus the indicator columns
# (presumably the ones created by the get_dummies step).
feature_columns = [
    'age', 'bp', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hrmo', 'pcv',
    'wc', 'rc',
    'sg_b', 'sg_c', 'sg_d', 'sg_e',
    'rbc_normal', 'pc_normal', 'pcc_present', 'ba_present',
    'htn_yes', 'dm_yes', 'cad_yes', 'appet_yes', 'pe_yes', 'ane_yes',
]
independent = ds[feature_columns]

# Output field: the double brackets keep it a one-column DataFrame rather
# than a Series.
target_columns = ['classification_yes']
dependent = ds[target_columns]

In [15]:
# Split the data: 70% of the rows for training, 30% held out for testing.
from sklearn.model_selection import train_test_split

# Use an explicit integer seed so the split is reproducible. A boolean such
# as False is only accepted because bool subclasses int and would be read as
# seed 0, so random_state=0 states the same split unambiguously.
x_train, x_test, y_train, y_test = train_test_split(
    independent,
    dependent,
    test_size=0.30,
    random_state=0,
)

In [16]:
# Standardize the features (z-score normalization): each column is rescaled
# to (x - mean) / standard deviation.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# The mean and standard deviation are learned from the training rows only ...
x_train = sc.fit_transform(X=x_train)
# ... and the same fitted statistics are then applied to the test rows.
x_test = sc.transform(X=x_test)

In [19]:
# Model creation: grid search over decision-tree hyper-parameters.
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# 3 criteria x 2 max_features settings x 1 seed = 6 candidate settings.
parm_grid = dict(
    criterion=['gini', 'entropy', 'log_loss'],
    max_features=['sqrt', 'log2'],
    random_state=[0],
)

# refit=True retrains the best candidate on the whole training set, so
# Classifier.predict / predict_proba can be called directly afterwards.
Classifier = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=parm_grid,
    refit=True,
    verbose=3,
    n_jobs=-1,
)

Classifier.fit(x_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [20]:
# Keep the cross-validation results recorded by the fitted grid search.
result = Classifier.cv_results_

In [21]:
# Predict the labels of the held-out test rows with the fitted grid search.
y_pred = Classifier.predict(x_test)

# Tabulate the cross-validation results: one column per dictionary key.
table = pd.DataFrame.from_dict(result, orient="columns")
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.015555,0.016339,0.012558,0.010434,gini,sqrt,0,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.982143,0.928571,0.892857,0.910714,0.963636,0.935584,0.033009,4
1,0.011682,0.014122,0.004609,0.00136,gini,log2,0,"{'criterion': 'gini', 'max_features': 'log2', ...",0.982143,0.946429,1.0,0.946429,0.981818,0.971364,0.021396,1
2,0.003967,0.00021,0.004328,0.000498,entropy,sqrt,0,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.910714,0.928571,0.892857,0.910714,1.0,0.928571,0.037457,5
3,0.003525,0.00038,0.003774,0.000253,entropy,log2,0,"{'criterion': 'entropy', 'max_features': 'log2...",0.982143,0.928571,1.0,0.964286,0.981818,0.971364,0.024194,1
4,0.003678,0.000269,0.004142,0.000957,log_loss,sqrt,0,"{'criterion': 'log_loss', 'max_features': 'sqr...",0.910714,0.928571,0.892857,0.910714,1.0,0.928571,0.037457,5
5,0.004208,0.000962,0.003526,0.000156,log_loss,log2,0,"{'criterion': 'log_loss', 'max_features': 'log...",0.982143,0.928571,1.0,0.964286,0.981818,0.971364,0.024194,1


In [22]:
# Confusion matrix on the test set: rows are the true classes, columns are
# the predicted classes.
from sklearn.metrics import confusion_matrix

matric = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(matric)

[[45  0]
 [ 4 71]]


In [23]:
# Text summary of precision, recall, F1 and support for the test set.
from sklearn.metrics import classification_report

report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

       False       0.92      1.00      0.96        45
        True       1.00      0.95      0.97        75

    accuracy                           0.97       120
   macro avg       0.96      0.97      0.97       120
weighted avg       0.97      0.97      0.97       120



In [24]:
# Overall model quality on the test set as a single number: the per-class F1
# scores averaged with weights proportional to each class's support.
from sklearn.metrics import f1_score

f1_weighted = f1_score(y_test, y_pred, average='weighted')
# Backward-compatible alias: the score used to be stored as f1_macro even
# though the averaging mode is 'weighted', not 'macro'.
f1_macro = f1_weighted
print("The best value for Parameter {}:".format(Classifier.best_params_), f1_weighted)

The best value for Parameter {'criterion': 'gini', 'max_features': 'log2', 'random_state': 0}: 0.9669192655202565


In [25]:
# Area under the receiver operating characteristic curve on the test set:
# how well the predicted probabilities separate the two classes.
from sklearn.metrics import roc_auc_score

# [:, 1] keeps every row and the column at index 1 (the second column), i.e.
# the probability of the second entry of Classifier.classes_ -- presumably
# the positive class (classification_yes is True); verify if labels change.
positive_scores = Classifier.predict_proba(x_test)[:, 1]
roc_auc = roc_auc_score(y_true=y_test, y_score=positive_scores)
print("The best value for Parameter {}:".format(Classifier.best_params_), roc_auc)

The best value for Parameter {'criterion': 'gini', 'max_features': 'log2', 'random_state': 0}: 0.9733333333333334
