In [1]:
import pandas as pd

In [2]:
# Load the dataset from CKD.csv (path is relative to the notebook's working directory).
ds = pd.read_csv("CKD.csv")

In [3]:
# Check whether the target classes are balanced by counting the rows per label.
ds.loc[:, "classification"].value_counts()

classification
yes    249
no     150
Name: count, dtype: int64

In [4]:
# One-hot encode the categorical columns. drop_first=True drops the first level
# of each encoded column, so a two-level column becomes a single indicator
# (for example, "classification" becomes "classification_yes").
ds = pd.get_dummies(data=ds, drop_first=True)

In [5]:
# ds.columns  # uncomment to list the column names produced by get_dummies

In [6]:
# Input features: the numeric measurements followed by the one-hot encoded
# categorical indicators created by get_dummies.
independent = ds.loc[:, [
    'age', 'bp', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hrmo', 'pcv',
    'wc', 'rc',
    'sg_b', 'sg_c', 'sg_d', 'sg_e',
    'rbc_normal', 'pc_normal', 'pcc_present', 'ba_present',
    'htn_yes', 'dm_yes', 'cad_yes', 'appet_yes', 'pe_yes', 'ane_yes',
]]

# Output field: the encoded target, kept as a single-column DataFrame.
dependent = ds.loc[:, ['classification_yes']]

In [7]:
# Hold out 30% of the rows as a test set; the remaining 70% is used for training.
from sklearn.model_selection import train_test_split

# random_state is meant to be an integer seed (or None / a RandomState).
# The previous value, False, only worked because bool is an int subclass equal
# to 0; writing the seed as 0 states the intent explicitly.
x_train, x_test, y_train, y_test = train_test_split(
    independent, dependent, test_size=0.30, random_state=0
)

In [8]:
# Standardise the features with z-score normalisation: (x - mean) / standard deviation.
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training rows only,
# then apply that same transformation to both splits.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [10]:
from sklearn.model_selection import GridSearchCV
# Model creation
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter grid for the random forest: 3 * 3 * 2 * 2 * 1 = 36 candidates.
# random_state is pinned to a single value so every candidate is reproducible.
parm_grid = {
    'n_estimators': [10, 50, 100],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_features': ['sqrt', 'log2'],
    'class_weight': ['balanced', 'balanced_subsample'],
    'random_state': [0],
}

# Exhaustive search using GridSearchCV's default cross-validation and scoring.
# refit=True retrains the best candidate on the full training split, so
# Classifier.predict / Classifier.predict_proba can be called directly afterwards.
Classifier = GridSearchCV(RandomForestClassifier(), parm_grid, refit=True, verbose=3, n_jobs=-1)

# y_train is a single-column DataFrame; passing it as-is makes scikit-learn emit
# a DataConversionWarning about a column-vector y. Flatten it to shape
# (n_samples,) before fitting.
Classifier.fit(x_train, y_train.to_numpy().ravel())

Fitting 5 folds for each of 36 candidates, totalling 180 fits


  return fit_method(estimator, *args, **kwargs)


In [11]:
# Keep the per-candidate cross-validation results (timings, per-split scores, ranks).
result = Classifier.cv_results_

In [12]:
# Tabulate the cross-validation results, one row per hyper-parameter candidate.
table = pd.DataFrame(result)

# Predict the held-out test rows with the refitted best estimator.
y_pred = Classifier.predict(x_test)

table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_features,param_n_estimators,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.031442,0.010116,0.004589,0.000143,balanced,gini,sqrt,10,0,"{'class_weight': 'balanced', 'criterion': 'gin...",0.982143,0.982143,0.910714,0.982143,0.981818,0.967792,0.028539,29
1,0.126941,0.010308,0.011099,0.003071,balanced,gini,sqrt,50,0,"{'class_weight': 'balanced', 'criterion': 'gin...",1.0,0.946429,0.928571,0.964286,1.0,0.967857,0.028571,25
2,0.285817,0.097537,0.016427,0.005116,balanced,gini,sqrt,100,0,"{'class_weight': 'balanced', 'criterion': 'gin...",1.0,0.964286,0.928571,0.964286,0.981818,0.967792,0.023651,29
3,0.058752,0.031013,0.004798,0.000647,balanced,gini,log2,10,0,"{'class_weight': 'balanced', 'criterion': 'gin...",1.0,0.946429,0.982143,0.964286,1.0,0.978571,0.020825,11
4,0.12613,0.046602,0.009449,0.001521,balanced,gini,log2,50,0,"{'class_weight': 'balanced', 'criterion': 'gin...",1.0,0.982143,0.982143,0.982143,1.0,0.989286,0.008748,1
5,0.342249,0.086804,0.01391,0.002329,balanced,gini,log2,100,0,"{'class_weight': 'balanced', 'criterion': 'gin...",1.0,0.982143,0.982143,0.964286,1.0,0.985714,0.013363,3
6,0.065741,0.030734,0.007202,0.005068,balanced,entropy,sqrt,10,0,"{'class_weight': 'balanced', 'criterion': 'ent...",0.946429,1.0,0.928571,0.982143,1.0,0.971429,0.029014,21
7,0.191088,0.07433,0.009636,0.002991,balanced,entropy,sqrt,50,0,"{'class_weight': 'balanced', 'criterion': 'ent...",0.982143,0.946429,0.928571,0.964286,1.0,0.964286,0.025254,31
8,0.322358,0.091072,0.019668,0.012989,balanced,entropy,sqrt,100,0,"{'class_weight': 'balanced', 'criterion': 'ent...",1.0,0.946429,0.928571,0.964286,1.0,0.967857,0.028571,25
9,0.046848,0.015298,0.006394,0.002568,balanced,entropy,log2,10,0,"{'class_weight': 'balanced', 'criterion': 'ent...",1.0,0.946429,0.982143,0.964286,1.0,0.978571,0.020825,11


In [13]:
# Confusion matrix of the actual vs. predicted labels on the test split.
from sklearn.metrics import confusion_matrix

matric = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(matric)

[[45  0]
 [ 0 75]]


In [14]:
# Per-class precision, recall, F1 and support on the test split.
from sklearn.metrics import classification_report

report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00        45
        True       1.00      1.00      1.00        75

    accuracy                           1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120



In [15]:
# Overall F1 score of the best model on the test split.
# NOTE: despite the variable name, the score uses average='weighted'
# (per-class F1 weighted by support), not the macro average.
from sklearn.metrics import f1_score

f1_macro = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
print(
    "The best value for Parameter {}:".format(Classifier.best_params_),
    f1_macro,
)

The best value for Parameter {'class_weight': 'balanced', 'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 50, 'random_state': 0}: 1.0


In [16]:
# Area under the receiver operating characteristic (ROC) curve on the test split:
# how well the predicted probabilities separate class 0 from class 1.
from sklearn.metrics import roc_auc_score

# predict_proba returns one column per class; [:, 1] selects, for every row,
# the second column, i.e. the probability assigned to the second class.
roc_auc = roc_auc_score(
    y_true=y_test,
    y_score=Classifier.predict_proba(x_test)[:, 1],
)
print(
    "The best value for Parameter {}:".format(Classifier.best_params_),
    roc_auc,
)

The best value for Parameter {'class_weight': 'balanced', 'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 50, 'random_state': 0}: 1.0
