In [1]:
import pandas as pd


In [7]:
dataset = pd.read_csv("CKD.csv")

In [9]:
#since the output column is a yes/no value , it should be converted to 1 or 0 format for processing.. This can be done by map method


In [11]:
dataset['classification'] = dataset['classification'].map({'yes':1,'no':0})

In [13]:
#check if the dataset has classification values as 0 and 1
dataset.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.0,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,1
1,3.0,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.0,12300.0,4.705597,no,no,no,yes,poor,no,1
2,4.0,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.0,...,34.0,8408.191126,4.705597,no,no,no,yes,poor,no,1
3,5.0,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,1
4,5.0,50.0,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.0,12400.0,4.705597,no,no,no,yes,poor,no,1


In [15]:
dataset[['classification']].value_counts() #check balanced or imbalanced through output target variable

classification
1                 249
0                 150
Name: count, dtype: int64

In [17]:
#convert the data to ordinal using pd.get dummies

In [19]:
dataset = pd.get_dummies(dataset,drop_first=True)

In [21]:
dataset.head()

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,rbc_normal,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes
0,2.0,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,True,False,False,False,False,False,False,True,True,False
1,3.0,76.459948,2.0,0.0,148.112676,22.0,0.7,137.528754,4.627244,10.7,...,True,True,False,False,False,False,False,True,False,False
2,4.0,76.459948,1.0,0.0,99.0,23.0,0.6,138.0,4.4,12.0,...,True,True,False,False,False,False,False,True,False,False
3,5.0,76.459948,1.0,0.0,148.112676,16.0,0.7,138.0,3.2,8.1,...,True,True,False,False,False,False,False,True,False,True
4,5.0,50.0,0.0,0.0,148.112676,25.0,0.6,137.528754,4.627244,11.8,...,True,True,False,False,False,False,False,True,False,False


In [23]:
independent = dataset[['age', 'bp', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hrmo', 'pcv',
       'wc', 'rc', 'sg_b', 'sg_c', 'sg_d', 'sg_e',
       'rbc_normal', 'pc_normal', 'pcc_present', 'ba_present', 'htn_yes',
       'dm_yes', 'cad_yes', 'appet_yes', 'pe_yes', 'ane_yes']]

In [25]:
dependent = dataset[['classification']].values.ravel()

In [27]:
from sklearn.model_selection import train_test_split

In [29]:
X_train, X_test, y_train, y_test  = train_test_split(independent,dependent,test_size=0.3)

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [59]:
pipeline = Pipeline([
  
    ('rf',RandomForestClassifier(random_state=42))
] 
)

In [61]:
param_grid = {'rf__n_estimators': [10,20,50,100,150],
              'rf__criterion':['entropy','gini', 'log_loss'],
              'rf__max_features':['sqrt','log2', None]
             }

In [63]:
model = GridSearchCV(estimator=pipeline,param_grid=param_grid,n_jobs=-1,refit=True,verbose=3,scoring='roc_auc')

In [65]:
model.fit(X_train,y_train)

Fitting 5 folds for each of 45 candidates, totalling 225 fits


In [67]:
model.best_params_

{'rf__criterion': 'entropy',
 'rf__max_features': 'log2',
 'rf__n_estimators': 50}

In [69]:
y_pred = model.predict(X_test)

In [73]:
from sklearn.metrics import confusion_matrix

In [75]:
confusion_matrix(y_test,y_pred)

array([[37,  2],
       [ 1, 80]], dtype=int64)

In [77]:
y_proba = model.predict_proba(X_test)[:,1]

In [79]:
from sklearn.metrics import classification_report, roc_auc_score

In [81]:
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.95      0.96        39
           1       0.98      0.99      0.98        81

    accuracy                           0.97       120
   macro avg       0.97      0.97      0.97       120
weighted avg       0.97      0.97      0.97       120



In [83]:
print("ROC AUC Score:", roc_auc_score(y_test, y_proba))

ROC AUC Score: 0.9993668882557772
