In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

# Let pandas print up to 30 columns so wide previews are not truncated.
pd.options.display.max_columns = 30

# Read the kidney dataset; cells containing "?" are treated as missing values.
data = pd.read_csv('kidneyChronic.csv', na_values=["?"])
data.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd


In [2]:
# Summary statistics for the columns pandas parsed as numeric.
# NOTE(review): 'pcv', 'wbcc' and 'rbcc' are listed in num_cols but do not show up
# in this summary, so they were presumably read as non-numeric (object) columns —
# check for stray non-numeric tokens in the CSV before treating them as numbers.
# NOTE(review): the 'sod' minimum (4.5) and 'pot' maximum (47.0) sit far from the
# quartiles and look like possible data-entry errors — TODO confirm.
data.describe()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo
count,391.0,388.0,353.0,354.0,351.0,356.0,381.0,383.0,313.0,312.0,348.0
mean,51.483376,76.469072,1.017408,1.016949,0.450142,148.036517,57.425722,3.072454,137.528754,4.627244,12.526437
std,17.169714,13.683637,0.005717,1.352679,1.099191,79.281714,50.503006,5.741126,10.408752,3.193904,2.912587
min,2.0,50.0,1.005,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1
25%,42.0,70.0,1.01,0.0,0.0,99.0,27.0,0.9,135.0,3.8,10.3
50%,55.0,80.0,1.02,0.0,0.0,121.0,42.0,1.3,138.0,4.4,12.65
75%,64.5,80.0,1.02,2.0,0.0,163.0,66.0,2.8,142.0,4.9,15.0
max,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8


In [3]:
# Count the missing entries in each column.
data.isna().sum()

age        9
bp        12
sg        47
al        46
su        49
rbc      152
pc        65
pcc        4
ba         4
bgr       44
bu        19
sc        17
sod       87
pot       88
hemo      52
pcv       70
wbcc     105
rbcc     130
htn        2
dm         2
cad        2
appet      1
pe         1
ane        1
class      0
dtype: int64

In [4]:
# Names of the feature columns that hold numeric measurements.
num_cols = [
    'age', 'bp', 'sg', 'al', 'su',
    'bgr', 'bu', 'sc', 'sod', 'pot',
    'hemo', 'pcv', 'wbcc', 'rbcc',
]

In [5]:
# Categorical features: every column that is neither the target nor numeric.
cate_cols = data.columns.drop(['class', *num_cols])
cate_cols

Index(['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane'], dtype='object')

In [6]:
# Positional split: the first 24 columns are the features (X) and
# column 24 ('class') is the target (y).
X, y = data.iloc[:, :24].values, data.iloc[:, 24].values

In [8]:
# Fill the gaps in X (in place), then wrap the result in a labelled DataFrame.
#
# Every feature column is imputed with its most frequent value, exactly once.
# The columns are split into two groups that each get their own imputer:
# the numeric ones named in `num_cols` and the remaining categorical ones.
# Together the groups cover all 24 feature columns, including the last one
# ('ane', index 23), which a slice ending at 23 would leave un-imputed.
feature_names = [
    'age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'sc',
    'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc', 'htn', 'dm', 'cad', 'appet',
    'pe', 'ane',
]
assert X.shape[1] == len(feature_names), "X must hold exactly the 24 feature columns"

# Column positions of each group inside X.
numeric_idx = [i for i, name in enumerate(feature_names) if name in num_cols]
categorical_idx = [i for i, name in enumerate(feature_names) if name not in num_cols]

# NOTE(review): the mode is kept for the numeric columns so the imputed values
# stay the same as before; consider the median for continuous measurements.
imp_numerical = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X[:, numeric_idx] = imp_numerical.fit_transform(X[:, numeric_idx])

imp_categorical = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X[:, categorical_idx] = imp_categorical.fit_transform(X[:, categorical_idx])

# copy=True keeps dff independent of any later change to X.
dff = pd.DataFrame(X, columns=feature_names, copy=True)
assert not dff.isnull().to_numpy().any(), "imputation left missing values behind"

# Preview only the first rows instead of dumping the whole frame.
dff.head(10)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane
0,48,80,1.02,1,0,normal,normal,notpresent,notpresent,121,36,1.2,135,3.5,15.4,44,7800,5.2,yes,yes,no,good,no,no
1,7,50,1.02,4,0,normal,normal,notpresent,notpresent,99,18,0.8,135,3.5,11.3,38,6000,5.2,no,no,no,good,no,no
2,62,80,1.01,2,3,normal,normal,notpresent,notpresent,423,53,1.8,135,3.5,9.6,31,7500,5.2,no,yes,no,poor,no,yes
3,48,70,1.005,4,0,normal,abnormal,present,notpresent,117,56,3.8,111,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes
4,51,80,1.01,2,0,normal,normal,notpresent,notpresent,106,26,1.4,135,3.5,11.6,35,7300,4.6,no,no,no,good,no,no
5,60,90,1.015,3,0,normal,normal,notpresent,notpresent,74,25,1.1,142,3.2,12.2,39,7800,4.4,yes,yes,no,good,yes,no
6,68,70,1.01,0,0,normal,normal,notpresent,notpresent,100,54,24,104,4,12.4,36,9800,5.2,no,no,no,good,no,no
7,24,80,1.015,2,4,normal,abnormal,notpresent,notpresent,410,31,1.1,135,3.5,12.4,44,6900,5,no,yes,no,good,yes,no
8,52,100,1.015,3,0,normal,abnormal,present,notpresent,138,60,1.9,135,3.5,10.8,33,9600,4,yes,yes,no,good,no,yes
9,53,90,1.02,2,0,abnormal,abnormal,present,notpresent,70,107,7.2,114,3.7,9.5,29,12100,3.7,yes,yes,no,poor,no,yes
