In [0]:
import pandas as pd   #import pandas
import numpy as np    #import numpy
from sklearn.preprocessing import LabelEncoder #import LabelEncoder
import warnings
warnings.filterwarnings("ignore")

In [0]:
#read the data
data = pd.read_csv('/content/drive/My Drive/kidney_disease.csv')

In [0]:
#check the shape of the data
print('Dataset has {} number of rows and {} Number of columns'.format ( data.shape[0], data.shape[1]) ) 

Dataset has 400 number of rows and 26 Number of columns


In [0]:
#check the head of the data
data.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd


In [0]:
#check the information of the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

In [0]:
 catg_col = data[['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'classification']]
 catg_col.head(10)

Unnamed: 0,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane,classification
0,,normal,notpresent,notpresent,yes,yes,no,good,no,no,ckd
1,,normal,notpresent,notpresent,no,no,no,good,no,no,ckd
2,normal,normal,notpresent,notpresent,no,yes,no,poor,no,yes,ckd
3,normal,abnormal,present,notpresent,yes,no,no,poor,yes,yes,ckd
4,normal,normal,notpresent,notpresent,no,no,no,good,no,no,ckd
5,,,notpresent,notpresent,yes,yes,no,good,yes,no,ckd
6,,normal,notpresent,notpresent,no,no,no,good,no,no,ckd
7,normal,abnormal,notpresent,notpresent,no,yes,no,good,yes,no,ckd
8,normal,abnormal,present,notpresent,yes,yes,no,good,no,yes,ckd
9,abnormal,abnormal,present,notpresent,yes,yes,no,poor,no,yes,ckd


In [0]:
catg_col['classification'].value_counts()

ckd       248
notckd    150
ckd\t       2
Name: classification, dtype: int64

In [0]:
#replace ckd\t with notckd they are just missispelled
catg_col['classification'] = catg_col['classification'].replace('ckd\t','notckd')

In [0]:
catg_col['classification'].value_counts() #now check the value counts

ckd       248
notckd    152
Name: classification, dtype: int64

classification is our target variable here. We have 248 patients with Chronic kidney dieseas and 152 Not Chronic Kindey Diseases.

Our target variable is itself into categoric form. We have to convert it first. Let's assign ckd as 1 and notckd as 0.

In [0]:
#apply LabelEncoder
le = LabelEncoder()
c_en = le.fit_transform(catg_col['classification'])
print(c_en[ :10])
list(catg_col['classification'].head(10))

[0 0 0 0 0 0 0 0 0 0]


['ckd', 'ckd', 'ckd', 'ckd', 'ckd', 'ckd', 'ckd', 'ckd', 'ckd', 'ckd']

As expected we know that LabelEncoding assigns the ranks alphabatically. We wanted to assign ckd as 1 because they are confirmed cases with chronic kidney dieseas and notckd as 0. 

Encode using map

In [0]:
#make a dictionary
class_en = {'ckd' : 1, 'notckd' : 0}
catg_col['classification_en'] = catg_col['classification'].map(class_en)
catg_col['classification_en'].head(10)

0    1
1    1
2    1
3    1
4    1
5    1
6    1
7    1
8    1
9    1
Name: classification_en, dtype: int64

In [0]:
#drop the classification column
catg_col.drop('classification',axis = 1, inplace=True)

In [0]:
#check null values
catg_col.isnull().sum()

rbc                  152
pc                    65
pcc                    4
ba                     4
htn                    2
dm                     2
cad                    2
appet                  1
pe                     1
ane                    1
classification_en      0
dtype: int64

In [0]:
#fill the null values with the most frequent values
n_col = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
for i in n_col:
  catg_col[i].fillna(catg_col[i].mode()[0],inplace=True)

Now let's check the predictor variables and identify them as nominal or ordinal

In [0]:
#check the unique values in each columns
for i in catg_col:
  print(i + '    ',catg_col[i].unique())

rbc     ['normal' 'abnormal']
pc     ['normal' 'abnormal']
pcc     ['notpresent' 'present']
ba     ['notpresent' 'present']
htn     ['yes' 'no']
dm     ['yes' 'no' ' yes' '\tno' '\tyes']
cad     ['no' 'yes' '\tno']
appet     ['good' 'poor']
pe     ['no' 'yes']
ane     ['no' 'yes']
classification_en     [1 0]


Most of the columns has 2 labels. We can either use LabelEncoding or One Hot Encoding. as we have only 2 labels they will assign 1 and 0 only to the labels.


Before applying feature encoding on these columns. If we look properly we need to do some cleaning here. There are values present as '\tno', '\tyes' in dm and cad. There appears to be a typo error. Wo e have to replace '\tno' with no and 'tyes' with yes.

In [0]:
#clean and strip the values
catg_col['dm'] = catg_col['dm'].str.strip()
catg_col['cad'] = catg_col['dm'].str.strip()

Apply One Hot Encoding on all the nominal predictor variables as there are only 2 labels in each features. If you wish you can also apply LabelEncoding here.

In [0]:
catg_col.drop('classification_en',axis=1,inplace=True)  #drop the classification_en column as we have already encoded it.

In [0]:
#apply pd.get_dummies and keep drop_first=True
pd.get_dummies(catg_col,drop_first=True)

Unnamed: 0,rbc_normal,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_poor,pe_yes,ane_yes
0,1,1,0,0,1,1,1,0,0,0
1,1,1,0,0,0,0,0,0,0,0
2,1,1,0,0,0,1,1,1,0,1
3,1,0,1,0,1,0,0,1,1,1
4,1,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
395,1,1,0,0,0,0,0,0,0,0
396,1,1,0,0,0,0,0,0,0,0
397,1,1,0,0,0,0,0,0,0,0
398,1,1,0,0,0,0,0,0,0,0


Awesome! We have encoded the categorical feature. Now our features are ready to go into modelling.