In [1]:
import pandas as pd
import numpy as np

In [39]:
df=pd.read_csv('chronic_kidney_disease.csv', names=['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'sc', 'sod', \
                                                    'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'class'],
               na_values='?')

In [40]:
#print misssing value rate (number)
for i in df.select_dtypes(include=np.number).columns.tolist():
    n_miss = df[[i]].isnull().sum()
    perc = n_miss / df.shape[0] * 100
    print('> %s, Missing: %d (%.1f%%)' % (i, n_miss, perc))

> age, Missing: 9 (2.2%)
> bp, Missing: 12 (3.0%)
> sg, Missing: 47 (11.8%)
> al, Missing: 46 (11.5%)
> su, Missing: 49 (12.2%)
> bgr, Missing: 44 (11.0%)
> bu, Missing: 19 (4.8%)
> sc, Missing: 17 (4.2%)
> sod, Missing: 87 (21.8%)
> pot, Missing: 88 (22.0%)
> hemo, Missing: 52 (13.0%)
> pcv, Missing: 71 (17.8%)
> wc, Missing: 106 (26.5%)
> rc, Missing: 131 (32.8%)


In [41]:
#print misssing value rate (object)
for i in df.select_dtypes(include=np.object).columns.tolist():
    n_miss = df[[i]].isnull().sum()
    perc = n_miss / df.shape[0] * 100
    print('> %s, Missing: %d (%.1f%%)' % (i, n_miss, perc))

> rbc, Missing: 152 (38.0%)
> pc, Missing: 65 (16.2%)
> pcc, Missing: 4 (1.0%)
> ba, Missing: 4 (1.0%)
> htn, Missing: 2 (0.5%)
> dm, Missing: 2 (0.5%)
> cad, Missing: 2 (0.5%)
> appet, Missing: 1 (0.2%)
> pe, Missing: 1 (0.2%)
> ane, Missing: 1 (0.2%)
> class, Missing: 0 (0.0%)


In [42]:
#replace nan to mean (integer)
replace_dict = {'age': {np.nan: pd.to_numeric(df.age, errors='coerce').mean()}, 
                'bp':{np.nan: pd.to_numeric(df.bp, errors='coerce').mean()}, 
                'sg':{np.nan: pd.to_numeric(df.bp, errors='coerce').mean()}, 
                'al':{np.nan: pd.to_numeric(df.al, errors='coerce').mean()},
                'su':{np.nan: pd.to_numeric(df.su, errors='coerce').mean()},
                'bgr':{np.nan: pd.to_numeric(df.bgr, errors='coerce').mean()},
                'bu':{np.nan: pd.to_numeric(df.bu, errors='coerce').mean()},
                'sc':{np.nan: pd.to_numeric(df.sc, errors='coerce').mean()},
                'sod':{np.nan: pd.to_numeric(df.sod, errors='coerce').mean()},
                'pot':{np.nan: pd.to_numeric(df.pot, errors='coerce').mean()},
                'hemo':{np.nan: pd.to_numeric(df.hemo, errors='coerce').mean()},
                'pcv':{np.nan: pd.to_numeric(df.pcv, errors='coerce').mean()},
                'wc':{np.nan: pd.to_numeric(df.wc, errors='coerce').mean()},
                'rc':{np.nan: pd.to_numeric(df.rc, errors='coerce').mean()},
                'htn':{np.nan: pd.to_numeric(df.htn, errors='coerce').mean()},
                'dm':{np.nan: pd.to_numeric(df.dm, errors='coerce').mean()},
                'cad':{np.nan: pd.to_numeric(df.cad, errors='coerce').mean()},
                'appet':{np.nan: pd.to_numeric(df.appet, errors='coerce').mean()},
                'pe':{np.nan: pd.to_numeric(df.pe, errors='coerce').mean()},
                'ane':{np.nan: pd.to_numeric(df.ane, errors='coerce').mean()}
               }
df.replace(replace_dict, inplace=True)

In [43]:
#replace nan to most frequent (object)
for i in df.select_dtypes(include=np.object).columns.tolist():
    df[i].fillna(df[i].mode()[0], inplace=True)

age		-	age	
bp		-	blood pressure
sg		-	specific gravity
al		-   albumin
su		-	sugar
rbc		-	red blood cells
pc		-	pus cell
pcc		-	pus cell clumps
ba		-	bacteria
bgr		-	blood glucose random
bu		-	blood urea
sc		-	serum creatinine
sod		-	sodium
pot		-	potassium
hemo	-	hemoglobin
pcv		-	packed cell volume
wc		-	white blood cell count
rc		-	red blood cell count
htn		-	hypertension
dm		-	diabetes mellitus
cad		-	coronary artery disease
appet	-	appetite
pe		-	pedal edema
ane		-	anemia
class	-	class	

In [44]:
#Encoding variable (object to ordinal number)
from sklearn.preprocessing import LabelEncoder  
label_encoder= LabelEncoder()  

for i in df.select_dtypes(include=np.object).columns.tolist():
    df[i]=label_encoder.fit_transform(df[i])

In [45]:
X=df.iloc[:,:-1]
y=df.iloc[:,-1]

In [46]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size= 0.2, random_state=0)

In [47]:
from sklearn.preprocessing import StandardScaler
st_X= StandardScaler()
X_train= st_X.fit_transform(X_train)
X_train

array([[-1.50935703, -1.18319374, -0.37787   , ..., -0.51460872,
        -0.4853053 , -0.3992747 ],
       [ 0.23212848,  0.28483932, -0.37827075, ..., -0.51460872,
        -0.4853053 , -0.3992747 ],
       [-0.92886186,  0.28483932, -0.37847113, ..., -0.51460872,
         2.06055857, -0.3992747 ],
       ...,
       [ 0.02799015, -0.44917721, -0.37787   , ..., -0.51460872,
        -0.4853053 , -0.3992747 ],
       [-2.32205027,  0.28483932, -0.37827075, ..., -0.51460872,
        -0.4853053 , -0.3992747 ],
       [ 0.63847509,  0.28483932, -0.37827075, ..., -0.51460872,
        -0.4853053 , -0.3992747 ]])

In [48]:
X_test= st_X.transform(X_test)
X_test

array([[-5.81191096e-02,  2.56633840e-02,  2.64575125e+00, ...,
        -5.14608720e-01,  2.06055857e+00,  2.50454133e+00],
       [-6.95926115e-05, -1.18319374e+00, -3.77870002e-01, ...,
        -5.14608720e-01, -4.85305301e-01, -3.99274705e-01],
       [ 6.96524611e-01, -4.49177207e-01, -3.77669627e-01, ...,
        -5.14608720e-01, -4.85305301e-01, -3.99274705e-01],
       ...,
       [-1.04496090e+00,  2.84839324e-01, -3.77669627e-01, ...,
        -5.14608720e-01, -4.85305301e-01, -3.99274705e-01],
       [-1.27715897e+00,  2.84839324e-01, -3.77870002e-01, ...,
        -5.14608720e-01, -4.85305301e-01, -3.99274705e-01],
       [-1.04496090e+00, -1.18319374e+00, -3.77669627e-01, ...,
        -5.14608720e-01, -4.85305301e-01, -3.99274705e-01]])