# ML workflow for  Chronic Kidney Disease

In [1]:
# Imports 
from pipeline_perso import *
from sklearn.pipeline import Pipeline

In [2]:
dtypes = {'pcv':'float64','wc':'float64','rc':'float64'}
data = pd.read_csv('kidney_disease.csv',dtype=dtypes,na_values='\t?')
data = data.dropna()

In [3]:
data

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
9,9,53.0,90.0,1.020,2.0,0.0,abnormal,abnormal,present,notpresent,...,29.0,12100.0,3.7,yes,yes,no,poor,no,yes,ckd
11,11,63.0,70.0,1.010,3.0,0.0,abnormal,abnormal,present,notpresent,...,32.0,4500.0,3.8,yes,yes,no,poor,yes,no,ckd
14,14,68.0,80.0,1.010,3.0,2.0,normal,abnormal,present,present,...,16.0,11000.0,2.6,yes,yes,yes,poor,yes,no,ckd
20,20,61.0,80.0,1.015,2.0,0.0,abnormal,abnormal,notpresent,notpresent,...,24.0,9200.0,3.2,yes,yes,yes,poor,yes,yes,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,47.0,6700.0,4.9,no,no,no,good,no,no,notckd
396,396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,54.0,7800.0,6.2,no,no,no,good,no,no,notckd
397,397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,49.0,6600.0,5.4,no,no,no,good,no,no,notckd
398,398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,51.0,7200.0,5.9,no,no,no,good,no,no,notckd


In [4]:
list(data.columns)

['id',
 'age',
 'bp',
 'sg',
 'al',
 'su',
 'rbc',
 'pc',
 'pcc',
 'ba',
 'bgr',
 'bu',
 'sc',
 'sod',
 'pot',
 'hemo',
 'pcv',
 'wc',
 'rc',
 'htn',
 'dm',
 'cad',
 'appet',
 'pe',
 'ane',
 'classification']

In [5]:
data.reset_index(drop=True,inplace=True)
data.drop(['id'],axis=1,inplace=True)

In [6]:
data[data.columns[data.dtypes=='object']]

Unnamed: 0,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane,classification
0,normal,abnormal,present,notpresent,yes,no,no,poor,yes,yes,ckd
1,abnormal,abnormal,present,notpresent,yes,yes,no,poor,no,yes,ckd
2,abnormal,abnormal,present,notpresent,yes,yes,no,poor,yes,no,ckd
3,normal,abnormal,present,present,yes,yes,yes,poor,yes,no,ckd
4,abnormal,abnormal,notpresent,notpresent,yes,yes,yes,poor,yes,yes,ckd
...,...,...,...,...,...,...,...,...,...,...,...
153,normal,normal,notpresent,notpresent,no,no,no,good,no,no,notckd
154,normal,normal,notpresent,notpresent,no,no,no,good,no,no,notckd
155,normal,normal,notpresent,notpresent,no,no,no,good,no,no,notckd
156,normal,normal,notpresent,notpresent,no,no,no,good,no,no,notckd


In [7]:
data.dtypes

age               float64
bp                float64
sg                float64
al                float64
su                float64
rbc                object
pc                 object
pcc                object
ba                 object
bgr               float64
bu                float64
sc                float64
sod               float64
pot               float64
hemo              float64
pcv               float64
wc                float64
rc                float64
htn                object
dm                 object
cad                object
appet              object
pe                 object
ane                object
classification     object
dtype: object

In [8]:
TrainTest=TrainTestGenerator(1,test_size=0.3,data=data)
for train,test in TrainTest:
    X_train = np.array(data.drop(['classification'],axis=1))[train]
    X_test  = np.array(data.drop(['classification'],axis=1))[test]
    y_train = np.array(data.classification)[train]
    y_test  = np.array(data.classification)[test]

In [13]:
from sklearn.model_selection import KFold,GridSearchCV
kf = KFold(n_splits=4, random_state=None, shuffle=True)

In [10]:
from catboost import CatBoostClassifier
parameters = {'depth': [6,8,10],'learning_rate' : [0.01, 0.05, 0.1],'iterations'    : [30, 50, 100]}

grid_perso = GridSearchHyperParamsCV(model=CatBoostClassifier(), parameters = parameters, cv_splitter = kf, n_jobs=-1 ,verbose=10)

grid = GridSearchCV(estimator=CatBoostClassifier(), param_grid = parameters, cv = kf, n_jobs=-1)

pipe_perso = Pipeline([('cat_trans', CategoricalTransformer(strategy='ordinal_encoding')), ('catboost', grid_perso)])

pipe = Pipeline([('cat_trans', CategoricalTransformer(strategy='ordinal_encoding')), ('catboost', grid)])

pipe_perso.fit(X_train,y_train)

pipe.fit(X_train,y_train)

print(pipe_perso['catboost']._best_score)
print(pipe_perso['catboost']._best_params)



print(pipe.score(X_test,y_test))
print(pipe['catboost'].best_params_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done  23 out of  27 | elapsed:    5.3s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    6.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1465s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  20 out of  27 | elapsed:    2.0s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    3.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1443s.) Setting batch_

0:	learn: 0.6742812	total: 53.1ms	remaining: 1.54s
1:	learn: 0.6583270	total: 56ms	remaining: 784ms
2:	learn: 0.6427431	total: 58.1ms	remaining: 523ms
3:	learn: 0.6279857	total: 59.6ms	remaining: 388ms
4:	learn: 0.6131546	total: 60.8ms	remaining: 304ms
5:	learn: 0.6006363	total: 61.8ms	remaining: 247ms
6:	learn: 0.5865696	total: 62.8ms	remaining: 206ms
7:	learn: 0.5697080	total: 63.9ms	remaining: 176ms
8:	learn: 0.5550653	total: 64.9ms	remaining: 151ms
9:	learn: 0.5396684	total: 66ms	remaining: 132ms
10:	learn: 0.5269447	total: 67.1ms	remaining: 116ms
11:	learn: 0.5104881	total: 67.8ms	remaining: 102ms
12:	learn: 0.4976429	total: 68.9ms	remaining: 90.1ms
13:	learn: 0.4842989	total: 70ms	remaining: 80ms
14:	learn: 0.4697757	total: 70.5ms	remaining: 70.5ms
15:	learn: 0.4585434	total: 71.5ms	remaining: 62.6ms
16:	learn: 0.4470326	total: 72.6ms	remaining: 55.5ms
17:	learn: 0.4342732	total: 73.6ms	remaining: 49.1ms
18:	learn: 0.4238677	total: 74.6ms	remaining: 43.2ms
19:	learn: 0.4113671	to

In [14]:
from sklearn.svm import SVC
parameters =  {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }

grid_perso = GridSearchHyperParamsCV(model=SVC(), parameters = parameters, cv_splitter = kf, n_jobs=-1 ,verbose=10)

grid = GridSearchCV(estimator=SVC(), param_grid = parameters, cv = kf, n_jobs=-1)

pipe_perso = Pipeline([('cat_trans', CategoricalTransformer(strategy='ordinal_encoding')), ('svc', grid_perso)])

pipe = Pipeline([('cat_trans', CategoricalTransformer(strategy='ordinal_encoding')), ('svc', grid)])

pipe_perso.fit(X_train,y_train)

pipe.fit(X_train,y_train)

print(pipe_perso['svc']._best_score)
print(pipe_perso['svc']._best_params)



print(pipe.score(X_test,y_test))
print(pipe['svc'].best_params_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1706s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:    2.5s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    2.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0064s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0164s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:   

0.7562830687830687
{'C': 1000.0, 'gamma': 0.0001}
0.7021276595744681
{'C': 1000.0, 'gamma': 0.001}


[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.0s finished
