In [62]:
#Dataset: Chronic_Kidney_Disease (https://archive.ics.uci.edu/ml/datasets/Chronic_Kidney_Disease#) 
#
#Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. 
#Irvine, CA: University of California, School of Information and Computer Science.

import pandas as pd
import numpy as np

import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [22]:
# Load the data from a CSV file prepared from the original ARFF file.
df = pd.read_csv(filepath_or_buffer='chronic_kidney_disease.csv')

In [23]:
# Mark missing values (the '?' placeholder) with the sentinel value -1.
df = df.replace(to_replace='?', value=-1)

In [29]:
# Data preparation: numeric columns are cast to int or float, nominal columns
# are integer-encoded with factorize() into new "<name>_cat" columns.

# Columns cast to int (going through str first).
for col in ('age', 'bp'):
    df[col] = df[col].astype('str').astype('int')

# Columns cast to float.
for col in ('bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc'):
    df[col] = df[col].astype('float')

# Columns encoded with factorize(); the source columns are left unchanged and
# the codes are stored next to them under the "_cat" suffix.
# NOTE(review): a -1 missing-value sentinel in these columns is encoded like
# any other value (it gets its own code) — confirm this is intended.
for col in ('sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba',
            'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'class'):
    df[col + '_cat'] = df[col].factorize()[0]

In [55]:
# Feature selection: the columns used as model inputs.
feats = [
    # cast to int earlier in the notebook
    'age', 'bp',
    # factorize()-encoded columns
    'sg_cat', 'al_cat', 'su_cat', 'rbc_cat', 'pc_cat', 'pcc_cat', 'ba_cat',
    # cast to float earlier in the notebook
    'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc',
    # factorize()-encoded columns
    'htn_cat', 'dm_cat', 'cad_cat', 'appet_cat', 'pe_cat', 'ane_cat',
]

In [57]:
# Convert the -1 missing-value sentinel into NaN across the whole frame.
df = df.replace(to_replace=-1, value=np.nan)

In [66]:
# Preview the first rows of the selected feature columns.
df.loc[:, feats].head()

Unnamed: 0,age,bp,sg_cat,al_cat,su_cat,rbc_cat,pc_cat,pcc_cat,ba_cat,bgr,...,hemo,pcv,wbcc,rbcc,htn_cat,dm_cat,cad_cat,appet_cat,pe_cat,ane_cat
0,48.0,80.0,0,0,0,0,0,0,0,121.0,...,15.4,44.0,7800.0,5.2,0,0,0,0,0,0
1,7.0,50.0,0,1,0,0,0,0,0,,...,11.3,38.0,6000.0,,1,1,0,0,0,0
2,62.0,80.0,1,2,1,1,0,0,0,423.0,...,9.6,31.0,7500.0,,1,0,0,1,0,1
3,48.0,70.0,2,1,0,1,1,1,0,117.0,...,11.2,32.0,6700.0,3.9,0,1,0,1,1,1
4,51.0,80.0,1,2,0,1,0,0,0,106.0,...,11.6,35.0,7300.0,4.6,1,1,0,0,0,0


In [69]:
# Summarise dtypes and non-null counts of the selected feature columns.
df.loc[:, feats].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 24 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        391 non-null    float64
 1   bp         388 non-null    float64
 2   sg_cat     400 non-null    int64  
 3   al_cat     400 non-null    int64  
 4   su_cat     400 non-null    int64  
 5   rbc_cat    400 non-null    int64  
 6   pc_cat     400 non-null    int64  
 7   pcc_cat    400 non-null    int64  
 8   ba_cat     400 non-null    int64  
 9   bgr        356 non-null    float64
 10  bu         381 non-null    float64
 11  sc         383 non-null    float64
 12  sod        313 non-null    float64
 13  pot        312 non-null    float64
 14  hemo       348 non-null    float64
 15  pcv        329 non-null    float64
 16  wbcc       294 non-null    float64
 17  rbcc       269 non-null    float64
 18  htn_cat    400 non-null    int64  
 19  dm_cat     400 non-null    int64  
 20  cad_cat   

In [71]:
# Feature matrix and target vector as NumPy arrays.
X = df[feats].to_numpy()
y = df['class_cat'].to_numpy()

In [72]:
# Split the data into a training set and a test set.
test_size, seed = 0.3, 7
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)

In [73]:
# Create and train the model.
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# Evaluate the trained model on the held-out test data.
# XGBClassifier.predict() already returns discrete class labels, so the
# per-element round() pass was a no-op and has been dropped. `predictions`
# is kept as a name in case later cells refer to it.
y_pred = model.predict(X_test)
predictions = y_pred

accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 95.83%
