In [234]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import  classification_report,  ConfusionMatrixDisplay
from matplotlib.pylab import rcParams
#!pip install liac-arff
import arff

# 1 Preprocessing

### Replacing strings in the arff and creating a df

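The following chain of replacements is essential:

`line.replace('\t', '').replace(',,',',').replace('ckd,\n', 'ckd\n').replace(' ckd', 'ckd').replace(' yes','yes')`

It removes tab characters, collapses doubled commas, drops the trailing comma after the class label and strips the leading space before `ckd` and `yes`. Without these fixes the ARFF file cannot be loaded.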

In [235]:
# Clean the raw ARFF file line by line and write the result to processed/.
raw_path = "data/chronic_kidney_disease_full.arff"
clean_path = "processed/chronic_kidney_disease_full.arff"

# Make sure the output directory exists so the cell works on a fresh checkout.
os.makedirs(os.path.dirname(clean_path), exist_ok=True)

# Both handles are closed (and the output flushed) when the `with` block ends.
# This must happen BEFORE the cleaned file is read back below; otherwise the
# tail of the file may still sit in the write buffer.
with open(raw_path, "rt") as fin, open(clean_path, "wt") as fout:
    for line in fin:
        # Order matters: tabs are removed first, then doubled commas are collapsed,
        # then the trailing comma after the class label and stray leading spaces
        # before 'ckd' / 'yes' are dropped.
        fout.write(
            line.replace('\t', '')
            .replace(',,', ',')
            .replace('ckd,\n', 'ckd\n')
            .replace(' ckd', 'ckd')
            .replace(' yes', 'yes')
        )

# Parse the cleaned file; the handle is closed as soon as parsing is done.
with open(clean_path, "r") as cleaned:
    dataset = arff.load(cleaned)
print(dataset['description'])

1. Title: Early stage of Indians Chronic Kidney Disease(CKD)

2. Source Information:
  (a) Source:
Dr.P.Soundarapandian.M.D.,D.M
    (Senior Consultant Nephrologist),
Apollo  Hospitals,
Managiri,
Madurai Main Road,
Karaikudi,
Tamilnadu,
India.
  (b) Creator:
L.Jerlin Rubini(Research Scholar)
Alagappa University
EmailId   :jel.jerlin@gmail.com
ContactNo :+91-9597231281
  (c) Guided by:
Dr.P.Eswaran Assistant Professor,
Department of Computer Science and Engineering,
Alagappa University,
Karaikudi,
Tamilnadu,
India.
Emailid:eswaranperumal@gmail.com
  (d) Date     : july 2015

3.Relevant Information:
age-age
bp-blood pressure
sg-specific gravity
al-   albumin
su-sugar
rbc-red blood cells
pc-pus cell
pcc-pus cell clumps
ba-bacteria
bgr-blood glucose random
bu-blood urea
sc-serum creatinine
sod-sodium
pot-potassium
hemo-hemoglobin
pcv-packed cell volume
wc-white blood cell count
rc-red blood cell count
htn-hypertension
dm-diabetes mellitus
cad-coronary artery disease
appet-appetite
pe-pedal

In [236]:

# Short ARFF attribute name -> descriptive column name.
name_map = {
    'age':      'age',
    'bp':       'blood_pressure',
    'sg':       'specific_gravity',
    'al':       'albumin',
    'su':       'sugar',
    'rbc':      'red_blood_cells',
    'pc':       'pus_cell',
    'pcc':      'pus_cell_clumps',
    'ba':       'bacteria',
    'bgr':      'blood_glucose_random',
    'bu':       'blood_urea',
    'sc':       'serum_creatinine',
    'sod':      'sodium',
    'pot':      'potassium',
    'hemo':     'hemoglobin',
    'pcv':      'packed_cell_volume',
    'wbcc':     'white_blood_cell_count',
    'rbcc':     'red_blood_cell_count',
    'htn':      'hypertension',
    'dm':       'diabetes_mellitus',
    'cad':      'coronary_artery_disease',
    'appet':    'appetite',
    'pe':       'pedal_edema',
    'ane':      'anemia',
    'class':    'class',
}

# rename columns
# `.get(name, name)` keeps a name that is not a key of name_map as it is. This makes
# the cell safe to re-run: on a second run the attributes already carry the long
# names, and a plain name_map[...] lookup would raise KeyError.
dataset['attributes'] = [
    (name_map.get(attr[0], attr[0]), attr[1]) for attr in dataset['attributes']
]

numeric_columns = []
categorical_columns = []
column_names = []

# Split the attributes by declared type: 'NUMERIC' attributes are numeric, everything
# else (here: lists of allowed labels) is treated as categorical.
for attr_name, attr_type in dataset['attributes']:
    column_names.append(attr_name)
    if attr_type == 'NUMERIC':
        numeric_columns.append(attr_name)
    else:
        categorical_columns.append(attr_name)
    print(attr_name, '\n', attr_type)


print('\nnumeric:',numeric_columns)
print('categorical:',categorical_columns)

        

age 
 NUMERIC
blood_pressure 
 NUMERIC
specific_gravity 
 ['1.005', '1.010', '1.015', '1.020', '1.025']
albumin 
 ['0', '1', '2', '3', '4', '5']
sugar 
 ['0', '1', '2', '3', '4', '5']
red_blood_cells 
 ['normal', 'abnormal']
pus_cell 
 ['normal', 'abnormal']
pus_cell_clumps 
 ['present', 'notpresent']
bacteria 
 ['present', 'notpresent']
blood_glucose_random 
 NUMERIC
blood_urea 
 NUMERIC
serum_creatinine 
 NUMERIC
sodium 
 NUMERIC
potassium 
 NUMERIC
hemoglobin 
 NUMERIC
packed_cell_volume 
 NUMERIC
white_blood_cell_count 
 NUMERIC
red_blood_cell_count 
 NUMERIC
hypertension 
 ['yes', 'no']
diabetes_mellitus 
 ['yes', 'no']
coronary_artery_disease 
 ['yes', 'no']
appetite 
 ['good', 'poor']
pedal_edema 
 ['yes', 'no']
anemia 
 ['yes', 'no']
class 
 ['ckd', 'notckd']

numeric: ['age', 'blood_pressure', 'blood_glucose_random', 'blood_urea', 'serum_creatinine', 'sodium', 'potassium', 'hemoglobin', 'packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count']
categorical: ['spec

In [237]:
# Persist both column-name lists as plain text, one name per line.
for target_path, names in (
    ('processed/categorical_columns.txt', categorical_columns),
    ('processed/numerical_columns.txt', numeric_columns),
):
    np.savetxt(target_path, names, fmt='%s')

In [238]:
# Raw table: one row per ARFF record, labelled with the renamed attributes.
df = pd.DataFrame(
    data=dataset['data'],
    columns=column_names,
)
# Keep an untouched copy on disk (the index is not written).
df.to_csv(path_or_buf='processed/df_raw.csv', index=False)
df

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,...,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,pedal_edema,anemia,class
0,48.0,80.0,1.020,1,0,,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.020,4,0,,normal,notpresent,notpresent,,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.010,2,3,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4,0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.010,2,0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0,0,normal,normal,notpresent,notpresent,140.0,...,47.0,6700.0,4.9,no,no,no,good,no,no,notckd
396,42.0,70.0,1.025,0,0,normal,normal,notpresent,notpresent,75.0,...,54.0,7800.0,6.2,no,no,no,good,no,no,notckd
397,12.0,80.0,1.020,0,0,normal,normal,notpresent,notpresent,100.0,...,49.0,6600.0,5.4,no,no,no,good,no,no,notckd
398,17.0,60.0,1.025,0,0,normal,normal,notpresent,notpresent,114.0,...,51.0,7200.0,5.9,no,no,no,good,no,no,notckd


In [239]:
def preserve_cat(df: pd.DataFrame):
    """Return a recoded copy of ``df`` that keeps categorical columns categorical.

    Two kinds of recoding are applied; the input frame is not modified:

    * ``'yes'``/``'no'`` and ``'ckd'``/``'notckd'`` are replaced by
      ``True``/``False`` wherever they occur in the frame.
    * The numeric-looking labels of ``specific_gravity``, ``albumin`` and
      ``sugar`` get a leading underscore (``'1.005'`` -> ``'_1.005'``),
      presumably so later steps treat them as labels rather than numbers.

    The labels ``'present'``/``'notpresent'``, ``'good'``/``'poor'`` and
    ``'normal'``/``'abnormal'`` are deliberately left as strings. Missing
    values are left missing.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing at least the columns ``specific_gravity``,
        ``albumin`` and ``sugar``.

    Returns
    -------
    pd.DataFrame
        The recoded copy.

    Raises
    ------
    KeyError
        If one of the three prefixed columns is absent.
    """
    dataframe = df.copy()

    # Frame-wide label -> boolean recoding, applied in the original order.
    boolean_labels = (
        ('yes', True),      # htn, dm, cad, pe, ane
        ('no', False),
        ('ckd', True),      # class
        ('notckd', False),
    )
    for label, flag in boolean_labels:
        dataframe = dataframe.replace(label, flag)

    # Columns that now hold only booleans (e.g. `class`) become real bool columns.
    # Older pandas did this implicitly inside `replace`; that silent downcasting is
    # deprecated, so it is made explicit here. Columns with missing values stay object.
    dataframe = dataframe.infer_objects()

    # Labels that get a leading underscore, per column.
    prefixed_levels = {
        'specific_gravity': ('1.005', '1.010', '1.015', '1.020', '1.025'),
        'albumin': ('0', '1', '2', '3', '4', '5'),
        'sugar': ('0', '1', '2', '3', '4', '5'),
    }
    for column, levels in prefixed_levels.items():
        relabel = {level: '_' + level for level in levels}
        # Assign the result back instead of `dataframe[column].replace(..., inplace=True)`:
        # in-place edits through a selected column warn in pandas 2.x and do not
        # reach the frame under Copy-on-Write.
        dataframe[column] = dataframe[column].replace(relabel)

    return dataframe

# Recoded copy of the raw frame; preserve_cat works on a copy, so `df` is unchanged.
df_cat_preserved = preserve_cat(df)


def print_range_of_values(dataframe):
    """Print every column name of ``dataframe`` followed by its distinct values.

    For each column the name is printed, then a line break, then the array
    returned by ``Series.unique()`` (in order of first appearance). Nothing
    is returned.
    """
    for col in dataframe.columns:
        distinct_values = dataframe[col].unique()
        print(col, '\n', distinct_values)


# Check the recoding: list the distinct values of every categorical column.
print_range_of_values(df_cat_preserved[categorical_columns])

specific_gravity 
 ['_1.020' '_1.010' '_1.005' '_1.015' None '_1.025']
albumin 
 ['_1' '_4' '_2' '_3' '_0' None '_5']
sugar 
 ['_0' '_3' '_4' '_1' None '_2' '_5']
red_blood_cells 
 [None 'normal' 'abnormal']
pus_cell 
 ['normal' 'abnormal' None]
pus_cell_clumps 
 ['notpresent' 'present' None]
bacteria 
 ['notpresent' 'present' None]
hypertension 
 [True False None]
diabetes_mellitus 
 [True False None]
coronary_artery_disease 
 [False True None]
appetite 
 ['good' 'poor' None]
pedal_edema 
 [False True None]
anemia 
 [False True None]
class 
 [ True False]


In [240]:

# one-hot encoding preserving nan values: dummy_na=True adds a "<column>_nan"
# indicator column next to the per-label columns.
# The dtype is pinned to uint8 so the indicators are 0/1 (as in the preview below)
# on every pandas version; since pandas 2.0 the default would be booleans, which
# would also change the contents of the saved CSV.
df_encoded = pd.get_dummies(df_cat_preserved, dummy_na=True, dtype=np.uint8)

## save to csv
df_encoded.to_csv('processed/df_encoded.csv', index=False)

In [241]:
# List the column names of the encoded frame to inspect the generated indicator columns.
print(df_encoded.columns)

Index(['age', 'blood_pressure', 'blood_glucose_random', 'blood_urea',
       'serum_creatinine', 'sodium', 'potassium', 'hemoglobin',
       'packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count',
       'class', 'specific_gravity__1.005', 'specific_gravity__1.010',
       'specific_gravity__1.015', 'specific_gravity__1.020',
       'specific_gravity__1.025', 'specific_gravity_nan', 'albumin__0',
       'albumin__1', 'albumin__2', 'albumin__3', 'albumin__4', 'albumin__5',
       'albumin_nan', 'sugar__0', 'sugar__1', 'sugar__2', 'sugar__3',
       'sugar__4', 'sugar__5', 'sugar_nan', 'red_blood_cells_abnormal',
       'red_blood_cells_normal', 'red_blood_cells_nan', 'pus_cell_abnormal',
       'pus_cell_normal', 'pus_cell_nan', 'pus_cell_clumps_notpresent',
       'pus_cell_clumps_present', 'pus_cell_clumps_nan', 'bacteria_notpresent',
       'bacteria_present', 'bacteria_nan', 'hypertension_False',
       'hypertension_True', 'hypertension_nan', 'diabetes_mellitus_Fals

In [242]:
# Preview the first rows of the encoded frame.
df_encoded.head()

Unnamed: 0,age,blood_pressure,blood_glucose_random,blood_urea,serum_creatinine,sodium,potassium,hemoglobin,packed_cell_volume,white_blood_cell_count,...,coronary_artery_disease_nan,appetite_good,appetite_poor,appetite_nan,pedal_edema_False,pedal_edema_True,pedal_edema_nan,anemia_False,anemia_True,anemia_nan
0,48.0,80.0,121.0,36.0,1.2,,,15.4,44.0,7800.0,...,0,1,0,0,1,0,0,1,0,0
1,7.0,50.0,,18.0,0.8,,,11.3,38.0,6000.0,...,0,1,0,0,1,0,0,1,0,0
2,62.0,80.0,423.0,53.0,1.8,,,9.6,31.0,7500.0,...,0,0,1,0,1,0,0,0,1,0
3,48.0,70.0,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,...,0,0,1,0,0,1,0,0,1,0
4,51.0,80.0,106.0,26.0,1.4,,,11.6,35.0,7300.0,...,0,1,0,0,1,0,0,1,0,0


### Encode to a numeric representation


In [243]:
def numer(df: pd.DataFrame):
    """Return a copy of ``df`` with the binary text labels recoded as 0/1.

    The replacements are applied to the whole frame, one after another, in the
    order listed below. The input frame itself is not modified. Values that are
    not listed (for example the labels of ``specific_gravity``, ``albumin`` and
    ``sugar``) are passed through unchanged.
    """
    # (label, code) pairs; the trailing comments name the affected source attributes.
    label_codes = (
        ('yes', 1),         # ['htn', 'dm', 'cad', 'pe', 'ane']
        ('no', 0),
        ('present', 1),     # ['pcc', 'ba']
        ('notpresent', 0),
        ('good', 1),        # ['appet']
        ('poor', 0),
        ('ckd', 1),         # ['class']
        ('notckd', 0),
        ('normal', 1),      # ['rbc', 'pc']
        ('abnormal', 0),
    )

    encoded = df.copy()
    for label, code in label_codes:
        encoded = encoded.replace(label, code)
    return encoded

# 0/1-coded copy of the raw frame; numer works on a copy, so `df` is unchanged.
df_numeric = numer(df)

# print_range_of_values is already defined in the cell that builds df_cat_preserved,
# so it is reused here rather than defined a second time.
print_range_of_values(df_numeric[categorical_columns])

specific_gravity 
 ['1.020' '1.010' '1.005' '1.015' None '1.025']
albumin 
 ['1' '4' '2' '3' '0' None '5']
sugar 
 ['0' '3' '4' '1' None '2' '5']
red_blood_cells 
 [nan  1.  0.]
pus_cell 
 [ 1.  0. nan]
pus_cell_clumps 
 [ 0.  1. nan]
bacteria 
 [ 0.  1. nan]
hypertension 
 [ 1.  0. nan]
diabetes_mellitus 
 [ 1.  0. nan]
coronary_artery_disease 
 [ 0.  1. nan]
appetite 
 [ 1.  0. nan]
pedal_edema 
 [ 0.  1. nan]
anemia 
 [ 0.  1. nan]
class 
 [1 0]


In [244]:
## save to csv
# The row index is not written (index=False).
df_numeric.to_csv('processed/df_numeric.csv', index=False)

# Output of this notebook

### df
`df` (saved as `processed/df_raw.csv`) contains the original dataset with descriptive column names. In the intermediate frame `df_cat_preserved`, which is derived from it, the "yes"/"no" columns and the class variable are replaced by booleans.

### df_encoded
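We created a dataset with encoded categorical variables (saved as `processed/df_encoded.csv`). It contains the categorical variables as one-hot encodings, with an additional `_nan` indicator column per variable for missing values; the boolean `class` column is kept as a single column.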

### df_numeric
We created a dataset in which all variables are represented numerically (saved as `processed/df_numeric.csv`). The binary categorical variables are encoded as 0 and 1, while the remaining categorical variables (specific gravity, albumin, sugar) already had numeric labels and are kept as they are.
