In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import  classification_report,  ConfusionMatrixDisplay
from matplotlib.pylab import rcParams
#pip install liac-arff
import arff


# Exploratory data analysis
## Preprocessing
### Replacing strings in the arff

The following line is especially important:

`line.replace('\t', '').replace(',,',',').replace('ckd,\n', 'ckd\n').replace(' yes','yes')`

Without these replacements, the ARFF file cannot be parsed.

In [13]:
# The raw ARFF file cannot be parsed as-is, so a cleaned copy is written first.
# Each line has tab characters removed, doubled commas collapsed, the trailing
# comma after the 'ckd' label dropped and the leading space before 'yes' removed.
os.makedirs("preprocessed", exist_ok=True)

# The context manager closes (and therefore flushes) BOTH files before the
# cleaned copy is read back below; previously the output file was never closed.
with open("data/chronic_kidney_disease_full.arff", "rt") as fin, \
        open("preprocessed/chronic_kidney_disease_full.arff", "wt") as fout:
    for line in fin:
        fout.write(
            line.replace('\t', '')
            .replace(',,', ',')
            .replace('ckd,\n', 'ckd\n')
            .replace(' yes', 'yes')
        )

# Load the cleaned file; the handle is released as soon as parsing is done.
with open("preprocessed/chronic_kidney_disease_full.arff", "r") as cleaned:
    dataset = arff.load(cleaned)
print(dataset['description'])

1. Title: Early stage of Indians Chronic Kidney Disease(CKD)

2. Source Information:
  (a) Source:
Dr.P.Soundarapandian.M.D.,D.M
    (Senior Consultant Nephrologist),
Apollo  Hospitals,
Managiri,
Madurai Main Road,
Karaikudi,
Tamilnadu,
India.
  (b) Creator:
L.Jerlin Rubini(Research Scholar)
Alagappa University
EmailId   :jel.jerlin@gmail.com
ContactNo :+91-9597231281
  (c) Guided by:
Dr.P.Eswaran Assistant Professor,
Department of Computer Science and Engineering,
Alagappa University,
Karaikudi,
Tamilnadu,
India.
Emailid:eswaranperumal@gmail.com
  (d) Date     : july 2015

3.Relevant Information:
age-age
bp-blood pressure
sg-specific gravity
al-   albumin
su-sugar
rbc-red blood cells
pc-pus cell
pcc-pus cell clumps
ba-bacteria
bgr-blood glucose random
bu-blood urea
sc-serum creatinine
sod-sodium
pot-potassium
hemo-hemoglobin
pcv-packed cell volume
wc-white blood cell count
rc-red blood cell count
htn-hypertension
dm-diabetes mellitus
cad-coronary artery disease
appet-appetite
pe-pedal

## Investigating the attributes

In [53]:
# Split the ARFF attributes into numeric and categorical column names.
# Each attribute is a (name, type) pair: the type is either a type-name string
# or, for nominal attributes, the list of allowed values.
numeric_types = {'NUMERIC', 'REAL', 'INTEGER'}  # ARFF type names holding numbers

numeric_columns = []
categorical_columns = []
column_names = []

for name, attr_type in dataset['attributes']:
    column_names.append(name)
    # The isinstance guard keeps nominal value lists (unhashable) out of the
    # set lookup; anything that is not a numeric type name is categorical.
    if isinstance(attr_type, str) and attr_type in numeric_types:
        numeric_columns.append(name)
    else:
        categorical_columns.append(name)
    print(name, attr_type)


print('\nnumeric:', numeric_columns)
print('categorical:', categorical_columns)

age NUMERIC
bp NUMERIC
sg ['1.005', '1.010', '1.015', '1.020', '1.025']
al ['0', '1', '2', '3', '4', '5']
su ['0', '1', '2', '3', '4', '5']
rbc ['normal', 'abnormal']
pc ['normal', 'abnormal']
pcc ['present', 'notpresent']
ba ['present', 'notpresent']
bgr NUMERIC
bu NUMERIC
sc NUMERIC
sod NUMERIC
pot NUMERIC
hemo NUMERIC
pcv NUMERIC
wbcc NUMERIC
rbcc NUMERIC
htn ['yes', 'no']
dm ['yes', 'no']
cad ['yes', 'no']
appet ['good', 'poor']
pe ['yes', 'no']
ane ['yes', 'no']
class ['ckd', 'notckd']

numeric: ['age', 'bp', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc']
categorical: ['sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'class']


In [54]:
# Assemble the decoded ARFF rows into a DataFrame whose columns are labelled
# with the attribute names held in column_names.
df = pd.DataFrame(data=dataset['data'], columns=column_names)

# Bare last expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.020,1,0,,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.020,4,0,,normal,notpresent,notpresent,,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.010,2,3,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4,0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.010,2,0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0,0,normal,normal,notpresent,notpresent,140.0,...,47.0,6700.0,4.9,no,no,no,good,no,no,notckd
396,42.0,70.0,1.025,0,0,normal,normal,notpresent,notpresent,75.0,...,54.0,7800.0,6.2,no,no,no,good,no,no,notckd
397,12.0,80.0,1.020,0,0,normal,normal,notpresent,notpresent,100.0,...,49.0,6600.0,5.4,no,no,no,good,no,no,notckd
398,17.0,60.0,1.025,0,0,normal,normal,notpresent,notpresent,114.0,...,51.0,7200.0,5.9,no,no,no,good,no,no,notckd


### None values in categorical attributes

In [56]:
# Print the distinct values of every categorical column (missing entries
# show up as None in the printed arrays).
for col, values in df[categorical_columns].items():
    print(col, '\t', values.unique())
        

sg 	 ['1.020' '1.010' '1.005' '1.015' None '1.025']
al 	 ['1' '4' '2' '3' '0' None '5']
su 	 ['0' '3' '4' '1' None '2' '5']
rbc 	 [None 'normal' 'abnormal']
pc 	 ['normal' 'abnormal' None]
pcc 	 ['notpresent' 'present' None]
ba 	 ['notpresent' 'present' None]
htn 	 ['yes' 'no' None]
dm 	 ['yes' 'no' None]
cad 	 ['no' 'yes' None]
appet 	 ['good' 'poor' None]
pe 	 ['no' 'yes' None]
ane 	 ['no' 'yes' None]
class 	 ['ckd' 'notckd']


### NaN values in numerical attributes


In [57]:
# Report, per numeric column, how many entries are missing (NaN).
for col in numeric_columns:
    missing = int(df[col].isna().sum())
    print(col, '\t', missing, '\tNan values')

age 	 9 	Nan values
bp 	 12 	Nan values
bgr 	 44 	Nan values
bu 	 19 	Nan values
sc 	 17 	Nan values
sod 	 87 	Nan values
pot 	 88 	Nan values
hemo 	 52 	Nan values
pcv 	 71 	Nan values
wbcc 	 106 	Nan values
rbcc 	 131 	Nan values


- `wbcc` (white blood cell count) has 106 NaN values out of 400 rows in total
- `rbcc` (red blood cell count) has 131 NaN values out of 400 rows in total
  

## Exploratory data analysis

In [58]:
# Summary statistics for every column: include='all' covers the non-numeric
# columns (count/unique/top/freq) as well as the numeric ones (mean, std, ...).
df.describe(include='all')

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
count,391.0,388.0,353.0,354.0,351.0,248,335,396,396,356.0,...,329.0,294.0,269.0,398,398,398,399,399,399,400
unique,,,5.0,6.0,6.0,2,2,2,2,,...,,,,2,2,2,2,2,2,2
top,,,1.02,0.0,0.0,normal,normal,notpresent,notpresent,,...,,,,no,no,no,good,no,no,ckd
freq,,,106.0,199.0,290.0,201,259,354,374,,...,,,,251,261,364,317,323,339,250
mean,51.483376,76.469072,,,,,,,,148.036517,...,38.884498,8406.122449,4.707435,,,,,,,
std,17.169714,13.683637,,,,,,,,79.281714,...,8.990105,2944.47419,1.025323,,,,,,,
min,2.0,50.0,,,,,,,,22.0,...,9.0,2200.0,2.1,,,,,,,
25%,42.0,70.0,,,,,,,,99.0,...,32.0,6500.0,3.9,,,,,,,
50%,55.0,80.0,,,,,,,,121.0,...,40.0,8000.0,4.8,,,,,,,
75%,64.5,80.0,,,,,,,,163.0,...,45.0,9800.0,5.4,,,,,,,
