In [111]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import  classification_report,  ConfusionMatrixDisplay
from matplotlib.pylab import rcParams
#!pip install liac-arff
import arff

## Preprocessing

### Replacing strings in the arff and creating a df

Especially this line is important:

`line.replace('	', '').replace(',,',',').replace('ckd,\n', 'ckd\n').replace(' yes','yes')`

otherwise, the file could not be opened

In [112]:
fin = open("data/chronic_kidney_disease_full.arff", "rt")
#output file to write the result to
fout = open("preprocessed/chronic_kidney_disease_full.arff", "wt")
#for each line in the input file
for line in fin:
	#read replace the string and write to output file
	fout.write(line.replace('	', '').replace(',,',',').replace('ckd,\n', 'ckd\n').replace(' ckd', 'ckd').replace(' yes','yes'))
#close input and output files
fin.close()

dataset = arff.load(open("preprocessed/chronic_kidney_disease_full.arff", "r"))
print(dataset['description'])

1. Title: Early stage of Indians Chronic Kidney Disease(CKD)

2. Source Information:
  (a) Source:
Dr.P.Soundarapandian.M.D.,D.M
    (Senior Consultant Nephrologist),
Apollo  Hospitals,
Managiri,
Madurai Main Road,
Karaikudi,
Tamilnadu,
India.
  (b) Creator:
L.Jerlin Rubini(Research Scholar)
Alagappa University
EmailId   :jel.jerlin@gmail.com
ContactNo :+91-9597231281
  (c) Guided by:
Dr.P.Eswaran Assistant Professor,
Department of Computer Science and Engineering,
Alagappa University,
Karaikudi,
Tamilnadu,
India.
Emailid:eswaranperumal@gmail.com
  (d) Date     : july 2015

3.Relevant Information:
age-age
bp-blood pressure
sg-specific gravity
al-   albumin
su-sugar
rbc-red blood cells
pc-pus cell
pcc-pus cell clumps
ba-bacteria
bgr-blood glucose random
bu-blood urea
sc-serum creatinine
sod-sodium
pot-potassium
hemo-hemoglobin
pcv-packed cell volume
wc-white blood cell count
rc-red blood cell count
htn-hypertension
dm-diabetes mellitus
cad-coronary artery disease
appet-appetite
pe-pedal

In [113]:
numeric_columns = []
categorical_columns = []
column_names = []

for idx, attr in enumerate(dataset['attributes']):
    column_names.append(attr[0])
    if attr[1] == 'NUMERIC':
        numeric_columns.append(attr[0])
    else:
        categorical_columns.append(attr[0])
    print(attr[0], attr[1])
    

print('\nnumeric:',numeric_columns)
print('categorical:',categorical_columns)

age NUMERIC
bp NUMERIC
sg ['1.005', '1.010', '1.015', '1.020', '1.025']
al ['0', '1', '2', '3', '4', '5']
su ['0', '1', '2', '3', '4', '5']
rbc ['normal', 'abnormal']
pc ['normal', 'abnormal']
pcc ['present', 'notpresent']
ba ['present', 'notpresent']
bgr NUMERIC
bu NUMERIC
sc NUMERIC
sod NUMERIC
pot NUMERIC
hemo NUMERIC
pcv NUMERIC
wbcc NUMERIC
rbcc NUMERIC
htn ['yes', 'no']
dm ['yes', 'no']
cad ['yes', 'no']
appet ['good', 'poor']
pe ['yes', 'no']
ane ['yes', 'no']
class ['ckd', 'notckd']

numeric: ['age', 'bp', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc']
categorical: ['sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'class']


In [114]:
df = pd.DataFrame(dataset['data'], columns=column_names)
df.to_csv('preprocessed/df_raw.csv', index=False)  
df

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.020,1,0,,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.020,4,0,,normal,notpresent,notpresent,,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.010,2,3,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4,0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.010,2,0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0,0,normal,normal,notpresent,notpresent,140.0,...,47.0,6700.0,4.9,no,no,no,good,no,no,notckd
396,42.0,70.0,1.025,0,0,normal,normal,notpresent,notpresent,75.0,...,54.0,7800.0,6.2,no,no,no,good,no,no,notckd
397,12.0,80.0,1.020,0,0,normal,normal,notpresent,notpresent,100.0,...,49.0,6600.0,5.4,no,no,no,good,no,no,notckd
398,17.0,60.0,1.025,0,0,normal,normal,notpresent,notpresent,114.0,...,51.0,7200.0,5.9,no,no,no,good,no,no,notckd


In [115]:
def encode(df: pd.DataFrame):
    dataframe = df.copy()
    dataframe.replace('yes', True, inplace=True) # ['htn', 'dm', 'cad', 'pe', 'ane']
    dataframe.replace('no', False, inplace=True) # ['htn', 'dm', 'cad', 'pe', 'ane']

    dataframe.replace('present', True, inplace=True) # ['pcc', 'ba']
    dataframe.replace('notpresent', False, inplace=True) # ['pcc', 'ba']

    dataframe.replace('good', True, inplace=True) # ['appet']
    dataframe.replace('poor', False, inplace=True) # ['appet']

    dataframe.replace('ckd', True, inplace=True) # ['class']
    dataframe.replace('notckd', False, inplace=True) # ['class']
    
    dataframe.replace('normal', True, inplace=True) # ['rbc', 'pc']
    dataframe.replace('abnormal', False, inplace=True) # ['rbc', 'pc']
    
    # dataframe['sg'].replace('1.005', 'A: 1.005', inplace=True)
    # dataframe['sg'].replace('1.010', 'B: 1.010', inplace=True)
    # dataframe['sg'].replace('1.015', 'C: 1.015', inplace=True)
    # dataframe['sg'].replace('1.020', 'D: 1.020', inplace=True)
    # dataframe['sg'].replace('1.025', 'E: 1.025', inplace=True)
    
    # dataframe['al'].replace('0', 'A: 0', inplace=True)
    # dataframe['al'].replace('1', 'B: 1', inplace=True)
    # dataframe['al'].replace('2', 'C: 2', inplace=True)
    # dataframe['al'].replace('3', 'D: 3', inplace=True)
    # dataframe['al'].replace('4', 'E: 4', inplace=True)
    
    # dataframe['su'].replace('0', 'A: 0', inplace=True)
    # dataframe['su'].replace('1', 'B: 1', inplace=True)
    # dataframe['su'].replace('2', 'C: 2', inplace=True)
    # dataframe['su'].replace('3', 'D: 3', inplace=True)
    # dataframe['su'].replace('4', 'E: 4', inplace=True)
    
    return dataframe

df_encoded = encode(df)
df_encoded


def print_range_of_values(dataframe):
    for col in dataframe.columns:
        print(col, '\t',dataframe[col].unique())


print_range_of_values(df_encoded[categorical_columns])

sg 	 ['1.020' '1.010' '1.005' '1.015' None '1.025']
al 	 ['1' '4' '2' '3' '0' None '5']
su 	 ['0' '3' '4' '1' None '2' '5']
rbc 	 [None True False]
pc 	 [True False None]
pcc 	 [False True None]
ba 	 [False True None]
htn 	 [True False None]
dm 	 [True False None]
cad 	 [False True None]
appet 	 [True False None]
pe 	 [False True None]
ane 	 [False True None]
class 	 [ True False]


### Exporting the df to csv
We need to make sure that the categorical columns are not confused with numeric columns. Therefore, we provide datatypes for the columns.

In [116]:
import csv

df_encoded.to_csv('preprocessed/df.csv', index=False)

df_check = pd.read_csv('preprocessed/df.csv')
df_check

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.020,1.0,0.0,,True,False,False,121.0,...,44.0,7800.0,5.2,True,True,False,True,False,False,True
1,7.0,50.0,1.020,4.0,0.0,,True,False,False,,...,38.0,6000.0,,False,False,False,True,False,False,True
2,62.0,80.0,1.010,2.0,3.0,True,True,False,False,423.0,...,31.0,7500.0,,False,True,False,False,False,True,True
3,48.0,70.0,1.005,4.0,0.0,True,False,True,False,117.0,...,32.0,6700.0,3.9,True,False,False,False,True,True,True
4,51.0,80.0,1.010,2.0,0.0,True,True,False,False,106.0,...,35.0,7300.0,4.6,False,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,True,True,False,False,140.0,...,47.0,6700.0,4.9,False,False,False,True,False,False,False
396,42.0,70.0,1.025,0.0,0.0,True,True,False,False,75.0,...,54.0,7800.0,6.2,False,False,False,True,False,False,False
397,12.0,80.0,1.020,0.0,0.0,True,True,False,False,100.0,...,49.0,6600.0,5.4,False,False,False,True,False,False,False
398,17.0,60.0,1.025,0.0,0.0,True,True,False,False,114.0,...,51.0,7200.0,5.9,False,False,False,True,False,False,False
