# Building a CKD Classifier for IgAN Patients

### Steps To Building Model
1. Load Data
2. Clean Data
3. Split Data
4. Build Model
5. Export cleaned data

### Loading Data
To perform this step, the CKD dataset must be downloaded and moved to the working directory

The dataset can be found here ---> http://archive.ics.uci.edu/ml//datasets/Chronic_Kidney_Disease

In [1]:
import arff
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

data_path = 'Chronic_Kidney_Disease/chronic_kidney_disease.arff'

with open(data_path, 'r') as file:
    data = file.read()
    
# remove double commas and trailing commas
data = data.replace(',,', ',');
data = data.replace(',\n', '\n');
    

with open(data_path, 'w') as file:
    file.write(data)


In [2]:
data = arff.load(open(data_path, 'r'))

ckd_df = pd.DataFrame(data['data'])

ckd_df.columns = [x[0] for x in data['attributes']]
ckd_df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1,0,,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4,0,,normal,notpresent,notpresent,,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2,3,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4,0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2,0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd


### Convert Data to numerical representation


In [3]:
class_columns = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'class']

for col in class_columns:
    ckd_df[col] = pd.factorize(ckd_df[col])[0]


ckd_df = ckd_df.fillna(0)

ckd_df_filtered = ckd_df[['age','bp','sg','al','su','bgr','sc','sod','hemo', 'htn','class']]
ckd_df_simplified = ckd_df[['age','bp','sg','sc', 'htn','class']]

ckd_df_filtered.head()

Unnamed: 0,age,bp,sg,al,su,bgr,sc,sod,hemo,htn,class
0,48.0,80.0,1.02,1,0,121.0,1.2,0.0,15.4,0,0
1,7.0,50.0,1.02,4,0,0.0,0.8,0.0,11.3,1,0
2,62.0,80.0,1.01,2,3,423.0,1.8,0.0,9.6,1,0
3,48.0,70.0,1.005,4,0,117.0,3.8,111.0,11.2,0,0
4,51.0,80.0,1.01,2,0,106.0,1.4,0.0,11.6,1,0


In [4]:
ckd_df_filtered['hemo'].astype(float).min()


0.0

### Split Data into Training and Test Data

In [5]:
from sklearn.model_selection import train_test_split
test_size = 0.3
train, test = train_test_split(ckd_df_filtered, test_size=test_size)

In [6]:
x_train = train.iloc[:,:-1].to_numpy()
y_train = train.iloc[:,-1].to_numpy()

x_test = test.iloc[:,:-1].to_numpy()
y_test = test.iloc[:,-1].to_numpy()

x_train = x_train.astype('float32')
y_train = y_train.astype('float32')
x_test = x_test.astype('float32')
y_test = y_test.astype('float32')



In [7]:
y_train.shape

(280,)

### Build Random Forest Classifier

In [8]:
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, normalize


rf_clf = RandomForestClassifier()

rf_clf.fit(x_train, y_train)

y_predict_train = rf_clf.predict(x_train)
y_predict_test = rf_clf.predict(x_test)





In [9]:

train_accuracy = accuracy_score(y_train, y_predict_train)


test_accuracy = accuracy_score(y_test, y_predict_test)


feature_importance = rf_clf.feature_importances_

feature_importance = rf_clf.feature_importances_
sorted_indices = np.argsort(feature_importance)



parameters = {
    'max_depth': [2, 8, 16],
    'n_estimators': [4, 16, 256]
}
gscv_rfc = GridSearchCV(rf_clf, param_grid=parameters)
gscv_rfc.fit(x_train, y_train)


best_params = gscv_rfc.best_params_




best_score = gscv_rfc.best_score_


### Get accuracy from test dataset

In [10]:
# from parameter tuning
best_score

0.9964285714285716