In [None]:
# import libraries

import os

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier

import warnings
# Silence every warning for the rest of the session. With no category given this
# also hides pandas deprecation / FutureWarning messages.
# NOTE(review): consider narrowing the filter so real problems stay visible.
warnings.filterwarnings('ignore')

In [None]:
# Work from the "res" directory so the relative CSV path below resolves.
# Guarded so that re-running this cell does not try to enter a nested
# "res/res" directory and raise FileNotFoundError.
if os.path.basename(os.getcwd()) != "res":
    os.chdir("res")

In [None]:
# Load the raw CSV and discard its `id` column in a single expression.
df = pd.read_csv('kidney_disease.csv').drop(columns=['id'])

In [None]:
# Show the current column labels (they are replaced in the next cell).
df.columns

In [None]:
# Replace the existing column labels with descriptive snake_case names.
# The assignment is positional, so the order below must match the column order
# of the frame (after `id` has been dropped). Later cells refer to these exact
# spellings, so they must not be altered here alone.
renamed_columns = [
    'age',
    'blood_pressure',
    'spesific_gravity',
    'albumin',
    'sugar',
    'red_blood_cell',
    'puss_cell',
    'puss_cell_clumbs',
    'bacteria',
    'blood_glucose_random',
    'blood_urea',
    'serum_creatine',
    'sodium',
    'potassium',
    'hemoglobin',
    'packet_cell_volume',
    'white_blood_cell_count',
    'red_blood_cell_count',
    'hyper_tension',
    'diabetes_mellitus',
    'coronary_artert_disease',
    'appetite',
    'peda_edema',
    'anemia',
    'class',
]
df.columns = renamed_columns

In [None]:
# Preview the first rows under the new column names.
df.head()

In [None]:
# Print dtypes and non-null counts per column.
df.info()

In [None]:
def convert2numeric(col, frame=None):
    """Replace a column with its numeric conversion.

    Values that cannot be parsed as numbers become NaN (``errors='coerce'``).

    Parameters
    ----------
    col : str
        Name of the column to convert.
    frame : pandas.DataFrame, optional
        DataFrame whose column is overwritten. When omitted, the
        notebook-level ``df`` is used, which is what a one-argument call
        has always operated on.
    """
    target = df if frame is None else frame
    target[col] = pd.to_numeric(target[col], errors='coerce')

# Columns that are passed through convert2numeric below.
to_numeric_list = [
    'packet_cell_volume',
    'white_blood_cell_count',
    'red_blood_cell_count',
]

for numeric_candidate in to_numeric_list:
    convert2numeric(numeric_candidate)

In [None]:
# Split the columns by dtype: everything non-numeric is treated as categorical.
# `is_numeric_dtype` is used instead of comparing against 'object' so that text
# columns are still recognised when pandas stores them with a dedicated string
# dtype rather than `object`.
# `class` still holds text labels at this point, so it lands in cat_columns.
cat_columns = [col for col in df.columns if not pd.api.types.is_numeric_dtype(df[col])]
num_columns = [col for col in df.columns if pd.api.types.is_numeric_dtype(df[col])]

# List the distinct values of each categorical column to spot dirty labels.
for col in cat_columns:
    print(f"{col} : {df[col].unique()}")

# Dirty labels noted from the output of the loop above:
#   diabetes_mellitus : ['yes' 'no' ' yes' '\tno' '\tyes' nan]
#   coronary_artert_disease : ['no' 'yes' '\tno' nan]
#   class : ['ckd' 'ckd\t' 'notckd']

# Maps labels polluted with stray whitespace to their canonical spelling.
clean_dict = {' yes' : 'yes',
                 '\tno' : 'no',
                 '\tyes' : 'yes',
                 'ckd\t' : 'ckd' }

def normalize_data(col, frame=None):
    """Replace the dirty labels listed in ``clean_dict`` within one column.

    Values that are not keys of ``clean_dict`` (including NaN) are left as is.

    Parameters
    ----------
    col : str
        Name of the column to clean.
    frame : pandas.DataFrame, optional
        DataFrame whose column is overwritten. When omitted, the
        notebook-level ``df`` is used, which is what a one-argument call
        has always operated on.
    """
    target = df if frame is None else frame
    # Assign the result back rather than calling replace(..., inplace=True) on
    # target[col]: that form mutates an intermediate Series, which under pandas
    # Copy-on-Write leaves the DataFrame itself unchanged.
    target[col] = target[col].replace(clean_dict)

# Clean the three columns in which whitespace-polluted labels were observed.
for dirty_col in ('class', 'diabetes_mellitus', 'coronary_artert_disease'):
    normalize_data(dirty_col)


In [None]:
# Display the `class` column to check its labels after normalize_data('class').
df['class']

In [None]:
# Encode the target as integers: 'notckd' -> 0, 'ckd' -> 1.
# Series.map turns every value that is not a key of the dict into NaN, so
# running this cell a second time (when the column already holds 0/1) would
# wipe the target. Only map while the column is still non-numeric so the cell
# can be re-run safely.
if not pd.api.types.is_numeric_dtype(df['class']):
    df['class'] = df['class'].map({'notckd' : 0, 'ckd' : 1})

# Preview a few rows instead of rendering the whole frame.
df.head()


In [None]:
# Histogram of every numeric column, five panels per row. The number of rows is
# derived from the column count so the grid never runs out of slots.
n_cols = 5
n_rows = max(1, (len(num_columns) + n_cols - 1) // n_cols)  # ceiling division

plt.figure(figsize=(15,15))
for num_plot, col in enumerate(num_columns, start=1):
    ax = plt.subplot(n_rows, n_cols, num_plot)
    sns.histplot(df[col], ax=ax)
    ax.set_xlabel(col)

plt.tight_layout()
plt.show()
    

In [None]:
# Display the `class` column again; the mapping cell above replaced its labels with 0/1.
df['class']

In [None]:
# Pairwise correlations between all numeric columns of the frame.
corr_matrix = df.select_dtypes(include=['number']).corr()

plt.figure(figsize=(15,15))
sns.heatmap(
    corr_matrix,
    annot=True,
    linewidths=2,
    linecolor='white',
    cbar=False,
)
plt.title("CORRELATION HEAT MAP")
plt.show()

# Values recorded by the author from an earlier run (presumably correlations
# with `class`). NOTE(review): re-check magnitudes and signs against the
# rendered heat map, since the signs depend on how `class` is encoded.
# hemoglobin = 0.77
# packet cell volume = 0.74
# spesific gravity = 0.73
# red blood cell count = 0.7
# albumin = -0.63


In [None]:
def draw_kde(col): # kernel density estimation
    """Draw the kernel density estimate of ``df[col]``, coloured by ``class``.

    Parameters
    ----------
    col : str
        Name of the column of the notebook-level ``df`` to plot.
    """
    facets = sns.FacetGrid(data=df, hue='class', height=6, aspect=2)
    facets.map(sns.kdeplot, col)
    facets.add_legend()


draw_kde('hemoglobin')

In [None]:
# Density of `packet_cell_volume`, one curve per `class` value.
draw_kde('packet_cell_volume')

In [None]:
# Density of `spesific_gravity`, one curve per `class` value.
draw_kde('spesific_gravity')

In [None]:
# Density of `red_blood_cell_count`, one curve per `class` value.
draw_kde('red_blood_cell_count')

In [None]:
# Density of `albumin`, one curve per `class` value.
# Author's note: this column has too many NaN values.
draw_kde('albumin')