## Autoencoder for Kidney Disease Detection
The objetive of this proyect is to 

# Preprocessing

In [2]:
##Source: https://archive.ics.uci.edu/ml/datasets/Chronic_Kidney_Disease
#:*
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc,accuracy_score,f1_score,recall_score,precision_score,roc_auc_score,roc_curve,confusion_matrix,ConfusionMatrixDisplay
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
#from math import gcd
#from neupy import algorithms

In [3]:
#Reading the content of the dataset and removing the id column
df = pd.read_csv('kidney_disease.csv')
df.drop('id',axis=1,inplace=True)
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,38,6000,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [4]:
#Getting the list of categorical and numerical values which number matches the one in the dataset info page
categorical_mask = df.dtypes == object
categorical_columns = df.columns[categorical_mask].tolist()
numerical_columns = df.columns[~categorical_mask].tolist()
print("Number of numeric columns {}: {}".format(len(numerical_columns),numerical_columns))
print("Number of categorical columns {}: {}".format(len(categorical_columns),categorical_columns))

Number of numeric columns 11: ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo']
Number of categorical columns 14: ['rbc', 'pc', 'pcc', 'ba', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'classification']


In [5]:
#Making sure that categorical values don't have any space or tab before or after their values
df[categorical_columns] = df[categorical_columns].apply(lambda x: x.str.strip())

In [6]:
#Checking which how many values are null in each column
df.isnull().sum()

age                 9
bp                 12
sg                 47
al                 46
su                 49
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sc                 17
sod                87
pot                88
hemo               52
pcv                70
wc                105
rc                130
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64

In [7]:
#Imputing numerical columns using mean as strategy
imputer_num_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
df[numerical_columns] = imputer_num_mean.fit_transform(df[numerical_columns])

#Imputing categorical columns using most_frequent as strategy
imputer_cat_most_freq = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df[categorical_columns] = imputer_cat_most_freq.fit_transform(df[categorical_columns])

df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.02,1.0,0.0,normal,normal,notpresent,notpresent,121.0,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,normal,normal,notpresent,notpresent,148.036517,...,38,6000,5.2,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31,7500,5.2,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [8]:
#Confirming there are no null values
df.isnull().sum()

age               0
bp                0
sg                0
al                0
su                0
rbc               0
pc                0
pcc               0
ba                0
bgr               0
bu                0
sc                0
sod               0
pot               0
hemo              0
pcv               0
wc                0
rc                0
htn               0
dm                0
cad               0
appet             0
pe                0
ane               0
classification    0
dtype: int64

In [9]:
#After reviewing in detail each column, we found that these three are numerical but somehow are marked as categorical
df[['pcv','wc','rc']].describe()

Unnamed: 0,pcv,wc,rc
count,400,400,400.0
unique,43,90,49.0
top,41,9800,5.2
freq,91,116,148.0


In [10]:
#Found some missing values in these columns as '?' so we are imputing those as well and changing the columns to be numerical
df[['pcv','wc','rc']] = df[['pcv','wc','rc']].replace('?',np.nan).astype(float)
imputer_num_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
df[['pcv','wc','rc']] = imputer_num_mean.fit_transform(df[['pcv','wc','rc']])

In [11]:
#After inputing and adding changing it to float
df[['pcv','wc','rc']].describe()

Unnamed: 0,pcv,wc,rc
count,400.0,400.0,400.0
mean,39.255639,8772.932331,4.86792
std,8.190698,2596.80142,0.871448
min,9.0,2200.0,2.1
25%,34.0,6975.0,4.5
50%,41.0,9400.0,5.2
75%,44.0,9800.0,5.2
max,54.0,26400.0,8.0


In [12]:
#Checking that the target variable (classification) is not binary due to two outliers
df['classification'].value_counts()

ckd       250
notckd    150
Name: classification, dtype: int64

In [13]:
#Fixing the issue and making sure the counts are as expected
df['classification'].replace('ckd\t','ckd',inplace=True)
df['classification'].value_counts()

df.to_csv('cleaned_kidney.csv')