In [112]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA, TruncatedSVD, FactorAnalysis
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics.cluster import rand_score
from ucimlrepo import fetch_ucirepo 

### 1. Classification Problem Identification

Chronic Kidney Disease (CKD) is the progressive loss of kidney function (@sanmarchi2023predict). The progression of the disease can be attributed to a number of factors. CKD has major implications, both for the quality of life of those afflicted and for the sustainability of national health systems (@sanmarchi2023predict). Thus, it is important to use the information available about patient health to predict the diagnosis of CKD.

The dataset from the Early Stage of Indians Chronic Kidney Disease (CKD) project (@misc_chronic_kidney_disease_336) contains information on a variety of health factors, including blood pressure, red blood cell count, albumin concentration, blood sugar, whether or not one has diabetes mellitus, and more.

This dataset will be used to predict whether an individual will develop CKD. 

In [113]:
# Fetch UCI dataset id=336 through ucimlrepo (needs network access).
chronic_kidney_disease = fetch_ucirepo(id=336)
x = chronic_kidney_disease.data.features  # predictor columns
y = chronic_kidney_disease.data.targets   # target column

ckd = pd.DataFrame(x)
target = pd.DataFrame(y)

# Place predictors and target side by side; pd.concat already returns a
# DataFrame, so no further wrapping is needed.
df = pd.concat([ckd, target], axis=1)

df.head(10)


Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd
5,60.0,90.0,1.015,3.0,0.0,,,notpresent,notpresent,74.0,...,39.0,7800.0,4.4,yes,yes,no,good,yes,no,ckd
6,68.0,70.0,1.01,0.0,0.0,,normal,notpresent,notpresent,100.0,...,36.0,,,no,no,no,good,no,no,ckd
7,24.0,,1.015,2.0,4.0,normal,abnormal,notpresent,notpresent,410.0,...,44.0,6900.0,5.0,no,yes,no,good,yes,no,ckd
8,52.0,100.0,1.015,3.0,0.0,normal,abnormal,present,notpresent,138.0,...,33.0,9600.0,4.0,yes,yes,no,good,no,yes,ckd
9,53.0,90.0,1.02,2.0,0.0,abnormal,abnormal,present,notpresent,70.0,...,29.0,12100.0,3.7,yes,yes,no,poor,no,yes,ckd


The `ckd` dataframe only contains the variables that we will be using to predict `class`.
The `target` dataframe contains what we are predicting (`class`).

### 2. Variable Transformation

In [114]:
# List each column's dtype to tell numeric (float64) from categorical (object) variables.
df.dtypes

age      float64
bp       float64
sg       float64
al       float64
su       float64
rbc       object
pc        object
pcc       object
ba        object
bgr      float64
bu       float64
sc       float64
sod      float64
pot      float64
hemo     float64
pcv      float64
wbcc     float64
rbcc     float64
htn       object
dm        object
cad       object
appet     object
pe        object
ane       object
class     object
dtype: object

Based on the variable types, there are some variables that are numerical, while others are categorical variables. 

In terms of variable transformations, we need to scale and standardize all continuous variables (float64) and also convert categorical variables to numerical. This should be done after removing missing values.

Any other variable transformations should not be applied in this case as the data set's numerical variables hold meaning as their original values.

In [111]:
# Encode the categorical (object) columns as numeric codes in a separate frame,
# `df_encoded`, leaving `df` untouched: the overview cells further down still
# rely on the original string labels (`describe(include=object)`, `.str` fixes).
cat_columns = df.select_dtypes(include=['object']).columns

df_encoded = df.copy()
for col in cat_columns:
    # Strip stray whitespace first so variants such as '\tno' and 'no' share one code.
    labels = df[col].str.strip()
    codes = labels.astype('category').cat.codes
    # `.cat.codes` labels missing values as -1; keep them missing (NaN) so they
    # are not mistaken for a real category.
    df_encoded[col] = codes.astype('float64').where(codes >= 0)

After this step, every variable that was stored as an object (string labels) is represented by numeric category codes.

### 3. Dataset Overview

In [116]:
# (rows, columns) of the combined predictors + target frame.
df.shape

(400, 25)

In [117]:
# Column dtypes of `df` for the overview section.
# NOTE(review): this repeats the earlier `df.dtypes` cell — consider keeping only one.
df.dtypes

age      float64
bp       float64
sg       float64
al       float64
su       float64
rbc       object
pc        object
pcc       object
ba        object
bgr      float64
bu       float64
sc       float64
sod      float64
pot      float64
hemo     float64
pcv      float64
wbcc     float64
rbcc     float64
htn       object
dm        object
cad       object
appet     object
pe        object
ane       object
class     object
dtype: object

In [118]:
# Summary statistics for the numeric columns; a count below the number of rows
# means the column has missing values.
df.describe()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc
count,391.0,388.0,353.0,354.0,351.0,356.0,381.0,383.0,313.0,312.0,348.0,329.0,294.0,269.0
mean,51.483376,76.469072,1.017408,1.016949,0.450142,148.036517,57.425722,3.072454,137.528754,4.627244,12.526437,38.884498,8406.122449,4.707435
std,17.169714,13.683637,0.005717,1.352679,1.099191,79.281714,50.503006,5.741126,10.408752,3.193904,2.912587,8.990105,2944.47419,1.025323
min,2.0,50.0,1.005,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1,9.0,2200.0,2.1
25%,42.0,70.0,1.01,0.0,0.0,99.0,27.0,0.9,135.0,3.8,10.3,32.0,6500.0,3.9
50%,55.0,80.0,1.02,0.0,0.0,121.0,42.0,1.3,138.0,4.4,12.65,40.0,8000.0,4.8
75%,64.5,80.0,1.02,2.0,0.0,163.0,66.0,2.8,142.0,4.9,15.0,45.0,9800.0,5.4
max,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8,54.0,26400.0,8.0


In [119]:
# Summary of the object-dtype columns: non-null count, number of distinct
# labels, most frequent label and how often it occurs.
df.describe(include=object)

Unnamed: 0,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane,class
count,248,335,396,396,398,398,398,399,399,399,400
unique,2,2,2,2,2,3,2,2,2,2,3
top,normal,normal,notpresent,notpresent,no,no,no,good,no,no,ckd
freq,201,259,354,374,251,260,364,317,323,339,248


In [120]:
# Frequency of each label in `dm`, to inspect why a yes/no column reports 3 unique labels.
df.dm.value_counts()

dm
no      260
yes     137
\tno      1
Name: count, dtype: int64

In [130]:
# `dm` should be a yes/no column, but one entry was recorded as '\tno'.
# Stripping surrounding whitespace folds it (and any similar variant) into 'no'.
dm = df['dm'].str.strip()
df['dm'] = dm

# Confirm that only the 'no' / 'yes' labels remain.
df['dm'].value_counts()

dm
no     261
yes    137
Name: count, dtype: int64

In [None]:
# Distribution of a numeric variable from the CKD data.
# The previous version plotted `Hitters['Salary']`, but no `Hitters` frame is
# defined in this notebook, so the cell raised a NameError.
# NOTE(review): `age` is used as a representative numeric column — swap in
# whichever variable this plot was meant to show.
fig, ax = plt.subplots()
# Drop missing values first; the histogram range cannot be computed with NaNs present.
ax.hist(df['age'].dropna(), bins=10, edgecolor='black')
ax.set_title('Histogram of age')
ax.set_xlabel('age')
ax.set_ylabel('Frequency')
plt.show()

There are 400 observations and 24 predictor variables (plus the target, `class`), of which 14 are numerical and 10 are nominal (listed above). The `dm` variable, which indicates whether an individual has diabetes mellitus, contained a data-entry error: one entry was recorded as '\tno' instead of 'no', so we corrected it to 'no'. Many variables also have missing values, which is why their counts are less than 400.

### 4. Association between Variables

M: 4,6,7
N: 2,3,5