In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the chronic-kidney-disease table. Column names are supplied explicitly
# through `names`, and the '?' placeholder is parsed as a missing value (NaN).
df = pd.read_csv(
    'chronic_kidney_disease.csv',
    names=['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'sc', 'sod',
           'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'class'],
    na_values='?',
)

# Only the age column is used in this notebook. Take an explicit copy so the
# columns added later are written to an independent frame rather than to a
# selection of the full table (this is what triggers SettingWithCopyWarning).
df = df[['age']].copy()

In [4]:
# Impute missing ages with the column mean.
# The column is coerced to numeric once, so the values that get filled are the
# same values the mean was computed from (anything non-numeric becomes NaN and
# is imputed as well).
age_numeric = pd.to_numeric(df['age'], errors='coerce')

# assign() returns a new frame instead of mutating in place, which keeps this
# cell idempotent when it is re-run.
df = df.assign(age=age_numeric.fillna(age_numeric.mean()))

In [5]:
# Discretise age into five labelled groups using fixed cut points.
# pd.cut's default intervals are right-closed: (0, 18] -> 'a', (18, 25] -> 'b',
# (25, 35] -> 'c', (35, 60] -> 'd', (60, 100] -> 'e'.
bins = [0, 18, 25, 35, 60, 100]
age_bin = pd.cut(df['age'], bins=bins, labels=list('abcde'))
age_bin

0      d
1      a
2      e
3      d
4      d
      ..
395    d
396    d
397    a
398    a
399    d
Name: age, Length: 400, dtype: category
Categories (5, object): [a < b < c < d < e]

In [6]:
# Number of rows per age group, most frequent first.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
age_bin.value_counts()

d    193
e    132
c     38
a     19
b     18
Name: age, dtype: int64

In [7]:
# Attach the age-group labels as a new column. assign() builds a new frame
# rather than writing into the existing one, so no SettingWithCopyWarning is
# raised and re-running the cell gives the same result.
df = df.assign(age_bin=age_bin)
df[['age', 'age_bin']]

Unnamed: 0,age,age_bin
0,48.0,d
1,7.0,a
2,62.0,e
3,48.0,d
4,51.0,d
...,...,...
395,55.0,d
396,42.0,d
397,12.0,a
398,17.0,a


In [8]:
# Quantile-based binning: split age at its quartiles (q=4) so that each of the
# four intervals holds roughly the same number of rows.
age_qbin = pd.qcut(df['age'], q=4)
age_qbin

0       (42.0, 54.0]
1      (1.999, 42.0]
2       (54.0, 64.0]
3       (42.0, 54.0]
4       (42.0, 54.0]
           ...      
395     (54.0, 64.0]
396    (1.999, 42.0]
397    (1.999, 42.0]
398    (1.999, 42.0]
399     (54.0, 64.0]
Name: age, Length: 400, dtype: category
Categories (4, interval[float64]): [(1.999, 42.0] < (42.0, 54.0] < (54.0, 64.0] < (64.0, 90.0]]

In [9]:
# Number of rows per quartile interval, most frequent first.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
age_qbin.value_counts()

(42.0, 54.0]     102
(1.999, 42.0]    102
(64.0, 90.0]      98
(54.0, 64.0]      98
Name: age, dtype: int64

In [10]:
# Smoothing by bin means (equal-frequency bins).
# The sorted ages are cut into consecutive bins of BIN_SIZE values
# (400 rows / 10 per bin = 40 bins for this dataset) and every value is
# replaced by the mean of its bin.
BIN_SIZE = 10

# Sorted copy of the ages; the boundary and median smoothing cells read `b` too.
b = np.sort(df.age.values)
if b.size % BIN_SIZE != 0:
    raise ValueError(
        f"number of rows ({b.size}) must be a multiple of the bin size ({BIN_SIZE})"
    )

# Keep the sorted ages next to the originals. NOTE: the smoothed columns line
# up row-by-row with `age_sorted`, not with the original `age` column.
df['age_sorted'] = b

# One row per bin; each row repeats that bin's mean BIN_SIZE times.
per_bin_mean = b.reshape(-1, BIN_SIZE).mean(axis=1, keepdims=True)
bin_mean = np.repeat(per_bin_mean, BIN_SIZE, axis=1)

# Flatten back to one smoothed value per (sorted) row.
df['bin_mean'] = bin_mean.ravel()

In [69]:
# Smoothing by bin boundaries: within each equal-frequency bin, every value is
# replaced by whichever bin boundary (minimum or maximum) it is closer to.
# Relies on `b`, the sorted age array built in the bin-mean cell.
bin_size = 10  # values per bin; len(b) must be a multiple of this

sorted_bins = b.reshape(-1, bin_size)  # one row per bin
lower = sorted_bins[:, [0]]            # bin minimum, shape (n_bins, 1)
upper = sorted_bins[:, [-1]]           # bin maximum, shape (n_bins, 1)

# Strictly closer to the lower boundary -> lower; ties and everything else
# snap to the upper boundary.
bin_bound = np.where(sorted_bins - lower < upper - sorted_bins, lower, upper)

# Flatten back to one smoothed value per (sorted) row.
df['bin_bound'] = bin_bound.ravel()

In [72]:
# Smoothing by bin medians: every value is replaced by the median of its
# equal-frequency bin.
# Relies on `b`, the sorted age array built in the bin-mean cell.
bin_size = 10  # values per bin; len(b) must be a multiple of this

sorted_bins = b.reshape(-1, bin_size)  # one row per bin

# With an even number of values per bin the median is the mean of the two
# middle values; np.median handles that, whereas picking the single element at
# offset 4 would return only the lower of the two.
medians = np.median(sorted_bins, axis=1)
bin_median = np.repeat(medians[:, np.newaxis], bin_size, axis=1)

# Flatten back to one smoothed value per (sorted) row.
df['bin_median'] = bin_median.ravel()

In [73]:
# Show the frame built up by the cells above; pandas abbreviates the middle
# rows of a long frame in the rendered output.
df

Unnamed: 0,age,age_sorted,bin_mean,bin_bound,bin_median
0,48.0,2.0,5.6,2.0,5.0
1,7.0,3.0,5.6,2.0,5.0
2,62.0,4.0,5.6,2.0,5.0
3,48.0,5.0,5.6,8.0,5.0
4,51.0,5.0,5.6,8.0,5.0
...,...,...,...,...,...
395,55.0,80.0,81.3,78.0,80.0
396,42.0,81.0,81.3,78.0,80.0
397,12.0,82.0,81.3,78.0,80.0
398,17.0,83.0,81.3,78.0,80.0
