In [125]:
from ucimlrepo import fetch_ucirepo 
import numpy as np
import pandas as pd
import polars as pl

# Setting the DataFrame up

In [126]:
# Fetch dataset id 579 via ucimlrepo and build a single working frame:
# all feature columns plus the 'LET_IS' column taken from the targets table.
dataset = fetch_ucirepo(id = 579)
X = dataset.data.features
y = dataset.data.targets['LET_IS']
# join() aligns on the index, so 'LET_IS' is appended as the last column.
df = X.join(y)
print(df.shape)
df.head()

(1700, 112)


Unnamed: 0,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,IBS_NASL,GB,SIM_GIPERT,DLIT_AG,...,NOT_NA_2_n,NOT_NA_3_n,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n,LET_IS
0,77.0,1,2.0,1.0,1.0,2.0,,3.0,0.0,7.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0
1,55.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0
2,52.0,1,0.0,0.0,0.0,2.0,,2.0,0.0,2.0,...,2.0,2.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0
3,68.0,0,0.0,0.0,0.0,2.0,,2.0,0.0,3.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0
4,60.0,1,0.0,0.0,0.0,2.0,,3.0,0.0,7.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0


# Missing Values

In [127]:
# Number of rows that contain no missing value in any column.
df.notna().all(axis=1).sum()

0

In [128]:
# Keep only columns with at least 75% non-missing values, i.e. drop columns
# where more than 25% of the values are missing.
# `thresh` is the minimum number of non-NA values a column needs to survive;
# pandas documents it as an int, so the fractional product is truncated
# explicitly instead of being passed as a float.
df_dropped = df.dropna(axis=1, thresh=int(0.75 * len(df)))
print(df_dropped.shape)
df_dropped.head()

(1700, 105)


Unnamed: 0,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,GB,SIM_GIPERT,DLIT_AG,ZSN_A,...,NOT_NA_2_n,NOT_NA_3_n,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n,LET_IS
0,77.0,1,2.0,1.0,1.0,2.0,3.0,0.0,7.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0
1,55.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0
2,52.0,1,0.0,0.0,0.0,2.0,2.0,0.0,2.0,0.0,...,2.0,2.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0
3,68.0,0,0.0,0.0,0.0,2.0,2.0,0.0,3.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0
4,60.0,1,0.0,0.0,0.0,2.0,3.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0


In [129]:
# Keep only rows with non-missing values in at least 75% (rounded down) of the
# columns that survived the column filter.
df_shrunk = df_dropped.dropna(
    axis="index",
    thresh=int(df_dropped.shape[1] * 0.75),
)
df_shrunk.shape

(1624, 105)

## Correlation Analysis for Imputation Models

In [130]:
# NOTE(review): wildcard import from the project-local `utils` module.
# The helpers called in the following cells (data_type_classifier,
# compute_phi_matrix, compute_cramers_v_matrix, top_correlated_predictors_df)
# are not defined in this notebook and presumably come from here — confirm,
# then consider importing them by name in the top import cell.
from utils import *

In [131]:
# Classify each column of the column-filtered frame. The next cell iterates
# `types.items()` and compares values to 'Categorical' / 'Binary', so this is
# used as a mapping of column name -> type label.
# NOTE(review): classification rules live in utils.data_type_classifier — not visible here.
types = data_type_classifier(df_dropped)

In [132]:
# Partition the column names by their label in `types`. Anything that is
# neither 'Categorical' nor 'Binary' is treated as continuous. Order follows
# the iteration order of `types`.
categorical = [name for name, label in types.items() if label == 'Categorical']
binary = [
    name for name, label in types.items()
    if not label == 'Categorical' and label == 'Binary'
]
continuous = [
    name for name, label in types.items()
    if not (label == 'Categorical' or label == 'Binary')
]

print(f'categorical: {categorical}')
print(f'binary: {binary}')
print(f'continuous: {continuous}')

categorical: ['INF_ANAM', 'STENOK_AN', 'FK_STENOK', 'IBS_POST', 'GB', 'DLIT_AG', 'ZSN_A', 'ant_im', 'lat_im', 'inf_im', 'post_im', 'TIME_B_S', 'R_AB_1_n', 'R_AB_2_n', 'R_AB_3_n', 'NA_R_1_n', 'NA_R_2_n', 'NA_R_3_n', 'NOT_NA_1_n', 'NOT_NA_2_n', 'NOT_NA_3_n', 'LET_IS']
binary: ['SEX', 'SIM_GIPERT', 'nr_11', 'nr_01', 'nr_02', 'nr_03', 'nr_04', 'nr_07', 'nr_08', 'np_01', 'np_04', 'np_05', 'np_07', 'np_08', 'np_09', 'np_10', 'endocr_01', 'endocr_02', 'endocr_03', 'zab_leg_01', 'zab_leg_02', 'zab_leg_03', 'zab_leg_04', 'zab_leg_06', 'O_L_POST', 'K_SH_POST', 'MP_TP_POST', 'SVT_POST', 'GT_POST', 'FIB_G_POST', 'IM_PG_P', 'ritm_ecg_p_01', 'ritm_ecg_p_02', 'ritm_ecg_p_04', 'ritm_ecg_p_06', 'ritm_ecg_p_07', 'ritm_ecg_p_08', 'n_r_ecg_p_01', 'n_r_ecg_p_02', 'n_r_ecg_p_03', 'n_r_ecg_p_04', 'n_r_ecg_p_05', 'n_r_ecg_p_06', 'n_r_ecg_p_08', 'n_r_ecg_p_09', 'n_r_ecg_p_10', 'n_p_ecg_p_01', 'n_p_ecg_p_03', 'n_p_ecg_p_04', 'n_p_ecg_p_05', 'n_p_ecg_p_06', 'n_p_ecg_p_07', 'n_p_ecg_p_08', 'n_p_ecg_p_09', 'n_p_ec

In [143]:
# One complete-case frame per variable type: select that type's columns from
# the row-filtered frame, then drop every row with a missing value in them.
# Rows are dropped independently per subset, so the three frames have
# different lengths.
df_cont = df_shrunk.loc[:, continuous].dropna(axis=0)

# 'LET_IS' is removed only after the complete-case filter has been applied
# to the full categorical column set.
df_cat = df_shrunk.loc[:, categorical].dropna(axis=0).drop(columns=['LET_IS'])

df_bin = df_shrunk.loc[:, binary].dropna(axis=0)

(df_cont.shape, df_cat.shape, df_bin.shape)

((896, 9), (1083, 21), (1150, 74))

In [134]:
# Show the complete-case categorical subset (pandas truncates the rendered
# rows and columns; the row index keeps the labels of the original frame).
df_cat

Unnamed: 0,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,GB,DLIT_AG,ZSN_A,ant_im,lat_im,inf_im,...,TIME_B_S,R_AB_1_n,R_AB_2_n,R_AB_3_n,NA_R_1_n,NA_R_2_n,NA_R_3_n,NOT_NA_1_n,NOT_NA_2_n,NOT_NA_3_n
0,2.0,1.0,1.0,2.0,3.0,7.0,0.0,1.0,0.0,0.0,...,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,2.0,2.0,2.0,0.0,4.0,1.0,0.0,...,3.0,3.0,0.0,0.0,1.0,0.0,0.0,3.0,2.0,2.0
3,0.0,0.0,0.0,2.0,2.0,3.0,1.0,0.0,1.0,1.0,...,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,2.0,3.0,7.0,0.0,4.0,1.0,0.0,...,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1680,0.0,4.0,2.0,1.0,2.0,7.0,0.0,4.0,1.0,0.0,...,3.0,3.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0
1687,2.0,6.0,3.0,1.0,2.0,7.0,0.0,0.0,0.0,4.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1688,1.0,5.0,2.0,2.0,2.0,7.0,2.0,0.0,0.0,2.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1692,0.0,6.0,2.0,2.0,2.0,7.0,0.0,4.0,2.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [135]:
# Pairwise association matrix over the binary columns. The rendered output is
# a square frame labelled by the df_bin columns with 1.0 on the diagonal.
# NOTE(review): presumably the phi coefficient, going by the helper's name —
# confirm against utils.compute_phi_matrix.
phi_matrix = compute_phi_matrix(df_bin)
phi_matrix



Unnamed: 0,SEX,SIM_GIPERT,nr_11,nr_01,nr_02,nr_03,nr_04,nr_07,nr_08,np_01,...,GIPO_K,GIPER_NA,NITR_S,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n
SEX,1.000000,-0.073701,0.016584,-0.038317,-0.038328,-0.039845,-0.042783,0.022714,0.004109,-0.011038,...,-0.056557,-0.003915,-0.051450,0.111270,0.081100,-0.029283,0.016922,-0.026379,0.028731,0.056365
SIM_GIPERT,-0.073701,1.000000,0.003169,-0.005379,-0.020960,-0.022334,-0.024867,-0.005379,-0.009325,-0.007610,...,-0.030980,-0.027730,0.018950,-0.038696,0.034660,-0.002032,-0.056434,-0.072461,0.046484,0.013856
nr_11,0.016584,0.003169,1.000000,-0.004660,-0.018161,-0.019350,-0.021545,-0.004660,-0.008079,-0.006594,...,-0.016292,-0.024026,-0.016623,0.038705,-0.026012,0.023428,0.037586,-0.005011,0.019125,0.028415
nr_01,-0.038317,-0.005379,-0.004660,1.000000,-0.003391,-0.003614,-0.004023,-0.000870,-0.001509,-0.001231,...,-0.024482,-0.004487,-0.010023,0.043526,-0.011206,0.020197,0.016815,0.016420,-0.004120,-0.015310
nr_02,-0.038328,-0.020960,-0.018161,-0.003391,1.000000,-0.014082,-0.015679,-0.003391,-0.005879,0.179138,...,-0.079807,-0.017484,0.011268,-0.028412,-0.020578,-0.003503,0.047711,-0.008155,-0.016055,-0.003423
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ANT_CA_S_n,-0.029283,-0.002032,0.023428,0.020197,-0.003503,-0.039800,0.037642,0.020197,-0.001558,0.028576,...,-0.027816,-0.021366,-0.085889,-0.013957,-0.155814,1.000000,-0.004313,0.062321,0.027521,-0.005166
GEPAR_S_n,0.016922,-0.056434,0.037586,0.016815,0.047711,0.036322,-0.027934,0.016815,-0.010475,0.023791,...,0.053497,0.005108,0.041017,0.033810,0.027745,-0.004313,1.000000,0.267792,0.005825,-0.020600
ASP_S_n,-0.026379,-0.072461,-0.005011,0.016420,-0.008155,0.017310,-0.031071,0.016420,-0.011651,0.023232,...,0.020523,-0.025472,0.000981,0.029172,0.051139,0.062321,0.267792,1.000000,-0.146343,-0.246692
TIKL_S_n,0.028731,0.046484,0.019125,-0.004120,-0.016055,-0.017107,-0.019047,-0.004120,-0.007142,-0.005829,...,-0.038392,0.021463,-0.005763,-0.012646,0.023447,0.027521,0.005825,-0.146343,1.000000,0.067263


In [144]:
# Pairwise association matrix over the categorical columns ('LET_IS' was
# already removed from df_cat). The rendered output is a square frame with
# 1.0 on the diagonal and non-negative off-diagonal values.
# NOTE(review): presumably Cramér's V, going by the helper's name — confirm
# against utils.compute_cramers_v_matrix.
cat_matrix = compute_cramers_v_matrix(df_cat)
cat_matrix

Unnamed: 0,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,GB,DLIT_AG,ZSN_A,ant_im,lat_im,inf_im,...,TIME_B_S,R_AB_1_n,R_AB_2_n,R_AB_3_n,NA_R_1_n,NA_R_2_n,NA_R_3_n,NOT_NA_1_n,NOT_NA_2_n,NOT_NA_3_n
INF_ANAM,1.0,0.233007,0.219212,0.232273,0.102045,0.109527,0.106853,0.111462,0.087784,0.061108,...,0.082591,0.070455,0.111708,0.068809,0.069391,0.088919,0.067439,0.046075,0.074731,0.04392
STENOK_AN,0.233007,1.0,0.508797,0.535182,0.098959,0.164041,0.096816,0.09343,0.082564,0.08619,...,0.092608,0.096597,0.096904,0.105402,0.086257,0.093986,0.106495,0.079231,0.080267,0.058004
FK_STENOK,0.219212,0.508797,1.0,0.540116,0.094853,0.116328,0.075252,0.089113,0.06942,0.069516,...,0.075518,0.06815,0.103232,0.089277,0.047974,0.081097,0.102192,0.060989,0.111288,0.070222
IBS_POST,0.232273,0.535182,0.540116,1.0,0.137934,0.166971,0.088428,0.097055,0.088173,0.10076,...,0.109598,0.063304,0.056915,0.074104,0.054875,0.064436,0.051243,0.04755,0.046345,0.030187
GB,0.102045,0.098959,0.094853,0.137934,1.0,0.555667,0.08477,0.086749,0.098087,0.073398,...,0.079118,0.076868,0.046392,0.079134,0.069171,0.03887,0.046478,0.029855,0.158817,0.060044
DLIT_AG,0.109527,0.164041,0.116328,0.166971,0.555667,1.0,0.079163,0.092881,0.102852,0.076012,...,0.090413,0.079732,0.106242,0.069406,0.114405,0.098822,0.069214,0.096919,0.140728,0.086399
ZSN_A,0.106853,0.096816,0.075252,0.088428,0.08477,0.079163,1.0,0.073769,0.08017,0.067999,...,0.093082,0.050697,0.034127,0.0344,0.040317,0.079989,0.037703,0.072409,0.060772,0.071717
ant_im,0.111462,0.09343,0.089113,0.097055,0.086749,0.092881,0.073769,1.0,0.376634,0.422464,...,0.097476,0.07312,0.073851,0.070911,0.103967,0.081685,0.091545,0.082639,0.083065,0.110352
lat_im,0.087784,0.082564,0.06942,0.088173,0.098087,0.102852,0.08017,0.376634,1.0,0.295153,...,0.08274,0.099562,0.040825,0.057083,0.09486,0.04108,0.055315,0.070075,0.076051,0.10129
inf_im,0.061108,0.08619,0.069516,0.10076,0.073398,0.076012,0.067999,0.422464,0.295153,1.0,...,0.102312,0.061371,0.065755,0.064411,0.073595,0.098084,0.037577,0.050099,0.084642,0.050555


In [147]:
# Spearman rank correlation between every pair of continuous columns.
corr_matrix = df_cont.corr(method='spearman')
corr_matrix

Unnamed: 0,AGE,S_AD_ORIT,D_AD_ORIT,K_BLOOD,NA_BLOOD,ALT_BLOOD,AST_BLOOD,L_BLOOD,ROE
AGE,1.0,0.086651,-0.005116,-0.023102,0.016745,-0.091461,-0.063166,0.003954,0.220886
S_AD_ORIT,0.086651,1.0,0.845296,0.040299,0.050511,-0.072187,-0.082165,-0.15774,0.048066
D_AD_ORIT,-0.005116,0.845296,1.0,0.047476,0.041682,-0.047818,-0.067513,-0.158302,0.043198
K_BLOOD,-0.023102,0.040299,0.047476,1.0,0.277257,0.015513,0.043475,0.023055,-0.031673
NA_BLOOD,0.016745,0.050511,0.041682,0.277257,1.0,-0.00109,-0.028839,0.016703,-0.042695
ALT_BLOOD,-0.091461,-0.072187,-0.047818,0.015513,-0.00109,1.0,0.549399,0.018693,0.003325
AST_BLOOD,-0.063166,-0.082165,-0.067513,0.043475,-0.028839,0.549399,1.0,0.080222,-0.023366
L_BLOOD,0.003954,-0.15774,-0.158302,0.023055,0.016703,0.018693,0.080222,1.0,0.015319
ROE,0.220886,0.048066,0.043198,-0.031673,-0.042695,0.003325,-0.023366,0.015319,1.0


In [149]:
# For each binary column, list other columns from the phi matrix in ranked
# order. The rendered output has one row per column and five ranked columns
# (1..5) holding column names.
# NOTE(review): ranking rule (e.g. absolute vs signed value) is defined in
# utils.top_correlated_predictors_df — confirm there.
bin_corrs = top_correlated_predictors_df(phi_matrix)
bin_corrs

Unnamed: 0,1,2,3,4,5
SEX,endocr_01,ritm_ecg_p_07,endocr_03,LID_S_n,ritm_ecg_p_01
SIM_GIPERT,endocr_01,SEX,ASP_S_n,GEPAR_S_n,n_p_ecg_p_11
nr_11,n_r_ecg_p_05,n_p_ecg_p_10,MP_TP_POST,SVT_POST,ritm_ecg_p_02
nr_01,n_p_ecg_p_08,O_L_POST,endocr_01,ritm_ecg_p_07,ritm_ecg_p_01
nr_02,n_p_ecg_p_04,np_01,fibr_ter_05,n_p_ecg_p_03,np_05
...,...,...,...,...,...
ANT_CA_S_n,B_BLOK_S_n,K_SH_POST,n_p_ecg_p_06,ritm_ecg_p_04,NITR_S
GEPAR_S_n,ASP_S_n,n_r_ecg_p_06,fibr_ter_02,zab_leg_01,n_p_ecg_p_06
ASP_S_n,GEPAR_S_n,TRENT_S_n,TIKL_S_n,K_SH_POST,n_p_ecg_p_06
TIKL_S_n,ASP_S_n,fibr_ter_07,fibr_ter_03,fibr_ter_06,TRENT_S_n


In [156]:
# For each categorical column, list other columns from the categorical
# association matrix in ranked order. The rendered output has one row per
# column and five ranked columns (1..5) holding column names.
# NOTE(review): ranking rule is defined in utils.top_correlated_predictors_df — confirm there.
cat_corrs = top_correlated_predictors_df(cat_matrix)
cat_corrs

Unnamed: 0,1,2,3,4,5
INF_ANAM,STENOK_AN,IBS_POST,FK_STENOK,R_AB_2_n,ant_im
STENOK_AN,IBS_POST,FK_STENOK,INF_ANAM,DLIT_AG,NA_R_3_n
FK_STENOK,IBS_POST,STENOK_AN,INF_ANAM,DLIT_AG,NOT_NA_2_n
IBS_POST,FK_STENOK,STENOK_AN,INF_ANAM,DLIT_AG,GB
GB,DLIT_AG,NOT_NA_2_n,IBS_POST,INF_ANAM,STENOK_AN
DLIT_AG,GB,IBS_POST,STENOK_AN,NOT_NA_2_n,FK_STENOK
ZSN_A,INF_ANAM,STENOK_AN,TIME_B_S,IBS_POST,GB
ant_im,inf_im,lat_im,post_im,INF_ANAM,NOT_NA_3_n
lat_im,ant_im,inf_im,post_im,DLIT_AG,NOT_NA_3_n
inf_im,ant_im,lat_im,post_im,TIME_B_S,IBS_POST


In [157]:
# For each continuous column, list other columns from the Spearman matrix in
# ranked order. The rendered output has one row per column and five ranked
# columns (1..5) holding column names.
# NOTE(review): the displayed order for AGE (ROE, ALT_BLOOD, S_AD_ORIT, ...)
# is consistent with ranking by absolute correlation — confirm in
# utils.top_correlated_predictors_df.
cont_corrs = top_correlated_predictors_df(corr_matrix)
cont_corrs

Unnamed: 0,1,2,3,4,5
AGE,ROE,ALT_BLOOD,S_AD_ORIT,AST_BLOOD,K_BLOOD
S_AD_ORIT,D_AD_ORIT,L_BLOOD,AGE,AST_BLOOD,ALT_BLOOD
D_AD_ORIT,S_AD_ORIT,L_BLOOD,AST_BLOOD,ALT_BLOOD,K_BLOOD
K_BLOOD,NA_BLOOD,D_AD_ORIT,AST_BLOOD,S_AD_ORIT,ROE
NA_BLOOD,K_BLOOD,S_AD_ORIT,ROE,D_AD_ORIT,AST_BLOOD
ALT_BLOOD,AST_BLOOD,AGE,S_AD_ORIT,D_AD_ORIT,L_BLOOD
AST_BLOOD,ALT_BLOOD,S_AD_ORIT,L_BLOOD,D_AD_ORIT,AGE
L_BLOOD,D_AD_ORIT,S_AD_ORIT,AST_BLOOD,K_BLOOD,ALT_BLOOD
ROE,AGE,S_AD_ORIT,D_AD_ORIT,NA_BLOOD,K_BLOOD


## Bootstrap

In [175]:
# Bootstrap: resample each complete-case subset with replacement to 125% of
# its original row count (fractional sizes are truncated by int()).
# A fixed seed makes the resamples reproducible across kernel restarts; the
# generator is re-created on every run of this cell, so re-running the cell
# yields the same three frames.
BOOTSTRAP_SEED = 42
BOOTSTRAP_SCALE = 1.25
bootstrap_rng = np.random.RandomState(BOOTSTRAP_SEED)

df_cont_bs = df_cont.sample(
    n=int(len(df_cont) * BOOTSTRAP_SCALE), replace=True, random_state=bootstrap_rng
)
df_cat_bs = df_cat.sample(
    n=int(len(df_cat) * BOOTSTRAP_SCALE), replace=True, random_state=bootstrap_rng
)
df_bin_bs = df_bin.sample(
    n=int(len(df_bin) * BOOTSTRAP_SCALE), replace=True, random_state=bootstrap_rng
)
df_cont_bs.shape, df_cat_bs.shape, df_bin_bs.shape

((1120, 9), (1353, 21), (1437, 74))

Columns (Python-indexed, end-exclusive ranges, 28 columns each):
* 0-28: Manish 
* 28-56: Soto
* 56-84: Owen 
* 84-112: Anna 


* binary-categorical 
* categorical-categorical 
* continuous-categorical