In [1]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer

In [2]:
qual_cols = { "age", "TSH", "T3", "TT4", "T4U", "FTI", "TBG" }
cat_cols = { "sex", "on_thyroxine", "on_antithyroid_meds", "sick", "pregnant", "thyroid_surgery", "I131_treatment", "lithium", "goitre", "tumor", "psych", "referral_source", "target" }

In [3]:
remove_qual_cols = { "TBG" }
remove_cat_cols = set()

In [4]:
k = 10
neighbours = 2
iters = 10000

In [5]:
seed = 42

In [6]:
random_state = np.random.RandomState(seed)
random_generator = np.random.default_rng(seed)

In [7]:
sel_qual_cols = list(qual_cols.difference(remove_qual_cols))
sel_cat_cols = list(cat_cols.difference(remove_cat_cols))
display(sel_qual_cols)
display(sel_cat_cols)

['age', 'T4U', 'TT4', 'TSH', 'FTI', 'T3']

['tumor',
 'referral_source',
 'pregnant',
 'I131_treatment',
 'sick',
 'lithium',
 'on_antithyroid_meds',
 'psych',
 'on_thyroxine',
 'thyroid_surgery',
 'goitre',
 'sex',
 'target']

In [8]:
df = pd.read_csv("./data/thyroidDF.csv")
df

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,target,patient_id
0,29,F,f,f,f,f,f,f,f,t,...,,f,,f,,f,,other,-,840801013
1,29,F,f,f,f,f,f,f,f,f,...,128.0,f,,f,,f,,other,-,840801014
2,41,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,11.0,other,-,840801042
3,36,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,26.0,other,-,840803046
4,32,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,36.0,other,S,840803047
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9167,56,M,f,f,f,f,f,f,f,f,...,64.0,t,0.83,t,77.0,f,,SVI,-,870119022
9168,22,M,f,f,f,f,f,f,f,f,...,91.0,t,0.92,t,99.0,f,,SVI,-,870119023
9169,69,M,f,f,f,f,f,f,f,f,...,113.0,t,1.27,t,89.0,f,,SVI,I,870119025
9170,47,F,f,f,f,f,f,f,f,f,...,75.0,t,0.85,t,88.0,f,,other,-,870119027


In [9]:
df.dropna()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,target,patient_id
167,40,F,f,f,f,f,f,f,f,f,...,3.9,t,0.83,t,5.0,t,28.0,other,F,840827019
5256,35,F,f,f,f,f,f,t,f,f,...,73.0,t,1.16,t,63.0,t,37.0,other,-,851128040
6044,77,F,f,f,f,f,f,f,f,f,...,120.0,t,0.96,t,124.0,t,45.0,SVI,-,860305064
6045,73,M,f,f,f,f,f,f,f,f,...,89.0,t,0.74,t,119.0,t,24.0,SVI,-,860305065
6747,77,F,f,f,f,f,f,f,f,f,...,131.0,t,1.04,t,126.0,t,25.0,SVI,K,860702030
6773,74,F,f,f,f,f,f,f,f,f,...,116.0,t,0.81,t,143.0,t,22.0,SVI,-,860703046
6862,60,M,f,f,f,f,f,f,f,f,...,92.0,t,0.84,t,110.0,t,21.0,other,-,860710043
6863,66,F,f,f,f,f,f,f,f,f,...,138.0,t,0.8,t,173.0,t,15.0,SVI,-,860710044
6880,42,F,f,f,f,f,f,f,f,f,...,106.0,t,0.98,t,108.0,t,27.0,other,-,860711039
6934,29,F,f,f,f,f,f,f,f,f,...,122.0,t,1.14,t,107.0,t,36.0,SVI,-,860717007


In [10]:
df[sel_qual_cols]

Unnamed: 0,age,T4U,TT4,TSH,FTI,T3
0,29,,,0.3,,
1,29,,128.0,1.6,,1.9
2,41,,,,,
3,36,,,,,
4,32,,,,,
...,...,...,...,...,...,...
9167,56,0.83,64.0,,77.0,
9168,22,0.92,91.0,,99.0,
9169,69,1.27,113.0,,89.0,
9170,47,0.85,75.0,,88.0,


In [11]:
df[sel_cat_cols]

Unnamed: 0,tumor,referral_source,pregnant,I131_treatment,sick,lithium,on_antithyroid_meds,psych,on_thyroxine,thyroid_surgery,goitre,sex,target
0,f,other,f,f,f,f,f,f,f,f,f,F,-
1,f,other,f,f,f,f,f,f,f,f,f,F,-
2,f,other,f,f,f,f,f,f,f,f,f,F,-
3,f,other,f,f,f,f,f,f,f,f,f,F,-
4,f,other,f,f,f,f,f,f,f,f,f,F,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9167,f,SVI,f,f,f,f,f,f,f,f,f,M,-
9168,f,SVI,f,f,f,f,f,f,f,f,f,M,-
9169,f,SVI,f,f,f,f,f,f,f,f,f,M,I
9170,f,other,f,f,f,f,f,f,f,f,f,F,-


In [12]:
df = df[(df["age"] >= 0) & (df["age"] < 100)]
df

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,target,patient_id
0,29,F,f,f,f,f,f,f,f,t,...,,f,,f,,f,,other,-,840801013
1,29,F,f,f,f,f,f,f,f,f,...,128.0,f,,f,,f,,other,-,840801014
2,41,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,11.0,other,-,840801042
3,36,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,26.0,other,-,840803046
4,32,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,36.0,other,S,840803047
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9167,56,M,f,f,f,f,f,f,f,f,...,64.0,t,0.83,t,77.0,f,,SVI,-,870119022
9168,22,M,f,f,f,f,f,f,f,f,...,91.0,t,0.92,t,99.0,f,,SVI,-,870119023
9169,69,M,f,f,f,f,f,f,f,f,...,113.0,t,1.27,t,89.0,f,,SVI,I,870119025
9170,47,F,f,f,f,f,f,f,f,f,...,75.0,t,0.85,t,88.0,f,,other,-,870119027


In [13]:
qual_na_df = df[sel_qual_cols].dropna()
qual_na_df

Unnamed: 0,age,T4U,TT4,TSH,FTI,T3
19,36,1.06,90.0,1.50,85.0,2.4
21,40,1.08,104.0,1.20,96.0,2.3
22,40,0.84,88.0,5.90,105.0,2.1
23,77,1.13,107.0,0.05,95.0,2.4
27,51,0.87,93.0,0.05,106.0,2.1
...,...,...,...,...,...,...
9129,65,0.90,85.0,0.73,94.0,1.8
9130,65,1.19,135.0,4.10,113.0,2.1
9134,74,1.25,49.0,53.00,39.0,1.0
9137,42,0.73,59.0,2.30,81.0,1.3


In [14]:
from data_split import k_fold_split

In [15]:
complete_vals_df, remove_vals_df = k_fold_split(qual_na_df, k, random_generator)
missing_vals_idxs = list(remove_vals_df.index)

real_df = pd.concat([complete_vals_df, remove_vals_df])

In [16]:
knn_imputed_df_map = dict()

for col in sel_qual_cols:
    missing_vals_df = remove_vals_df.copy()
    missing_vals_df[col] = np.NaN

    curr_df = pd.concat([complete_vals_df, missing_vals_df])

    knn_imputer = KNNImputer(n_neighbors=neighbours)
    knn_imputed_mat = knn_imputer.fit_transform(curr_df)
    
    knn_imputed_df = pd.DataFrame(knn_imputed_mat, columns=curr_df.columns, index=curr_df.index)
    knn_imputed_df["{} (real)".format(col)] = real_df[col]
    knn_imputed_df["{} (imputed)".format(col)] = knn_imputed_df[col]
    knn_imputed_df.drop([col], axis=1, inplace=True)

    knn_imputed_df = knn_imputed_df.loc[missing_vals_idxs]

    knn_imputed_df_map[col] = knn_imputed_df

In [17]:
def col_mse(df : pd.DataFrame, col : str) -> float:
    sqr_err_list = (df[col + " (real)"] - df[col + " (imputed)"]).to_numpy() ** 2
    sqr_err_sum = sqr_err_list.sum()
    mean_sqr_err = sqr_err_sum / sqr_err_list.size
    return mean_sqr_err

In [18]:
for col, knn_imputed_df in knn_imputed_df_map.items():
    display("{} with KNN".format(col))
    display(col_mse(knn_imputed_df, col))
    display(knn_imputed_df)

'age with KNN'

499.8900501672241

Unnamed: 0,T4U,TT4,TSH,FTI,T3,age (real),age (imputed)
4257,0.78,71.0,1.900,91.0,1.5,81.0,71.0
6728,1.06,109.0,4.700,103.0,3.0,20.0,64.5
460,1.07,98.0,2.100,92.0,2.2,59.0,55.0
1578,0.74,90.0,0.150,121.0,1.1,53.0,44.5
7342,1.22,120.0,0.660,98.0,2.2,22.0,36.5
...,...,...,...,...,...,...,...
2387,1.00,114.0,6.000,114.0,1.9,41.0,16.5
6722,1.00,131.0,0.035,131.0,4.0,46.0,56.0
7182,1.01,69.0,1.500,68.0,2.9,47.0,33.5
3777,1.24,144.0,11.000,116.0,1.5,55.0,54.5


'T4U with KNN'

0.0026514632107023416

Unnamed: 0,age,TT4,TSH,FTI,T3,T4U (real),T4U (imputed)
4257,81.0,71.0,1.900,91.0,1.5,0.78,0.760
6728,20.0,109.0,4.700,103.0,3.0,1.06,1.050
460,59.0,98.0,2.100,92.0,2.2,1.07,1.065
1578,53.0,90.0,0.150,121.0,1.1,0.74,0.760
7342,22.0,120.0,0.660,98.0,2.2,1.22,1.225
...,...,...,...,...,...,...,...
2387,41.0,114.0,6.000,114.0,1.9,1.00,1.015
6722,46.0,131.0,0.035,131.0,4.0,1.00,1.005
7182,47.0,69.0,1.500,68.0,2.9,1.01,1.010
3777,55.0,144.0,11.000,116.0,1.5,1.24,1.230


'TT4 with KNN'

495.21086956521737

Unnamed: 0,age,T4U,TSH,FTI,T3,TT4 (real),TT4 (imputed)
4257,81.0,0.78,1.900,91.0,1.5,71.0,92.5
6728,20.0,1.06,4.700,103.0,3.0,109.0,113.0
460,59.0,1.07,2.100,92.0,2.2,98.0,96.5
1578,53.0,0.74,0.150,121.0,1.1,90.0,100.0
7342,22.0,1.22,0.660,98.0,2.2,120.0,110.5
...,...,...,...,...,...,...,...
2387,41.0,1.00,6.000,114.0,1.9,114.0,130.5
6722,46.0,1.00,0.035,131.0,4.0,131.0,131.0
7182,47.0,1.01,1.500,68.0,2.9,69.0,62.0
3777,55.0,1.24,11.000,116.0,1.5,144.0,107.5


'TSH with KNN'

727.232608400903

Unnamed: 0,age,T4U,TT4,FTI,T3,TSH (real),TSH (imputed)
4257,81.0,0.78,71.0,91.0,1.5,1.900,1.435
6728,20.0,1.06,109.0,103.0,3.0,4.700,0.715
460,59.0,1.07,98.0,92.0,2.2,2.100,2.050
1578,53.0,0.74,90.0,121.0,1.1,0.150,1.300
7342,22.0,1.22,120.0,98.0,2.2,0.660,1.325
...,...,...,...,...,...,...,...
2387,41.0,1.00,114.0,114.0,1.9,6.000,0.675
6722,46.0,1.00,131.0,131.0,4.0,0.035,0.850
7182,47.0,1.01,69.0,68.0,2.9,1.500,8.050
3777,55.0,1.24,144.0,116.0,1.5,11.000,0.605


'FTI with KNN'

681.4953887882107

Unnamed: 0,age,T4U,TT4,TSH,T3,FTI (real),FTI (imputed)
4257,81.0,0.78,71.0,1.900,1.5,91.0,94.5
6728,20.0,1.06,109.0,4.700,3.0,103.0,128.0
460,59.0,1.07,98.0,2.100,2.2,92.0,118.5
1578,53.0,0.74,90.0,0.150,1.1,121.0,95.5
7342,22.0,1.22,120.0,0.660,2.2,98.0,125.5
...,...,...,...,...,...,...,...
2387,41.0,1.00,114.0,6.000,1.9,114.0,119.5
6722,46.0,1.00,131.0,0.035,4.0,131.0,119.0
7182,47.0,1.01,69.0,1.500,2.9,68.0,75.0
3777,55.0,1.24,144.0,11.000,1.5,116.0,116.5


'T3 with KNN'

0.5726535535117057

Unnamed: 0,age,T4U,TT4,TSH,FTI,T3 (real),T3 (imputed)
4257,81.0,0.78,71.0,1.900,91.0,1.5,0.90
6728,20.0,1.06,109.0,4.700,103.0,3.0,2.50
460,59.0,1.07,98.0,2.100,92.0,2.2,2.50
1578,53.0,0.74,90.0,0.150,121.0,1.1,1.95
7342,22.0,1.22,120.0,0.660,98.0,2.2,2.80
...,...,...,...,...,...,...,...
2387,41.0,1.00,114.0,6.000,114.0,1.9,2.15
6722,46.0,1.00,131.0,0.035,131.0,4.0,1.90
7182,47.0,1.01,69.0,1.500,68.0,2.9,1.70
3777,55.0,1.24,144.0,11.000,116.0,1.5,2.70


In [19]:
mice_imputed_df_map = dict()

for col in sel_qual_cols:
    missing_vals_df = remove_vals_df.copy()
    missing_vals_df[col] = np.NaN

    curr_df = pd.concat([complete_vals_df, missing_vals_df])

    mice_imputer = IterativeImputer(random_state=random_state, max_iter=iters)
    mice_imputed_mat = mice_imputer.fit_transform(curr_df)
    
    mice_imputed_df = pd.DataFrame(mice_imputed_mat, columns=curr_df.columns, index=curr_df.index)
    mice_imputed_df["{} (real)".format(col)] = real_df[col]
    mice_imputed_df["{} (imputed)".format(col)] = mice_imputed_df[col]
    mice_imputed_df.drop([col], axis=1, inplace=True)

    mice_imputed_df = mice_imputed_df.loc[missing_vals_idxs]

    mice_imputed_df_map[col] = mice_imputed_df

In [20]:
for col, mice_imputed_df in mice_imputed_df_map.items():
    display("{} with MICE".format(col))
    display(col_mse(mice_imputed_df, col))
    display(mice_imputed_df)

'age with MICE'

320.67448465442675

Unnamed: 0,T4U,TT4,TSH,FTI,T3,age (real),age (imputed)
4257,0.78,71.0,1.900,91.0,1.5,81.0,55.682296
6728,1.06,109.0,4.700,103.0,3.0,20.0,46.532684
460,1.07,98.0,2.100,92.0,2.2,59.0,50.695552
1578,0.74,90.0,0.150,121.0,1.1,53.0,59.565401
7342,1.22,120.0,0.660,98.0,2.2,22.0,50.778305
...,...,...,...,...,...,...,...
2387,1.00,114.0,6.000,114.0,1.9,41.0,53.891782
6722,1.00,131.0,0.035,131.0,4.0,46.0,42.359737
7182,1.01,69.0,1.500,68.0,2.9,47.0,45.312699
3777,1.24,144.0,11.000,116.0,1.5,55.0,55.965345


'T4U with MICE'

0.006545047287161668

Unnamed: 0,age,TT4,TSH,FTI,T3,T4U (real),T4U (imputed)
4257,81.0,71.0,1.900,91.0,1.5,0.78,0.840587
6728,20.0,109.0,4.700,103.0,3.0,1.06,1.069861
460,59.0,98.0,2.100,92.0,2.2,1.07,1.024684
1578,53.0,90.0,0.150,121.0,1.1,0.74,0.795682
7342,22.0,120.0,0.660,98.0,2.2,1.22,1.136618
...,...,...,...,...,...,...,...
2387,41.0,114.0,6.000,114.0,1.9,1.00,1.008499
6722,46.0,131.0,0.035,131.0,4.0,1.00,1.071782
7182,47.0,69.0,1.500,68.0,2.9,1.01,0.995442
3777,55.0,144.0,11.000,116.0,1.5,1.24,1.170115


'TT4 with MICE'

80.60998151280157

Unnamed: 0,age,T4U,TSH,FTI,T3,TT4 (real),TT4 (imputed)
4257,81.0,0.78,1.900,91.0,1.5,71.0,71.465718
6728,20.0,1.06,4.700,103.0,3.0,109.0,110.626619
460,59.0,1.07,2.100,92.0,2.2,98.0,103.686382
1578,53.0,0.74,0.150,121.0,1.1,90.0,88.085211
7342,22.0,1.22,0.660,98.0,2.2,120.0,123.057404
...,...,...,...,...,...,...,...
2387,41.0,1.00,6.000,114.0,1.9,114.0,110.868193
6722,46.0,1.00,0.035,131.0,4.0,131.0,128.669814
7182,47.0,1.01,1.500,68.0,2.9,69.0,80.383103
3777,55.0,1.24,11.000,116.0,1.5,144.0,136.958017


'TSH with MICE'

1052.0831210431768

Unnamed: 0,age,T4U,TT4,FTI,T3,TSH (real),TSH (imputed)
4257,81.0,0.78,71.0,91.0,1.5,1.900,6.352127
6728,20.0,1.06,109.0,103.0,3.0,4.700,5.963291
460,59.0,1.07,98.0,92.0,2.2,2.100,9.116387
1578,53.0,0.74,90.0,121.0,1.1,0.150,4.055338
7342,22.0,1.22,120.0,98.0,2.2,0.660,9.777455
...,...,...,...,...,...,...,...
2387,41.0,1.00,114.0,114.0,1.9,6.000,4.581343
6722,46.0,1.00,131.0,131.0,4.0,0.035,-3.002942
7182,47.0,1.01,69.0,68.0,2.9,1.500,12.297544
3777,55.0,1.24,144.0,116.0,1.5,11.000,4.922389


'FTI with MICE'

60.302913426982364

Unnamed: 0,age,T4U,TT4,TSH,T3,FTI (real),FTI (imputed)
4257,81.0,0.78,71.0,1.900,1.5,91.0,97.192804
6728,20.0,1.06,109.0,4.700,3.0,103.0,105.781555
460,59.0,1.07,98.0,2.100,2.2,92.0,90.006363
1578,53.0,0.74,90.0,0.150,1.1,121.0,121.791604
7342,22.0,1.22,120.0,0.660,2.2,98.0,93.950516
...,...,...,...,...,...,...,...
2387,41.0,1.00,114.0,6.000,1.9,114.0,116.104030
6722,46.0,1.00,131.0,0.035,4.0,131.0,139.369168
7182,47.0,1.01,69.0,1.500,2.9,68.0,68.666070
3777,55.0,1.24,144.0,11.000,1.5,116.0,116.029337


'T3 with MICE'

0.45845384208965384

Unnamed: 0,age,T4U,TT4,TSH,FTI,T3 (real),T3 (imputed)
4257,81.0,0.78,71.0,1.900,91.0,1.5,1.125176
6728,20.0,1.06,109.0,4.700,103.0,3.0,2.321855
460,59.0,1.07,98.0,2.100,92.0,2.2,1.871235
1578,53.0,0.74,90.0,0.150,121.0,1.1,1.589321
7342,22.0,1.22,120.0,0.660,98.0,2.2,2.563694
...,...,...,...,...,...,...,...
2387,41.0,1.00,114.0,6.000,114.0,1.9,2.136510
6722,46.0,1.00,131.0,0.035,131.0,4.0,2.288366
7182,47.0,1.01,69.0,1.500,68.0,2.9,1.608653
3777,55.0,1.24,144.0,11.000,116.0,1.5,2.506762


In [21]:
from proba_utils import cum_sum_intervals_from_weights, sample_cols, create_nan_vals

In [22]:
weight_map = {
    "age" : 1,
    "T3"  : 1,
    "T4U" : 1,
    "TSH" : 1,
    "TT4" : 1,
    "FTI" : 1
}

cum_sum_intervals, cum_sum_col_map = cum_sum_intervals_from_weights(weight_map)

display(cum_sum_intervals)
display(cum_sum_col_map)

[Interval(0, 0.16666666666666666, closed='left'),
 Interval(0.16666666666666666, 0.3333333333333333, closed='left'),
 Interval(0.3333333333333333, 0.5, closed='left'),
 Interval(0.5, 0.6666666666666666, closed='left'),
 Interval(0.6666666666666666, 0.8333333333333333, closed='left'),
 Interval(0.8333333333333333, 1, closed='both')]

{0: 'age', 1: 'T3', 2: 'T4U', 3: 'TSH', 4: 'TT4', 5: 'FTI'}

In [23]:
picked_cols = sample_cols(remove_vals_df.shape[0], cum_sum_intervals, cum_sum_col_map, random_generator)
missing_vals_df, missing_col_map = create_nan_vals(remove_vals_df, picked_cols)

missing_vals_idxs = list(missing_vals_df.index)

real_df = pd.concat([complete_vals_df, remove_vals_df])
curr_df = pd.concat([complete_vals_df, missing_vals_df])

In [24]:
knn_imputer = KNNImputer(n_neighbors=neighbours)
knn_imputed_mat = knn_imputer.fit_transform(curr_df)

knn_imputed_df = pd.DataFrame(knn_imputed_mat, columns=real_df.columns, index=real_df.index)
knn_imputed_df = knn_imputed_df.loc[missing_vals_idxs]

for col in remove_vals_df.columns:
    knn_imputed_df["{} (real)".format(col)] = remove_vals_df[col]
    knn_imputed_df["{} (imputed)".format(col)] = knn_imputed_df[col]
    knn_imputed_df.drop([col], axis=1, inplace=True)

knn_imputed_df["imputed"] = picked_cols

knn_imputed_df

Unnamed: 0,age (real),age (imputed),T4U (real),T4U (imputed),TT4 (real),TT4 (imputed),TSH (real),TSH (imputed),FTI (real),FTI (imputed),T3 (real),T3 (imputed),imputed
4257,81.0,81.0,0.78,0.780,71.0,71.0,1.900,1.485,91.0,91.0,1.5,1.5,TSH
6728,20.0,20.0,1.06,1.060,109.0,100.5,4.700,4.700,103.0,103.0,3.0,3.0,TT4
460,59.0,59.0,1.07,1.090,98.0,98.0,2.100,2.100,92.0,92.0,2.2,2.2,T4U
1578,53.0,53.0,0.74,0.740,90.0,89.0,0.150,0.150,121.0,121.0,1.1,1.1,TT4
7342,22.0,22.0,1.22,1.210,120.0,120.0,0.660,0.660,98.0,98.0,2.2,2.2,T4U
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,41.0,41.0,1.00,1.015,114.0,114.0,6.000,6.000,114.0,114.0,1.9,1.9,T4U
6722,46.0,46.0,1.00,1.000,131.0,131.0,0.035,0.850,131.0,131.0,4.0,4.0,TSH
7182,47.0,47.0,1.01,0.970,69.0,69.0,1.500,1.500,68.0,68.0,2.9,2.9,T4U
3777,55.0,55.0,1.24,1.240,144.0,101.5,11.000,11.000,116.0,116.0,1.5,1.5,TT4


In [25]:
def imputed_sqr_err(cols : list[str], df : pd.DataFrame) -> dict:
    imputed_map_dict = dict()
    for col in cols:
        imputed_map_dict[col] = []

    for _, sample in df.iterrows():
        imputed_col = sample["imputed"]

        real_val = sample[imputed_col + " (real)"]
        imputed_val = sample[imputed_col + " (imputed)"]

        imputed_map_dict[imputed_col].append((real_val-imputed_val)**2)

    imputed_map = dict()

    for key in imputed_map_dict.keys():
        sqr_err_list = imputed_map_dict[key]
        sqr_err_len = len(sqr_err_list)
        sqr_err_sum = sum(sqr_err_list)
        mean_sqr_err = sqr_err_sum / sqr_err_len
        imputed_map[key] = mean_sqr_err

    return imputed_map

In [26]:
mice_imputer = IterativeImputer(random_state=random_state, max_iter=iters)
mice_imputed_mat = mice_imputer.fit_transform(curr_df)

mice_imputed_df = pd.DataFrame(mice_imputed_mat, columns=real_df.columns, index=real_df.index)
mice_imputed_df = mice_imputed_df.loc[missing_vals_idxs]

for col in remove_vals_df.columns:
    mice_imputed_df["{} (real)".format(col)] = remove_vals_df[col]
    mice_imputed_df["{} (imputed)".format(col)] = mice_imputed_df[col]
    mice_imputed_df.drop([col], axis=1, inplace=True)

mice_imputed_df["imputed"] = picked_cols

mice_imputed_df

Unnamed: 0,age (real),age (imputed),T4U (real),T4U (imputed),TT4 (real),TT4 (imputed),TSH (real),TSH (imputed),FTI (real),FTI (imputed),T3 (real),T3 (imputed),imputed
4257,81.0,81.0,0.78,0.780000,71.0,71.000000,1.900,5.913613,91.0,91.000000,1.5,1.5,TSH
6728,20.0,20.0,1.06,1.060000,109.0,110.324694,4.700,4.700000,103.0,103.000000,3.0,3.0,TT4
460,59.0,59.0,1.07,1.026363,98.0,98.000000,2.100,2.100000,92.0,92.000000,2.2,2.2,T4U
1578,53.0,53.0,0.74,0.740000,90.0,88.079091,0.150,0.150000,121.0,121.000000,1.1,1.1,TT4
7342,22.0,22.0,1.22,1.139541,120.0,120.000000,0.660,0.660000,98.0,98.000000,2.2,2.2,T4U
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,41.0,41.0,1.00,1.009180,114.0,114.000000,6.000,6.000000,114.0,114.000000,1.9,1.9,T4U
6722,46.0,46.0,1.00,1.000000,131.0,131.000000,0.035,-4.061492,131.0,131.000000,4.0,4.0,TSH
7182,47.0,47.0,1.01,0.996210,69.0,69.000000,1.500,1.500000,68.0,68.000000,2.9,2.9,T4U
3777,55.0,55.0,1.24,1.240000,144.0,137.245631,11.000,11.000000,116.0,116.000000,1.5,1.5,TT4


In [27]:
mice_imputed_df[mice_imputed_df["imputed"] == "age"][["age (imputed)", "age (real)"]] 

Unnamed: 0,age (imputed),age (real)
2388,56.835418,74.0
8663,57.670982,60.0
3847,44.903766,15.0
4975,49.135445,15.0
1744,37.513024,64.0
...,...,...
27,52.580429,51.0
4770,54.177372,78.0
2374,52.711451,27.0
396,54.468409,20.0


In [28]:
real_cols = [col + " (real)" for col in sel_qual_cols]
imputed_cols = [col + " (imputed)" for col in sel_qual_cols]

In [29]:
imputed_sqr_err(sel_qual_cols, knn_imputed_df)

{'age': 505.5550458715596,
 'T4U': 0.028625446428571438,
 'TT4': 775.6923076923077,
 'TSH': 1199.6987682537235,
 'FTI': 758.169262264762,
 'T3': 0.9348451149425288}

In [30]:
imputed_sqr_err(sel_qual_cols, mice_imputed_df)

{'age': 300.1211420254812,
 'T4U': 0.008659140526038717,
 'TT4': 67.9018720695164,
 'TSH': 133.0033484327116,
 'FTI': 47.39268623554922,
 'T3': 0.7052811985467698}

In [31]:
display(knn_imputed_df[real_cols].mean())
display(knn_imputed_df[imputed_cols].mean())

age (real)     53.391304
T4U (real)      0.978729
TT4 (real)    104.268729
TSH (real)      7.229992
FTI (real)    108.398161
T3 (real)       1.945368
dtype: float64

age (imputed)     53.700669
T4U (imputed)      0.972559
TT4 (imputed)    103.007860
TSH (imputed)      7.723181
FTI (imputed)    108.304766
T3 (imputed)       1.924958
dtype: float64

In [32]:
display(knn_imputed_df[real_cols].std())
display(knn_imputed_df[imputed_cols].std())

age (real)    18.703471
T4U (real)     0.201820
TT4 (real)    36.788046
TSH (real)    35.280395
FTI (real)    36.049734
T3 (real)      0.839922
dtype: float64

age (imputed)    17.767620
T4U (imputed)     0.186708
TT4 (imputed)    35.436183
TSH (imputed)    38.862895
FTI (imputed)    33.781762
T3 (imputed)      0.775156
dtype: float64