In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

In [2]:
qual_cols = { "age", "TSH", "T3", "TT4", "T4U", "FTI", "TBG" }
cat_cols = { "sex", "on_thyroxine", "on_antithyroid_meds", "sick", "pregnant", "thyroid_surgery", "I131_treatment", "lithium", "goitre", "tumor", "psych", "referral_source", "target" }

In [3]:
remove_qual_cols = { "TBG" }
remove_cat_cols = set()

In [4]:
k = 10
neighbours = 2

In [5]:
seed = 42

In [6]:
random_generator = np.random.default_rng(seed=seed)

In [7]:
sel_qual_cols = list(qual_cols.difference(remove_qual_cols))
sel_cat_cols = list(cat_cols.difference(remove_cat_cols))
display(sel_qual_cols)
display(sel_cat_cols)

['T4U', 'age', 'FTI', 'T3', 'TT4', 'TSH']

['on_antithyroid_meds',
 'pregnant',
 'I131_treatment',
 'thyroid_surgery',
 'goitre',
 'sick',
 'lithium',
 'target',
 'sex',
 'on_thyroxine',
 'referral_source',
 'tumor',
 'psych']

In [8]:
df = pd.read_csv("./data/thyroidDF.csv")
df

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,target,patient_id
0,29,F,f,f,f,f,f,f,f,t,...,,f,,f,,f,,other,-,840801013
1,29,F,f,f,f,f,f,f,f,f,...,128.0,f,,f,,f,,other,-,840801014
2,41,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,11.0,other,-,840801042
3,36,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,26.0,other,-,840803046
4,32,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,36.0,other,S,840803047
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9167,56,M,f,f,f,f,f,f,f,f,...,64.0,t,0.83,t,77.0,f,,SVI,-,870119022
9168,22,M,f,f,f,f,f,f,f,f,...,91.0,t,0.92,t,99.0,f,,SVI,-,870119023
9169,69,M,f,f,f,f,f,f,f,f,...,113.0,t,1.27,t,89.0,f,,SVI,I,870119025
9170,47,F,f,f,f,f,f,f,f,f,...,75.0,t,0.85,t,88.0,f,,other,-,870119027


In [9]:
df.dropna()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,target,patient_id
167,40,F,f,f,f,f,f,f,f,f,...,3.9,t,0.83,t,5.0,t,28.0,other,F,840827019
5256,35,F,f,f,f,f,f,t,f,f,...,73.0,t,1.16,t,63.0,t,37.0,other,-,851128040
6044,77,F,f,f,f,f,f,f,f,f,...,120.0,t,0.96,t,124.0,t,45.0,SVI,-,860305064
6045,73,M,f,f,f,f,f,f,f,f,...,89.0,t,0.74,t,119.0,t,24.0,SVI,-,860305065
6747,77,F,f,f,f,f,f,f,f,f,...,131.0,t,1.04,t,126.0,t,25.0,SVI,K,860702030
6773,74,F,f,f,f,f,f,f,f,f,...,116.0,t,0.81,t,143.0,t,22.0,SVI,-,860703046
6862,60,M,f,f,f,f,f,f,f,f,...,92.0,t,0.84,t,110.0,t,21.0,other,-,860710043
6863,66,F,f,f,f,f,f,f,f,f,...,138.0,t,0.8,t,173.0,t,15.0,SVI,-,860710044
6880,42,F,f,f,f,f,f,f,f,f,...,106.0,t,0.98,t,108.0,t,27.0,other,-,860711039
6934,29,F,f,f,f,f,f,f,f,f,...,122.0,t,1.14,t,107.0,t,36.0,SVI,-,860717007


In [10]:
df[sel_qual_cols]

Unnamed: 0,T4U,age,FTI,T3,TT4,TSH
0,,29,,,,0.3
1,,29,,1.9,128.0,1.6
2,,41,,,,
3,,36,,,,
4,,32,,,,
...,...,...,...,...,...,...
9167,0.83,56,77.0,,64.0,
9168,0.92,22,99.0,,91.0,
9169,1.27,69,89.0,,113.0,
9170,0.85,47,88.0,,75.0,


In [11]:
df[sel_cat_cols]

Unnamed: 0,on_antithyroid_meds,pregnant,I131_treatment,thyroid_surgery,goitre,sick,lithium,target,sex,on_thyroxine,referral_source,tumor,psych
0,f,f,f,f,f,f,f,-,F,f,other,f,f
1,f,f,f,f,f,f,f,-,F,f,other,f,f
2,f,f,f,f,f,f,f,-,F,f,other,f,f
3,f,f,f,f,f,f,f,-,F,f,other,f,f
4,f,f,f,f,f,f,f,S,F,f,other,f,f
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9167,f,f,f,f,f,f,f,-,M,f,SVI,f,f
9168,f,f,f,f,f,f,f,-,M,f,SVI,f,f
9169,f,f,f,f,f,f,f,I,M,f,SVI,f,f
9170,f,f,f,f,f,f,f,-,F,f,other,f,f


In [12]:
qual_na_df = df[sel_qual_cols].dropna()
qual_na_df

Unnamed: 0,T4U,age,FTI,T3,TT4,TSH
19,1.06,36,85.0,2.4,90.0,1.50
21,1.08,40,96.0,2.3,104.0,1.20
22,0.84,40,105.0,2.1,88.0,5.90
23,1.13,77,95.0,2.4,107.0,0.05
27,0.87,51,106.0,2.1,93.0,0.05
...,...,...,...,...,...,...
9129,0.90,65,94.0,1.8,85.0,0.73
9130,1.19,65,113.0,2.1,135.0,4.10
9134,1.25,74,39.0,1.0,49.0,53.00
9137,0.73,42,81.0,1.3,59.0,2.30


In [13]:
from data_split import k_fold_split

In [14]:
imputed_df_map = dict()

complete_vals_df, remove_vals_df = k_fold_split(qual_na_df, k, random_generator)
missing_vals_idxs = list(remove_vals_df.index)

real_df = pd.concat([complete_vals_df, remove_vals_df])

for col in sel_qual_cols:
    missing_vals_df = remove_vals_df.copy()
    missing_vals_df[col] = np.NaN

    curr_df = pd.concat([complete_vals_df, missing_vals_df])

    imputer = KNNImputer(n_neighbors=neighbours)
    imputed_mat = imputer.fit_transform(curr_df)
    
    imputed_df = pd.DataFrame(imputed_mat, columns=curr_df.columns, index=curr_df.index)
    imputed_df["{} (real)".format(col)] = real_df[col]
    imputed_df["{} (imputed)".format(col)] = imputed_df[col]
    imputed_df.drop([col], axis=1, inplace=True)

    imputed_df = imputed_df.loc[missing_vals_idxs]

    imputed_df_map[col] = imputed_df

In [15]:
for col, imputed_df in imputed_df_map.items():
    display(col)
    display(imputed_df)

'T4U'

Unnamed: 0,age,FTI,T3,TT4,TSH,T4U (real),T4U (imputed)
92,88.0,134.0,0.4,98.0,0.20,0.73,0.745
312,65.0,17.0,0.2,16.0,145.00,0.94,1.020
1347,33.0,188.0,2.2,153.0,0.10,0.81,0.830
4525,85.0,145.0,1.9,140.0,0.58,0.96,0.965
4328,74.0,101.0,2.3,99.0,0.33,0.98,0.995
...,...,...,...,...,...,...,...
7528,55.0,85.0,2.5,93.0,2.30,1.09,1.070
2110,42.0,71.0,2.0,66.0,1.80,0.93,0.940
3210,61.0,97.0,1.9,102.0,1.80,1.06,1.050
4874,90.0,69.0,2.4,73.0,1.80,1.07,1.060


'age'

Unnamed: 0,T4U,FTI,T3,TT4,TSH,age (real),age (imputed)
92,0.73,134.0,0.4,98.0,0.20,88.0,36.5
312,0.94,17.0,0.2,16.0,145.00,65.0,78.5
1347,0.81,188.0,2.2,153.0,0.10,33.0,35.0
4525,0.96,145.0,1.9,140.0,0.58,85.0,39.0
4328,0.98,101.0,2.3,99.0,0.33,74.0,58.5
...,...,...,...,...,...,...,...
7528,1.09,85.0,2.5,93.0,2.30,55.0,57.5
2110,0.93,71.0,2.0,66.0,1.80,42.0,63.5
3210,1.06,97.0,1.9,102.0,1.80,61.0,46.5
4874,1.07,69.0,2.4,73.0,1.80,90.0,66.5


'FTI'

Unnamed: 0,T4U,age,T3,TT4,TSH,FTI (real),FTI (imputed)
92,0.73,88.0,0.4,98.0,0.20,134.0,95.5
312,0.94,65.0,0.2,16.0,145.00,17.0,9.0
1347,0.81,33.0,2.2,153.0,0.10,188.0,123.5
4525,0.96,85.0,1.9,140.0,0.58,145.0,146.5
4328,0.98,74.0,2.3,99.0,0.33,101.0,90.5
...,...,...,...,...,...,...,...
7528,1.09,55.0,2.5,93.0,2.30,85.0,101.5
2110,0.93,42.0,2.0,66.0,1.80,71.0,73.5
3210,1.06,61.0,1.9,102.0,1.80,97.0,118.5
4874,1.07,90.0,2.4,73.0,1.80,69.0,66.5


'T3'

Unnamed: 0,T4U,age,FTI,TT4,TSH,T3 (real),T3 (imputed)
92,0.73,88.0,134.0,98.0,0.20,0.4,1.450
312,0.94,65.0,17.0,16.0,145.00,0.2,0.225
1347,0.81,33.0,188.0,153.0,0.10,2.2,1.650
4525,0.96,85.0,145.0,140.0,0.58,1.9,1.950
4328,0.98,74.0,101.0,99.0,0.33,2.3,1.300
...,...,...,...,...,...,...,...
7528,1.09,55.0,85.0,93.0,2.30,2.5,2.050
2110,0.93,42.0,71.0,66.0,1.80,2.0,2.400
3210,1.06,61.0,97.0,102.0,1.80,1.9,2.100
4874,1.07,90.0,69.0,73.0,1.80,2.4,1.400


'TT4'

Unnamed: 0,T4U,age,FTI,T3,TSH,TT4 (real),TT4 (imputed)
92,0.73,88.0,134.0,0.4,0.20,98.0,114.5
312,0.94,65.0,17.0,0.2,145.00,16.0,24.5
1347,0.81,33.0,188.0,2.2,0.10,153.0,162.5
4525,0.96,85.0,145.0,1.9,0.58,140.0,135.0
4328,0.98,74.0,101.0,2.3,0.33,99.0,120.5
...,...,...,...,...,...,...,...
7528,1.09,55.0,85.0,2.5,2.30,93.0,78.5
2110,0.93,42.0,71.0,2.0,1.80,66.0,83.0
3210,1.06,61.0,97.0,1.9,1.80,102.0,93.5
4874,1.07,90.0,69.0,2.4,1.80,73.0,70.5


'TSH'

Unnamed: 0,T4U,age,FTI,T3,TT4,TSH (real),TSH (imputed)
92,0.73,88.0,134.0,0.4,98.0,0.20,1.3000
312,0.94,65.0,17.0,0.2,16.0,145.00,55.0000
1347,0.81,33.0,188.0,2.2,153.0,0.10,0.2000
4525,0.96,85.0,145.0,1.9,140.0,0.58,1.3750
4328,0.98,74.0,101.0,2.3,99.0,0.33,0.6750
...,...,...,...,...,...,...,...
7528,1.09,55.0,85.0,2.5,93.0,2.30,3.1000
2110,0.93,42.0,71.0,2.0,66.0,1.80,0.6525
3210,1.06,61.0,97.0,1.9,102.0,1.80,0.8000
4874,1.07,90.0,69.0,2.4,73.0,1.80,1.6250
