In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

In [2]:
qual_cols = { "age", "TSH", "T3", "TT4", "T4U", "FTI", "TBG" }
cat_cols = { "sex", "on_thyroxine", "on_antithyroid_meds", "sick", "pregnant", "thyroid_surgery", "I131_treatment", "lithium", "goitre", "tumor", "psych", "referral_source", "target" }

In [3]:
remove_qual_cols = { "TBG" }
remove_cat_cols = set()

In [4]:
k = 10
neighbours = 2

In [5]:
seed = 42

In [6]:
random_generator = np.random.default_rng(seed=seed)

In [7]:
sel_qual_cols = list(qual_cols.difference(remove_qual_cols))
sel_cat_cols = list(cat_cols.difference(remove_cat_cols))
display(sel_qual_cols)
display(sel_cat_cols)

['FTI', 'T4U', 'T3', 'TSH', 'TT4', 'age']

['sex',
 'I131_treatment',
 'psych',
 'on_thyroxine',
 'target',
 'goitre',
 'lithium',
 'pregnant',
 'tumor',
 'referral_source',
 'sick',
 'thyroid_surgery',
 'on_antithyroid_meds']

In [8]:
df = pd.read_csv("./data/thyroidDF.csv")
df

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,target,patient_id
0,29,F,f,f,f,f,f,f,f,t,...,,f,,f,,f,,other,-,840801013
1,29,F,f,f,f,f,f,f,f,f,...,128.0,f,,f,,f,,other,-,840801014
2,41,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,11.0,other,-,840801042
3,36,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,26.0,other,-,840803046
4,32,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,36.0,other,S,840803047
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9167,56,M,f,f,f,f,f,f,f,f,...,64.0,t,0.83,t,77.0,f,,SVI,-,870119022
9168,22,M,f,f,f,f,f,f,f,f,...,91.0,t,0.92,t,99.0,f,,SVI,-,870119023
9169,69,M,f,f,f,f,f,f,f,f,...,113.0,t,1.27,t,89.0,f,,SVI,I,870119025
9170,47,F,f,f,f,f,f,f,f,f,...,75.0,t,0.85,t,88.0,f,,other,-,870119027


In [9]:
df.dropna()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,target,patient_id
167,40,F,f,f,f,f,f,f,f,f,...,3.9,t,0.83,t,5.0,t,28.0,other,F,840827019
5256,35,F,f,f,f,f,f,t,f,f,...,73.0,t,1.16,t,63.0,t,37.0,other,-,851128040
6044,77,F,f,f,f,f,f,f,f,f,...,120.0,t,0.96,t,124.0,t,45.0,SVI,-,860305064
6045,73,M,f,f,f,f,f,f,f,f,...,89.0,t,0.74,t,119.0,t,24.0,SVI,-,860305065
6747,77,F,f,f,f,f,f,f,f,f,...,131.0,t,1.04,t,126.0,t,25.0,SVI,K,860702030
6773,74,F,f,f,f,f,f,f,f,f,...,116.0,t,0.81,t,143.0,t,22.0,SVI,-,860703046
6862,60,M,f,f,f,f,f,f,f,f,...,92.0,t,0.84,t,110.0,t,21.0,other,-,860710043
6863,66,F,f,f,f,f,f,f,f,f,...,138.0,t,0.8,t,173.0,t,15.0,SVI,-,860710044
6880,42,F,f,f,f,f,f,f,f,f,...,106.0,t,0.98,t,108.0,t,27.0,other,-,860711039
6934,29,F,f,f,f,f,f,f,f,f,...,122.0,t,1.14,t,107.0,t,36.0,SVI,-,860717007


In [10]:
df[sel_qual_cols]

Unnamed: 0,FTI,T4U,T3,TSH,TT4,age
0,,,,0.3,,29
1,,,1.9,1.6,128.0,29
2,,,,,,41
3,,,,,,36
4,,,,,,32
...,...,...,...,...,...,...
9167,77.0,0.83,,,64.0,56
9168,99.0,0.92,,,91.0,22
9169,89.0,1.27,,,113.0,69
9170,88.0,0.85,,,75.0,47


In [11]:
df[sel_cat_cols]

Unnamed: 0,sex,I131_treatment,psych,on_thyroxine,target,goitre,lithium,pregnant,tumor,referral_source,sick,thyroid_surgery,on_antithyroid_meds
0,F,f,f,f,-,f,f,f,f,other,f,f,f
1,F,f,f,f,-,f,f,f,f,other,f,f,f
2,F,f,f,f,-,f,f,f,f,other,f,f,f
3,F,f,f,f,-,f,f,f,f,other,f,f,f
4,F,f,f,f,S,f,f,f,f,other,f,f,f
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9167,M,f,f,f,-,f,f,f,f,SVI,f,f,f
9168,M,f,f,f,-,f,f,f,f,SVI,f,f,f
9169,M,f,f,f,I,f,f,f,f,SVI,f,f,f
9170,F,f,f,f,-,f,f,f,f,other,f,f,f


In [12]:
qual_na_df = df[sel_qual_cols].dropna()
qual_na_df

Unnamed: 0,FTI,T4U,T3,TSH,TT4,age
19,85.0,1.06,2.4,1.50,90.0,36
21,96.0,1.08,2.3,1.20,104.0,40
22,105.0,0.84,2.1,5.90,88.0,40
23,95.0,1.13,2.4,0.05,107.0,77
27,106.0,0.87,2.1,0.05,93.0,51
...,...,...,...,...,...,...
9129,94.0,0.90,1.8,0.73,85.0,65
9130,113.0,1.19,2.1,4.10,135.0,65
9134,39.0,1.25,1.0,53.00,49.0,74
9137,81.0,0.73,1.3,2.30,59.0,42


In [13]:
from data_split import k_fold_split

In [14]:
complete_vals_df, missing_vals_df = k_fold_split(qual_na_df, k, random_generator)

In [15]:
complete_vals_df

Unnamed: 0,FTI,T4U,T3,TSH,TT4,age
7881,115.0,1.08,1.5,1.300000,124.0,48.0
403,107.0,0.70,2.0,1.300000,75.0,51.0
7412,95.0,0.91,2.2,10.000000,86.0,34.0
8483,120.0,0.97,2.4,1.800000,116.0,50.0
1846,118.0,1.04,2.3,0.800000,124.0,80.0
...,...,...,...,...,...,...
5710,104.0,1.08,2.0,0.380000,113.0,65511.0
682,266.0,0.50,1.9,0.100000,133.0,46.0
6393,149.0,1.48,2.4,0.030000,220.0,28.0
8767,35.0,0.97,1.7,16.799999,34.0,57.0


In [16]:
missing_vals_df

Unnamed: 0,FTI,T4U,T3,TSH,TT4,age
92,134.0,0.73,0.4,0.20,98.0,88.0
312,17.0,0.94,0.2,145.00,16.0,65.0
1347,188.0,0.81,2.2,0.10,153.0,33.0
4525,145.0,0.96,1.9,0.58,140.0,85.0
4328,101.0,0.98,2.3,0.33,99.0,74.0
...,...,...,...,...,...,...
7528,85.0,1.09,2.5,2.30,93.0,55.0
2110,71.0,0.93,2.0,1.80,66.0,42.0
3210,97.0,1.06,1.9,1.80,102.0,61.0
4874,69.0,1.07,2.4,1.80,73.0,90.0


In [17]:
missing_vals_idxs = set(missing_vals_df.index)

In [18]:
missing_vals_T3_df = missing_vals_df.copy()
missing_vals_T3_df["T3"] = np.NaN
missing_vals_T3_df

Unnamed: 0,FTI,T4U,T3,TSH,TT4,age
92,134.0,0.73,,0.20,98.0,88.0
312,17.0,0.94,,145.00,16.0,65.0
1347,188.0,0.81,,0.10,153.0,33.0
4525,145.0,0.96,,0.58,140.0,85.0
4328,101.0,0.98,,0.33,99.0,74.0
...,...,...,...,...,...,...
7528,85.0,1.09,,2.30,93.0,55.0
2110,71.0,0.93,,1.80,66.0,42.0
3210,97.0,1.06,,1.80,102.0,61.0
4874,69.0,1.07,,1.80,73.0,90.0


In [19]:
complete_df = pd.concat([complete_vals_df, missing_vals_df])
complete_df

Unnamed: 0,FTI,T4U,T3,TSH,TT4,age
7881,115.0,1.08,1.5,1.3,124.0,48.0
403,107.0,0.70,2.0,1.3,75.0,51.0
7412,95.0,0.91,2.2,10.0,86.0,34.0
8483,120.0,0.97,2.4,1.8,116.0,50.0
1846,118.0,1.04,2.3,0.8,124.0,80.0
...,...,...,...,...,...,...
7528,85.0,1.09,2.5,2.3,93.0,55.0
2110,71.0,0.93,2.0,1.8,66.0,42.0
3210,97.0,1.06,1.9,1.8,102.0,61.0
4874,69.0,1.07,2.4,1.8,73.0,90.0


In [20]:
T3_df = pd.concat([complete_vals_df, missing_vals_T3_df])
T3_df

Unnamed: 0,FTI,T4U,T3,TSH,TT4,age
7881,115.0,1.08,1.5,1.3,124.0,48.0
403,107.0,0.70,2.0,1.3,75.0,51.0
7412,95.0,0.91,2.2,10.0,86.0,34.0
8483,120.0,0.97,2.4,1.8,116.0,50.0
1846,118.0,1.04,2.3,0.8,124.0,80.0
...,...,...,...,...,...,...
7528,85.0,1.09,,2.3,93.0,55.0
2110,71.0,0.93,,1.8,66.0,42.0
3210,97.0,1.06,,1.8,102.0,61.0
4874,69.0,1.07,,1.8,73.0,90.0


In [21]:
imputer = KNNImputer(n_neighbors=neighbours)
filled = imputer.fit_transform(T3_df)
df_filled = pd.DataFrame(filled, columns=T3_df.columns, index=T3_df.index)
df_filled["T3 (real)"] = complete_df["T3"]
df_filled["T3 (imputed)"] = df_filled["T3"]
df_filled.drop(["T3"], axis=1, inplace=True)
df_filled

Unnamed: 0,FTI,T4U,TSH,TT4,age,T3 (real),T3 (imputed)
7881,115.0,1.08,1.3,124.0,48.0,1.5,1.50
403,107.0,0.70,1.3,75.0,51.0,2.0,2.00
7412,95.0,0.91,10.0,86.0,34.0,2.2,2.20
8483,120.0,0.97,1.8,116.0,50.0,2.4,2.40
1846,118.0,1.04,0.8,124.0,80.0,2.3,2.30
...,...,...,...,...,...,...,...
7528,85.0,1.09,2.3,93.0,55.0,2.5,2.05
2110,71.0,0.93,1.8,66.0,42.0,2.0,2.40
3210,97.0,1.06,1.8,102.0,61.0,1.9,2.10
4874,69.0,1.07,1.8,73.0,90.0,2.4,1.40
