In [None]:
'''
%pip install scikit-learn==1.1.2
%pip install scipy==1.9.1
%pip install missingpy==0.2.0
'''

In [1]:
import numpy as np
import pandas as pd
from utils.data_split import k_fold_split
from utils.forgetter import forget_random_col_per_sample
from utils.category_utils import transform_categorical_columns, inverse_transform_categorical_columns

In [2]:
import sklearn.neighbors._base
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest

In [3]:
import warnings
warnings.simplefilter('ignore')

In [4]:
qual_cols = { "age", "TSH", "T3", "TT4", "T4U", "FTI", "TBG" }
cat_cols = { "sick", "referral_source", "on_antithyroid_meds", "lithium", "goitre", "tumor", "thyroid_surgery", "sex", "target", "on_thyroxine", "psych", "I131_treatment", "pregnant" }

In [5]:
remove_qual_cols = { "TBG" }
remove_cat_cols = { "referral_source", "target" }

In [6]:
sel_qual_cols = list(qual_cols.difference(remove_qual_cols))
sel_cat_cols = list(cat_cols.difference(remove_cat_cols))
display(sel_qual_cols)
display(sel_cat_cols)

['T4U', 'FTI', 'TSH', 'TT4', 'T3', 'age']

['I131_treatment',
 'sex',
 'sick',
 'pregnant',
 'goitre',
 'lithium',
 'tumor',
 'psych',
 'thyroid_surgery',
 'on_antithyroid_meds',
 'on_thyroxine']

In [7]:
df = pd.read_csv("./data/thyroidDF.csv")
df

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,target,patient_id
0,29,F,f,f,f,f,f,f,f,t,...,,f,,f,,f,,other,-,840801013
1,29,F,f,f,f,f,f,f,f,f,...,128.0,f,,f,,f,,other,-,840801014
2,41,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,11.0,other,-,840801042
3,36,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,26.0,other,-,840803046
4,32,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,36.0,other,S,840803047
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9167,56,M,f,f,f,f,f,f,f,f,...,64.0,t,0.83,t,77.0,f,,SVI,-,870119022
9168,22,M,f,f,f,f,f,f,f,f,...,91.0,t,0.92,t,99.0,f,,SVI,-,870119023
9169,69,M,f,f,f,f,f,f,f,f,...,113.0,t,1.27,t,89.0,f,,SVI,I,870119025
9170,47,F,f,f,f,f,f,f,f,f,...,75.0,t,0.85,t,88.0,f,,other,-,870119027


In [8]:
transformed_df, label_encoders = transform_categorical_columns(df, sel_cat_cols)
transformed_df = transformed_df[sel_cat_cols+sel_qual_cols]
transformed_df

Unnamed: 0,I131_treatment,sex,sick,pregnant,goitre,lithium,tumor,psych,thyroid_surgery,on_antithyroid_meds,on_thyroxine,T4U,FTI,TSH,TT4,T3,age
0,0,0,0,0,0,0,0,0,0,0,0,,,0.3,,,29
1,0,0,0,0,0,0,0,0,0,0,0,,,1.6,128.0,1.9,29
2,0,0,0,0,0,0,0,0,0,0,0,,,,,,41
3,0,0,0,0,0,0,0,0,0,0,0,,,,,,36
4,0,0,0,0,0,0,0,0,0,0,0,,,,,,32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9167,0,1,0,0,0,0,0,0,0,0,0,0.83,77.0,,64.0,,56
9168,0,1,0,0,0,0,0,0,0,0,0,0.92,99.0,,91.0,,22
9169,0,1,0,0,0,0,0,0,0,0,0,1.27,89.0,,113.0,,69
9170,0,0,0,0,0,0,0,0,0,0,0,0.85,88.0,,75.0,,47


In [9]:
transformed_df = transformed_df.dropna()
transformed_df.shape

(5984, 17)

In [10]:
weight_qual_map = {
    "age" : 1,
    "T3"  : 1,
    "T4U" : 1,
    "TSH" : 1,
    "TT4" : 1,
    "FTI" : 1
}

weight_cat_map = {
    "sick" : 1,
    "referral_source" : 1,
    "on_antithyroid_meds" : 1,
    "lithium" : 1,
    "goitre" : 1,
    "tumor" : 1,
    "thyroid_surgery" : 1,
    "sex" : 1,
    "target" : 1,
    "on_thyroxine" : 1,
    "psych" : 1,
    "I131_treatment" : 1,
    "pregnant" : 1
}

weight_map = weight_qual_map | weight_cat_map
print(weight_map)

{'age': 1, 'T3': 1, 'T4U': 1, 'TSH': 1, 'TT4': 1, 'FTI': 1, 'sick': 1, 'referral_source': 1, 'on_antithyroid_meds': 1, 'lithium': 1, 'goitre': 1, 'tumor': 1, 'thyroid_surgery': 1, 'sex': 1, 'target': 1, 'on_thyroxine': 1, 'psych': 1, 'I131_treatment': 1, 'pregnant': 1}


In [11]:
k = 5

In [12]:
print(f"train: {transformed_df.shape[0] / k * (k-1)}\ntest: {transformed_df.shape[0] / k}")

train: 4787.2
test: 1196.8


In [13]:
train_df, test_df = k_fold_split(transformed_df, k)

In [14]:
seed = 11
random_generator = np.random.default_rng(seed)

In [15]:
train_missing_vals_df, train_missing_col_map, train_missing_vals_idxs = forget_random_col_per_sample(train_df, weight_map, random_generator)

In [16]:
train_missing_vals_df.shape

(4787, 17)

In [17]:
train_df.columns

Index(['I131_treatment', 'sex', 'sick', 'pregnant', 'goitre', 'lithium',
       'tumor', 'psych', 'thyroid_surgery', 'on_antithyroid_meds',
       'on_thyroxine', 'T4U', 'FTI', 'TSH', 'TT4', 'T3', 'age'],
      dtype='object')

In [18]:
imputer = MissForest(missing_values=np.NaN)
X = train_df.to_numpy()
X_imputed = imputer.fit_transform(X, cat_vars=list(range(len(sel_cat_cols))))

In [19]:
imputer.statistics_['col_means']

array([  0.97498642, 113.20124086,   5.27718717, 108.23581784,
         1.95940881,  80.74952998])

In [20]:
imputer.statistics_['col_modes']

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [21]:
imputed_df = pd.DataFrame(X_imputed, columns=sel_cat_cols+sel_qual_cols)
imputed_df = inverse_transform_categorical_columns(imputed_df, label_encoders)
imputed_df

Unnamed: 0,I131_treatment,sex,sick,pregnant,goitre,lithium,tumor,psych,thyroid_surgery,on_antithyroid_meds,on_thyroxine,T4U,FTI,TSH,TT4,T3,age
0,f,F,f,f,f,f,f,f,f,f,f,0.97,101.0,1.80,98.0,2.3,55.0
1,f,F,f,f,f,f,f,t,f,f,f,0.94,100.0,1.10,94.0,1.8,80.0
2,f,F,f,f,f,f,f,f,f,f,f,0.80,231.0,0.05,185.0,1.7,50.0
3,f,F,f,f,f,f,f,f,f,f,f,1.05,118.0,0.91,124.0,2.4,46.0
4,f,F,f,f,f,f,f,f,f,f,f,0.99,13.0,60.00,13.0,0.3,53.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4782,f,,f,f,f,f,f,f,f,f,f,0.38,118.0,0.35,44.0,0.3,70.0
4783,f,M,f,f,f,f,f,f,f,f,f,1.02,90.0,1.30,91.0,2.5,62.0
4784,f,F,f,f,f,f,f,f,f,f,f,1.05,69.0,1.40,72.0,1.6,72.0
4785,f,F,f,f,f,f,f,f,f,f,f,1.34,129.0,0.90,172.0,2.6,22.0


In [22]:
imputed_means = np.concatenate([imputer.statistics_['col_modes'].flatten(), imputer.statistics_['col_means']])
imputed_means_df = pd.DataFrame([imputed_means], columns=sel_cat_cols+sel_qual_cols)
imputed_means_df = inverse_transform_categorical_columns(imputed_means_df, label_encoders)
imputed_means_df

Unnamed: 0,I131_treatment,sex,sick,pregnant,goitre,lithium,tumor,psych,thyroid_surgery,on_antithyroid_meds,on_thyroxine,T4U,FTI,TSH,TT4,T3,age
0,f,F,f,f,f,f,f,f,f,f,f,0.974986,113.201241,5.277187,108.235818,1.959409,80.74953
