In [1]:
'''
%pip install scikit-learn==1.1.2
%pip install scipy==1.9.1
%pip install missingpy==0.2.0
'''

'\n%pip install scikit-learn==1.1.2\n%pip install scipy==1.9.1\n%pip install missingpy==0.2.0\n'

In [2]:
import numpy as np
import pandas as pd
from utils.data_split import k_fold_split
from utils.forgetter import forget_random_col_per_sample
from utils.comparison_utils import compare_imputations
from utils.category_utils import transform_categorical_columns, inverse_transform_categorical_columns

In [3]:
import sklearn.neighbors._base
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest

In [4]:
import warnings
warnings.simplefilter('ignore')

In [5]:
qual_cols = { "age", "TSH", "T3", "TT4", "T4U", "FTI", "TBG" }
cat_cols = { "sick", "referral_source", "on_antithyroid_meds", "lithium", "goitre", "tumor", "thyroid_surgery", "sex", "target", "on_thyroxine", "psych", "I131_treatment", "pregnant" }

In [6]:
remove_qual_cols = { "TBG" }
remove_cat_cols = { "referral_source", "target" }

In [7]:
sel_qual_cols = list(qual_cols.difference(remove_qual_cols))
sel_cat_cols = list(cat_cols.difference(remove_cat_cols))
display(sel_qual_cols)
display(sel_cat_cols)

['TT4', 'T3', 'T4U', 'age', 'TSH', 'FTI']

['psych',
 'lithium',
 'I131_treatment',
 'pregnant',
 'sick',
 'sex',
 'on_antithyroid_meds',
 'tumor',
 'thyroid_surgery',
 'goitre',
 'on_thyroxine']

In [8]:
df = pd.read_csv("./data/thyroidDF.csv")
df

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,target,patient_id
0,29,F,f,f,f,f,f,f,f,t,...,,f,,f,,f,,other,-,840801013
1,29,F,f,f,f,f,f,f,f,f,...,128.0,f,,f,,f,,other,-,840801014
2,41,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,11.0,other,-,840801042
3,36,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,26.0,other,-,840803046
4,32,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,36.0,other,S,840803047
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9167,56,M,f,f,f,f,f,f,f,f,...,64.0,t,0.83,t,77.0,f,,SVI,-,870119022
9168,22,M,f,f,f,f,f,f,f,f,...,91.0,t,0.92,t,99.0,f,,SVI,-,870119023
9169,69,M,f,f,f,f,f,f,f,f,...,113.0,t,1.27,t,89.0,f,,SVI,I,870119025
9170,47,F,f,f,f,f,f,f,f,f,...,75.0,t,0.85,t,88.0,f,,other,-,870119027


In [9]:
transformed_df, label_encoders = transform_categorical_columns(df, sel_cat_cols)
transformed_df = transformed_df[sel_cat_cols+sel_qual_cols]
transformed_df

Unnamed: 0,psych,lithium,I131_treatment,pregnant,sick,sex,on_antithyroid_meds,tumor,thyroid_surgery,goitre,on_thyroxine,TT4,T3,T4U,age,TSH,FTI
0,0,0,0,0,0,0,0,0,0,0,0,,,,29,0.3,
1,0,0,0,0,0,0,0,0,0,0,0,128.0,1.9,,29,1.6,
2,0,0,0,0,0,0,0,0,0,0,0,,,,41,,
3,0,0,0,0,0,0,0,0,0,0,0,,,,36,,
4,0,0,0,0,0,0,0,0,0,0,0,,,,32,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9167,0,0,0,0,0,1,0,0,0,0,0,64.0,,0.83,56,,77.0
9168,0,0,0,0,0,1,0,0,0,0,0,91.0,,0.92,22,,99.0
9169,0,0,0,0,0,1,0,0,0,0,0,113.0,,1.27,69,,89.0
9170,0,0,0,0,0,0,0,0,0,0,0,75.0,,0.85,47,,88.0


In [10]:
transformed_df = transformed_df.dropna()
transformed_df.shape

(5984, 17)

In [11]:
weight_qual_map = {
    "age" : 1,
    "T3"  : 1,
    "T4U" : 1,
    "TSH" : 1,
    "TT4" : 1,
    "FTI" : 1
}

weight_cat_map = {
    "sick" : 1,
    "referral_source" : 1,
    "on_antithyroid_meds" : 1,
    "lithium" : 1,
    "goitre" : 1,
    "tumor" : 1,
    "thyroid_surgery" : 1,
    "sex" : 1,
    "target" : 1,
    "on_thyroxine" : 1,
    "psych" : 1,
    "I131_treatment" : 1,
    "pregnant" : 1
}

weight_map = weight_qual_map | weight_cat_map
print(weight_map)

{'age': 1, 'T3': 1, 'T4U': 1, 'TSH': 1, 'TT4': 1, 'FTI': 1, 'sick': 1, 'referral_source': 1, 'on_antithyroid_meds': 1, 'lithium': 1, 'goitre': 1, 'tumor': 1, 'thyroid_surgery': 1, 'sex': 1, 'target': 1, 'on_thyroxine': 1, 'psych': 1, 'I131_treatment': 1, 'pregnant': 1}


In [12]:
k = 5

In [13]:
print(f"train: {transformed_df.shape[0] / k * (k-1)}\ntest: {transformed_df.shape[0] / k}")

train: 4787.2
test: 1196.8


In [14]:
train_df, test_df = k_fold_split(transformed_df, k)

In [15]:
seed = 11
random_generator = np.random.default_rng(seed)

In [16]:
train_missing_vals_df, train_missing_col_map, train_missing_vals_idxs = forget_random_col_per_sample(train_df, weight_map, random_generator)

In [17]:
train_missing_vals_df.shape

(4787, 17)

In [18]:
train_missing_vals_df.columns

Index(['psych', 'lithium', 'I131_treatment', 'pregnant', 'sick', 'sex',
       'on_antithyroid_meds', 'tumor', 'thyroid_surgery', 'goitre',
       'on_thyroxine', 'TT4', 'T3', 'T4U', 'age', 'TSH', 'FTI'],
      dtype='object')

In [19]:
imputer = MissForest(missing_values=np.NaN)
X = train_missing_vals_df.to_numpy()
X_imputed = imputer.fit_transform(X, cat_vars=list(range(len(sel_cat_cols))))

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4


In [20]:
imputed_df = pd.DataFrame(X_imputed, columns=sel_cat_cols+sel_qual_cols)
imputed_data_df = inverse_transform_categorical_columns(imputed_df.copy(), label_encoders)
imputed_data_df

Unnamed: 0,psych,lithium,I131_treatment,pregnant,sick,sex,on_antithyroid_meds,tumor,thyroid_surgery,goitre,on_thyroxine,TT4,T3,T4U,age,TSH,FTI
0,f,f,f,f,f,F,f,t,f,f,f,96.00,2.5,1.2783,37.00,13.00,73.0
1,f,f,f,f,f,,f,f,f,f,f,76.00,1.7,0.8600,78.00,0.30,88.0
2,f,f,f,f,f,F,f,f,f,f,t,37.00,2.4,0.8100,39.00,0.07,45.0
3,f,f,f,f,f,F,f,f,f,f,f,78.00,1.8,1.0100,48.99,0.81,77.0
4,f,t,f,f,f,F,f,f,f,f,f,111.00,1.8,1.0193,35.00,1.10,109.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4782,f,f,f,f,f,F,f,f,f,f,f,70.00,1.6,0.9300,84.00,0.70,75.0
4783,f,t,f,f,f,M,f,f,f,f,f,98.00,2.2,0.9300,45.00,2.30,105.0
4784,f,f,f,f,f,M,f,f,f,f,f,157.00,1.3,0.9100,65.00,2.30,172.0
4785,f,f,f,f,f,F,f,f,f,f,f,93.79,0.5,0.8300,63.00,14.00,114.0


In [21]:
imputed_means = np.concatenate([imputer.statistics_['col_modes'].flatten(), imputer.statistics_['col_means']])
imputed_means_df = pd.DataFrame([imputed_means], columns=sel_cat_cols+sel_qual_cols)
imputed_means_df = inverse_transform_categorical_columns(imputed_means_df, label_encoders)
imputed_means_df

Unnamed: 0,psych,lithium,I131_treatment,pregnant,sick,sex,on_antithyroid_meds,tumor,thyroid_surgery,goitre,on_thyroxine,TT4,T3,T4U,age,TSH,FTI
0,f,f,f,f,f,F,f,f,f,f,f,108.268181,1.966452,0.97588,67.89923,5.279069,113.197563


In [22]:
results = compare_imputations(train_df, imputed_df)
results

Unnamed: 0,Column,MSE,RMSE,MAE
0,psych,0.002507,0.050068,0.002507
1,lithium,0.000836,0.028907,0.000836
2,I131_treatment,0.000627,0.025034,0.000627
3,pregnant,0.001044,0.032319,0.001044
4,sick,0.002089,0.045705,0.002089
5,sex,0.026739,0.163521,0.021308
6,on_antithyroid_meds,0.000836,0.028907,0.000836
7,tumor,0.001671,0.04088,0.001671
8,thyroid_surgery,0.000627,0.025034,0.000627
9,goitre,0.0,0.0,0.0
