# Predictive Thyroid Disease Project - Feature Engineering

To make a predictive model for thyroid cancer patients to see how likely after treatment their cancer will reoccur.

# Initial Set Up

In [1]:
# General Data Manipulation
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
import datetime
import math

# Import my custom library 
import MyCustDataSciLib as MyCustDataSciLib

In [2]:
# Load your cleaned CSV file
df = pd.read_csv('3_Thyroid_Disease_EDA.csv') # Change it to Thyroid_Disease_EDA

In [3]:
df.head()

Unnamed: 0,age,age_group,gender,smoking,hx_smoking,hx_radiotherapy,thyroid_function,thyroid_function_3cat,thyroid_function_2cat,physical_examination,...,t_4cat,n,m,stage,stage_4cat,stage_2cat,response,response_3cat,recurrence,kmode_cluster_3cat
0,27,20s,f,no,no,no,euthyroid,normal,normal,single_nodular_goiter-left,...,t1,n0,m0,i,i,early,indeterminate,indeterminate,no,0
1,34,30s,f,no,yes,no,euthyroid,normal,normal,multinodular_goiter,...,t1,n0,m0,i,i,early,excellent,excellent,no,0
2,30,30s,f,no,no,no,euthyroid,normal,normal,single_nodular_goiter-right,...,t1,n0,m0,i,i,early,excellent,excellent,no,0
3,62,60s,f,no,no,no,euthyroid,normal,normal,single_nodular_goiter-right,...,t1,n0,m0,i,i,early,excellent,excellent,no,0
4,62,60s,f,no,no,no,euthyroid,normal,normal,multinodular_goiter,...,t1,n0,m0,i,i,early,excellent,excellent,no,0


In [4]:
# check the columns data type before edit
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   age                    383 non-null    int64 
 1   age_group              383 non-null    object
 2   gender                 383 non-null    object
 3   smoking                383 non-null    object
 4   hx_smoking             383 non-null    object
 5   hx_radiotherapy        383 non-null    object
 6   thyroid_function       383 non-null    object
 7   thyroid_function_3cat  383 non-null    object
 8   thyroid_function_2cat  383 non-null    object
 9   physical_examination   383 non-null    object
 10  adenopathy             383 non-null    object
 11  pathology              383 non-null    object
 12  focality               383 non-null    object
 13  risk                   383 non-null    object
 14  t                      383 non-null    object
 15  t_4cat                 

In [5]:
# again check for missing data, did it in Data Wrangling but double check if any EDA process messed it up.
df.isnull().sum()

age                      0
age_group                0
gender                   0
smoking                  0
hx_smoking               0
hx_radiotherapy          0
thyroid_function         0
thyroid_function_3cat    0
thyroid_function_2cat    0
physical_examination     0
adenopathy               0
pathology                0
focality                 0
risk                     0
t                        0
t_4cat                   0
n                        0
m                        0
stage                    0
stage_4cat               0
stage_2cat               0
response                 0
response_3cat            0
recurrence               0
kmode_cluster_3cat       0
dtype: int64

# Encoding Set Up

In [6]:
# check for binary columns 
binary_column_info, binary_column_names = MyCustDataSciLib.get_binary_columns(df, 'all')
binary_column_names

['gender',
 'smoking',
 'hx_smoking',
 'hx_radiotherapy',
 'thyroid_function_2cat',
 'focality',
 'm',
 'stage_2cat',
 'recurrence']

In [7]:
# make a list of non binary columns
all_columns = set(df.columns)
binary_columns_set = set(binary_column_names)
non_binary_column_names = list(all_columns - binary_columns_set)

non_binary_column_names

['physical_examination',
 'age',
 'age_group',
 't',
 'adenopathy',
 'kmode_cluster_3cat',
 'risk',
 'stage_4cat',
 'response_3cat',
 'stage',
 't_4cat',
 'pathology',
 'n',
 'response',
 'thyroid_function_3cat',
 'thyroid_function']

# Dummy Encoding

binary values into bools 0s and 1s

In [8]:
binary_dummy_encoded_df = pd.get_dummies(df[binary_column_names], drop_first=True)

binary_dummy_encoded_df

Unnamed: 0,gender_m,smoking_yes,hx_smoking_yes,hx_radiotherapy_yes,thyroid_function_2cat_normal,focality_uni-focal,m_m1,stage_2cat_late,recurrence_yes
0,False,False,False,False,True,True,False,False,False
1,False,False,True,False,True,True,False,False,False
2,False,False,False,False,True,True,False,False,False
3,False,False,False,False,True,True,False,False,False
4,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...
378,True,True,True,True,True,True,True,True,True
379,True,True,False,True,True,False,True,True,True
380,True,True,True,False,True,False,True,True,True
381,True,True,True,True,False,False,False,True,True


In [9]:
# add suffix to columns 
binary_dummy_encoded_df = MyCustDataSciLib.add_suffix_to_columns(binary_dummy_encoded_df,'_dummy_encoded')
binary_dummy_encoded_df

Unnamed: 0,gender_m_dummy_encoded,smoking_yes_dummy_encoded,hx_smoking_yes_dummy_encoded,hx_radiotherapy_yes_dummy_encoded,thyroid_function_2cat_normal_dummy_encoded,focality_uni-focal_dummy_encoded,m_m1_dummy_encoded,stage_2cat_late_dummy_encoded,recurrence_yes_dummy_encoded
0,False,False,False,False,True,True,False,False,False
1,False,False,True,False,True,True,False,False,False
2,False,False,False,False,True,True,False,False,False
3,False,False,False,False,True,True,False,False,False
4,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...
378,True,True,True,True,True,True,True,True,True
379,True,True,False,True,True,False,True,True,True
380,True,True,True,False,True,False,True,True,True
381,True,True,True,True,False,False,False,True,True


# One Hot Encoding

Each category in a column will be it's own category


In [12]:
# make a new df of non binary columns 
one_hot_encoded_df = df[non_binary_column_names].drop(columns=['age'])
one_hot_encoded_df

Unnamed: 0,physical_examination,age_group,t,adenopathy,kmode_cluster_3cat,risk,stage_4cat,response_3cat,stage,t_4cat,pathology,n,response,thyroid_function_3cat,thyroid_function
0,single_nodular_goiter-left,20s,t1a,no,0,low,i,indeterminate,i,t1,micropapillary,n0,indeterminate,normal,euthyroid
1,multinodular_goiter,30s,t1a,no,0,low,i,excellent,i,t1,micropapillary,n0,excellent,normal,euthyroid
2,single_nodular_goiter-right,30s,t1a,no,0,low,i,excellent,i,t1,micropapillary,n0,excellent,normal,euthyroid
3,single_nodular_goiter-right,60s,t1a,no,0,low,i,excellent,i,t1,micropapillary,n0,excellent,normal,euthyroid
4,multinodular_goiter,60s,t1a,no,0,low,i,excellent,i,t1,micropapillary,n0,excellent,normal,euthyroid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,single_nodular_goiter-right,70s,t4b,right,2,high,iv,negative,ivb,t4,papillary,n1b,biochemical_incomplete,normal,euthyroid
379,multinodular_goiter,80s,t4b,extensive,2,high,iv,negative,ivb,t4,papillary,n1b,structural_incomplete,normal,euthyroid
380,multinodular_goiter,70s,t4b,bilateral,2,high,iv,negative,ivb,t4,papillary,n1b,structural_incomplete,normal,euthyroid
381,multinodular_goiter,60s,t4b,extensive,2,high,iv,negative,iva,t4,hurthel_cell,n1b,structural_incomplete,clinical,clinical_hyperthyroidism


In [13]:
# pd.get_dummies set drop_first=True for one hot encoding
one_hot_encoded_df = pd.get_dummies(one_hot_encoded_df, drop_first=False)
one_hot_encoded_df

Unnamed: 0,kmode_cluster_3cat,physical_examination_diffuse_goiter,physical_examination_multinodular_goiter,physical_examination_normal,physical_examination_single_nodular_goiter-left,physical_examination_single_nodular_goiter-right,age_group_10s,age_group_20s,age_group_30s,age_group_40s,...,response_indeterminate,response_structural_incomplete,thyroid_function_3cat_clinical,thyroid_function_3cat_normal,thyroid_function_3cat_subclinical,thyroid_function_clinical_hyperthyroidism,thyroid_function_clinical_hypothyroidism,thyroid_function_euthyroid,thyroid_function_subclinical_hyperthyroidism,thyroid_function_subclinical_hypothyroidism
0,0,False,False,False,True,False,False,True,False,False,...,True,False,False,True,False,False,False,True,False,False
1,0,False,True,False,False,False,False,False,True,False,...,False,False,False,True,False,False,False,True,False,False
2,0,False,False,False,False,True,False,False,True,False,...,False,False,False,True,False,False,False,True,False,False
3,0,False,False,False,False,True,False,False,False,False,...,False,False,False,True,False,False,False,True,False,False
4,0,False,True,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,2,False,False,False,False,True,False,False,False,False,...,False,False,False,True,False,False,False,True,False,False
379,2,False,True,False,False,False,False,False,False,False,...,False,True,False,True,False,False,False,True,False,False
380,2,False,True,False,False,False,False,False,False,False,...,False,True,False,True,False,False,False,True,False,False
381,2,False,True,False,False,False,False,False,False,False,...,False,True,True,False,False,True,False,False,False,False


In [14]:
# add suffix to columns 
one_hot_encoded_df = MyCustDataSciLib.add_suffix_to_columns(one_hot_encoded_df,'_one_hot_encoded')
one_hot_encoded_df

Unnamed: 0,kmode_cluster_3cat_one_hot_encoded,physical_examination_diffuse_goiter_one_hot_encoded,physical_examination_multinodular_goiter_one_hot_encoded,physical_examination_normal_one_hot_encoded,physical_examination_single_nodular_goiter-left_one_hot_encoded,physical_examination_single_nodular_goiter-right_one_hot_encoded,age_group_10s_one_hot_encoded,age_group_20s_one_hot_encoded,age_group_30s_one_hot_encoded,age_group_40s_one_hot_encoded,...,response_indeterminate_one_hot_encoded,response_structural_incomplete_one_hot_encoded,thyroid_function_3cat_clinical_one_hot_encoded,thyroid_function_3cat_normal_one_hot_encoded,thyroid_function_3cat_subclinical_one_hot_encoded,thyroid_function_clinical_hyperthyroidism_one_hot_encoded,thyroid_function_clinical_hypothyroidism_one_hot_encoded,thyroid_function_euthyroid_one_hot_encoded,thyroid_function_subclinical_hyperthyroidism_one_hot_encoded,thyroid_function_subclinical_hypothyroidism_one_hot_encoded
0,0,False,False,False,True,False,False,True,False,False,...,True,False,False,True,False,False,False,True,False,False
1,0,False,True,False,False,False,False,False,True,False,...,False,False,False,True,False,False,False,True,False,False
2,0,False,False,False,False,True,False,False,True,False,...,False,False,False,True,False,False,False,True,False,False
3,0,False,False,False,False,True,False,False,False,False,...,False,False,False,True,False,False,False,True,False,False
4,0,False,True,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,2,False,False,False,False,True,False,False,False,False,...,False,False,False,True,False,False,False,True,False,False
379,2,False,True,False,False,False,False,False,False,False,...,False,True,False,True,False,False,False,True,False,False
380,2,False,True,False,False,False,False,False,False,False,...,False,True,False,True,False,False,False,True,False,False
381,2,False,True,False,False,False,False,False,False,False,...,False,True,True,False,False,True,False,False,False,False


# Label Encoding

In [15]:
# Create a copy of the data to avoid modifying the original
df_encoded = df.copy()

# Initialize a label encoder and create a dictionary to store encoded columns
label_encoder = LabelEncoder()
label_encoded_columns = {}  # To store only the newly encoded columns

# Loop through each column in non_binary_column_names
for column in non_binary_column_names:
    # Only encode columns of object (categorical) type
    if df_encoded[column].dtype == 'object':
        df_encoded[column] = label_encoder.fit_transform(df_encoded[column])
        label_encoded_columns[column] = df_encoded[column]  # Save the encoded column

# Create a dataframe with only the newly encoded columns
label_encoded_df = pd.DataFrame(label_encoded_columns)

label_encoded_df

Unnamed: 0,physical_examination,age_group,t,adenopathy,risk,stage_4cat,response_3cat,stage,t_4cat,pathology,n,response,thyroid_function_3cat,thyroid_function
0,3,1,0,3,2,0,1,0,0,2,0,2,1,2
1,1,2,0,3,2,0,0,0,0,2,0,1,1,2
2,4,2,0,3,2,0,0,0,0,2,0,1,1,2
3,4,5,0,3,2,0,0,0,0,2,0,1,1,2
4,1,5,0,3,2,0,0,0,0,2,0,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,4,6,6,5,0,3,2,4,3,3,2,0,1,2
379,1,7,6,1,0,3,2,4,3,3,2,3,1,2
380,1,6,6,0,0,3,2,4,3,3,2,3,1,2
381,1,5,6,1,0,3,2,3,3,1,2,3,0,0


In [16]:
# add suffix to columns 
label_encoded_df = MyCustDataSciLib.add_suffix_to_columns(label_encoded_df,'_label_encoded')
label_encoded_df

Unnamed: 0,physical_examination_label_encoded,age_group_label_encoded,t_label_encoded,adenopathy_label_encoded,risk_label_encoded,stage_4cat_label_encoded,response_3cat_label_encoded,stage_label_encoded,t_4cat_label_encoded,pathology_label_encoded,n_label_encoded,response_label_encoded,thyroid_function_3cat_label_encoded,thyroid_function_label_encoded
0,3,1,0,3,2,0,1,0,0,2,0,2,1,2
1,1,2,0,3,2,0,0,0,0,2,0,1,1,2
2,4,2,0,3,2,0,0,0,0,2,0,1,1,2
3,4,5,0,3,2,0,0,0,0,2,0,1,1,2
4,1,5,0,3,2,0,0,0,0,2,0,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,4,6,6,5,0,3,2,4,3,3,2,0,1,2
379,1,7,6,1,0,3,2,4,3,3,2,3,1,2
380,1,6,6,0,0,3,2,4,3,3,2,3,1,2
381,1,5,6,1,0,3,2,3,3,1,2,3,0,0


# Concat new encoded df

In [22]:
combined_encoded_df = pd.concat([binary_dummy_encoded_df, one_hot_encoded_df, label_encoded_df], axis=1)
combined_encoded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 88 columns):
 #   Column                                                            Non-Null Count  Dtype
---  ------                                                            --------------  -----
 0   gender_m_dummy_encoded                                            383 non-null    bool 
 1   smoking_yes_dummy_encoded                                         383 non-null    bool 
 2   hx_smoking_yes_dummy_encoded                                      383 non-null    bool 
 3   hx_radiotherapy_yes_dummy_encoded                                 383 non-null    bool 
 4   thyroid_function_2cat_normal_dummy_encoded                        383 non-null    bool 
 5   focality_uni-focal_dummy_encoded                                  383 non-null    bool 
 6   m_m1_dummy_encoded                                                383 non-null    bool 
 7   stage_2cat_late_dummy_encoded                        

# Export Clean Data

In [23]:
# double check one last time before exporting
combined_encoded_df.head()

Unnamed: 0,gender_m_dummy_encoded,smoking_yes_dummy_encoded,hx_smoking_yes_dummy_encoded,hx_radiotherapy_yes_dummy_encoded,thyroid_function_2cat_normal_dummy_encoded,focality_uni-focal_dummy_encoded,m_m1_dummy_encoded,stage_2cat_late_dummy_encoded,recurrence_yes_dummy_encoded,kmode_cluster_3cat_one_hot_encoded,...,risk_label_encoded,stage_4cat_label_encoded,response_3cat_label_encoded,stage_label_encoded,t_4cat_label_encoded,pathology_label_encoded,n_label_encoded,response_label_encoded,thyroid_function_3cat_label_encoded,thyroid_function_label_encoded
0,False,False,False,False,True,True,False,False,False,0,...,2,0,1,0,0,2,0,2,1,2
1,False,False,True,False,True,True,False,False,False,0,...,2,0,0,0,0,2,0,1,1,2
2,False,False,False,False,True,True,False,False,False,0,...,2,0,0,0,0,2,0,1,1,2
3,False,False,False,False,True,True,False,False,False,0,...,2,0,0,0,0,2,0,1,1,2
4,False,False,False,False,True,False,False,False,False,0,...,2,0,0,0,0,2,0,1,1,2


In [24]:
# double check one last time before exporting
combined_encoded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 88 columns):
 #   Column                                                            Non-Null Count  Dtype
---  ------                                                            --------------  -----
 0   gender_m_dummy_encoded                                            383 non-null    bool 
 1   smoking_yes_dummy_encoded                                         383 non-null    bool 
 2   hx_smoking_yes_dummy_encoded                                      383 non-null    bool 
 3   hx_radiotherapy_yes_dummy_encoded                                 383 non-null    bool 
 4   thyroid_function_2cat_normal_dummy_encoded                        383 non-null    bool 
 5   focality_uni-focal_dummy_encoded                                  383 non-null    bool 
 6   m_m1_dummy_encoded                                                383 non-null    bool 
 7   stage_2cat_late_dummy_encoded                        

In [25]:
# Save the cleaned DataFrame to a new CSV file
combined_encoded_df.to_csv('4_Thyroid_Disease_FeatureEngineer.csv', index=False)

# Decision

The main thing is to use 'recurrence' column as the main labeling feature for the modeling process.
