# Predictive Thyroid Disease Project - Feature Engineering

To make a predictive model for thyroid cancer patients to see how likely after treatment their cancer will reoccur.

# Initial Set Up

In [1]:
# General Data Manipulation
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
import datetime
import math

from sklearn.model_selection import train_test_split


# Import my custom library 
import MyCustDataSciLib as MyCustDataSciLib

In [2]:
# Load your cleaned CSV file
df = pd.read_csv('3_Thyroid_Disease_EDA.csv') # Change it to Thyroid_Disease_EDA

In [3]:
df.head()

Unnamed: 0,age,age_group,gender,smoking,hx_smoking,hx_radiotherapy,thyroid_function,thyroid_function_3cat,thyroid_function_2cat,physical_examination,...,t_4cat,n,m,stage,stage_4cat,stage_2cat,response,response_3cat,recurrence,kmode_cluster_3cat
0,27,20s,f,no,no,no,euthyroid,normal,normal,single_nodular_goiter-left,...,t1,n0,m0,i,i,early,indeterminate,indeterminate,no,0
1,34,30s,f,no,yes,no,euthyroid,normal,normal,multinodular_goiter,...,t1,n0,m0,i,i,early,excellent,excellent,no,0
2,30,30s,f,no,no,no,euthyroid,normal,normal,single_nodular_goiter-right,...,t1,n0,m0,i,i,early,excellent,excellent,no,0
3,62,60s,f,no,no,no,euthyroid,normal,normal,single_nodular_goiter-right,...,t1,n0,m0,i,i,early,excellent,excellent,no,0
4,62,60s,f,no,no,no,euthyroid,normal,normal,multinodular_goiter,...,t1,n0,m0,i,i,early,excellent,excellent,no,0


In [4]:
# check the columns data type before edit
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   age                    383 non-null    int64 
 1   age_group              383 non-null    object
 2   gender                 383 non-null    object
 3   smoking                383 non-null    object
 4   hx_smoking             383 non-null    object
 5   hx_radiotherapy        383 non-null    object
 6   thyroid_function       383 non-null    object
 7   thyroid_function_3cat  383 non-null    object
 8   thyroid_function_2cat  383 non-null    object
 9   physical_examination   383 non-null    object
 10  adenopathy             383 non-null    object
 11  pathology              383 non-null    object
 12  focality               383 non-null    object
 13  risk                   383 non-null    object
 14  t                      383 non-null    object
 15  t_4cat                 

In [5]:
# again check for missing data, did it in Data Wrangling but double check if any EDA process messed it up.
df.isnull().sum()

age                      0
age_group                0
gender                   0
smoking                  0
hx_smoking               0
hx_radiotherapy          0
thyroid_function         0
thyroid_function_3cat    0
thyroid_function_2cat    0
physical_examination     0
adenopathy               0
pathology                0
focality                 0
risk                     0
t                        0
t_4cat                   0
n                        0
m                        0
stage                    0
stage_4cat               0
stage_2cat               0
response                 0
response_3cat            0
recurrence               0
kmode_cluster_3cat       0
dtype: int64

# Encoding Set Up

In [6]:
# check for binary columns 
binary_column_info, binary_column_names = MyCustDataSciLib.get_binary_columns(df, 'all')
binary_column_names

['gender',
 'smoking',
 'hx_smoking',
 'hx_radiotherapy',
 'thyroid_function_2cat',
 'focality',
 'm',
 'stage_2cat',
 'recurrence']

In [7]:
# make a list of non binary columns
all_columns = set(df.columns)
binary_columns_set = set(binary_column_names)
non_binary_column_names = list(all_columns - binary_columns_set)

non_binary_column_names

['stage_4cat',
 't',
 'response_3cat',
 'n',
 'age',
 'physical_examination',
 'response',
 'adenopathy',
 'pathology',
 'risk',
 'stage',
 'thyroid_function',
 'thyroid_function_3cat',
 'age_group',
 'kmode_cluster_3cat',
 't_4cat']

# Dummy Encoding

binary values into bools 0s and 1s

In [8]:
binary_dummy_encoded_df = pd.get_dummies(df[binary_column_names], drop_first=True)

binary_dummy_encoded_df

Unnamed: 0,gender_m,smoking_yes,hx_smoking_yes,hx_radiotherapy_yes,thyroid_function_2cat_normal,focality_uni-focal,m_m1,stage_2cat_late,recurrence_yes
0,False,False,False,False,True,True,False,False,False
1,False,False,True,False,True,True,False,False,False
2,False,False,False,False,True,True,False,False,False
3,False,False,False,False,True,True,False,False,False
4,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...
378,True,True,True,True,True,True,True,True,True
379,True,True,False,True,True,False,True,True,True
380,True,True,True,False,True,False,True,True,True
381,True,True,True,True,False,False,False,True,True


In [9]:
# add suffix to columns 
binary_dummy_encoded_df = MyCustDataSciLib.add_suffix_to_columns(binary_dummy_encoded_df,'_dummy_encoded')
binary_dummy_encoded_df

Unnamed: 0,gender_m_dummy_encoded,smoking_yes_dummy_encoded,hx_smoking_yes_dummy_encoded,hx_radiotherapy_yes_dummy_encoded,thyroid_function_2cat_normal_dummy_encoded,focality_uni-focal_dummy_encoded,m_m1_dummy_encoded,stage_2cat_late_dummy_encoded,recurrence_yes_dummy_encoded
0,False,False,False,False,True,True,False,False,False
1,False,False,True,False,True,True,False,False,False
2,False,False,False,False,True,True,False,False,False
3,False,False,False,False,True,True,False,False,False
4,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...
378,True,True,True,True,True,True,True,True,True
379,True,True,False,True,True,False,True,True,True
380,True,True,True,False,True,False,True,True,True
381,True,True,True,True,False,False,False,True,True


# One Hot Encoding

Each category in a column will be it's own category


In [10]:
# make a new df of non binary columns 
one_hot_encoded_df = df[non_binary_column_names].drop(columns=['age'])
one_hot_encoded_df

Unnamed: 0,stage_4cat,t,response_3cat,n,physical_examination,response,adenopathy,pathology,risk,stage,thyroid_function,thyroid_function_3cat,age_group,kmode_cluster_3cat,t_4cat
0,i,t1a,indeterminate,n0,single_nodular_goiter-left,indeterminate,no,micropapillary,low,i,euthyroid,normal,20s,0,t1
1,i,t1a,excellent,n0,multinodular_goiter,excellent,no,micropapillary,low,i,euthyroid,normal,30s,0,t1
2,i,t1a,excellent,n0,single_nodular_goiter-right,excellent,no,micropapillary,low,i,euthyroid,normal,30s,0,t1
3,i,t1a,excellent,n0,single_nodular_goiter-right,excellent,no,micropapillary,low,i,euthyroid,normal,60s,0,t1
4,i,t1a,excellent,n0,multinodular_goiter,excellent,no,micropapillary,low,i,euthyroid,normal,60s,0,t1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,iv,t4b,negative,n1b,single_nodular_goiter-right,biochemical_incomplete,right,papillary,high,ivb,euthyroid,normal,70s,2,t4
379,iv,t4b,negative,n1b,multinodular_goiter,structural_incomplete,extensive,papillary,high,ivb,euthyroid,normal,80s,2,t4
380,iv,t4b,negative,n1b,multinodular_goiter,structural_incomplete,bilateral,papillary,high,ivb,euthyroid,normal,70s,2,t4
381,iv,t4b,negative,n1b,multinodular_goiter,structural_incomplete,extensive,hurthel_cell,high,iva,clinical_hyperthyroidism,clinical,60s,2,t4


In [11]:
# pd.get_dummies set drop_first=True for one hot encoding
one_hot_encoded_df = pd.get_dummies(one_hot_encoded_df, drop_first=False)
one_hot_encoded_df

Unnamed: 0,kmode_cluster_3cat,stage_4cat_i,stage_4cat_ii,stage_4cat_iii,stage_4cat_iv,t_t1a,t_t1b,t_t2,t_t3a,t_t3b,...,age_group_30s,age_group_40s,age_group_50s,age_group_60s,age_group_70s,age_group_80s,t_4cat_t1,t_4cat_t2,t_4cat_t3,t_4cat_t4
0,0,True,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
1,0,True,False,False,False,True,False,False,False,False,...,True,False,False,False,False,False,True,False,False,False
2,0,True,False,False,False,True,False,False,False,False,...,True,False,False,False,False,False,True,False,False,False
3,0,True,False,False,False,True,False,False,False,False,...,False,False,False,True,False,False,True,False,False,False
4,0,True,False,False,False,True,False,False,False,False,...,False,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,2,False,False,False,True,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
379,2,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True
380,2,False,False,False,True,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
381,2,False,False,False,True,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,True


In [12]:
# add suffix to columns 
one_hot_encoded_df = MyCustDataSciLib.add_suffix_to_columns(one_hot_encoded_df,'_one_hot_encoded')
one_hot_encoded_df

Unnamed: 0,kmode_cluster_3cat_one_hot_encoded,stage_4cat_i_one_hot_encoded,stage_4cat_ii_one_hot_encoded,stage_4cat_iii_one_hot_encoded,stage_4cat_iv_one_hot_encoded,t_t1a_one_hot_encoded,t_t1b_one_hot_encoded,t_t2_one_hot_encoded,t_t3a_one_hot_encoded,t_t3b_one_hot_encoded,...,age_group_30s_one_hot_encoded,age_group_40s_one_hot_encoded,age_group_50s_one_hot_encoded,age_group_60s_one_hot_encoded,age_group_70s_one_hot_encoded,age_group_80s_one_hot_encoded,t_4cat_t1_one_hot_encoded,t_4cat_t2_one_hot_encoded,t_4cat_t3_one_hot_encoded,t_4cat_t4_one_hot_encoded
0,0,True,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
1,0,True,False,False,False,True,False,False,False,False,...,True,False,False,False,False,False,True,False,False,False
2,0,True,False,False,False,True,False,False,False,False,...,True,False,False,False,False,False,True,False,False,False
3,0,True,False,False,False,True,False,False,False,False,...,False,False,False,True,False,False,True,False,False,False
4,0,True,False,False,False,True,False,False,False,False,...,False,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,2,False,False,False,True,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
379,2,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True
380,2,False,False,False,True,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
381,2,False,False,False,True,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,True


# Label Encoding

In [13]:
# Create a copy of the data to avoid modifying the original
df_encoded = df.copy()

# Initialize a label encoder and create a dictionary to store encoded columns
label_encoder = LabelEncoder()
label_encoded_columns = {}  # To store only the newly encoded columns

# Loop through each column in non_binary_column_names
for column in non_binary_column_names:
    # Only encode columns of object (categorical) type
    if df_encoded[column].dtype == 'object':
        df_encoded[column] = label_encoder.fit_transform(df_encoded[column])
        label_encoded_columns[column] = df_encoded[column]  # Save the encoded column

# Create a dataframe with only the newly encoded columns
label_encoded_df = pd.DataFrame(label_encoded_columns)

label_encoded_df

Unnamed: 0,stage_4cat,t,response_3cat,n,physical_examination,response,adenopathy,pathology,risk,stage,thyroid_function,thyroid_function_3cat,age_group,t_4cat
0,0,0,1,0,3,2,3,2,2,0,2,1,1,0
1,0,0,0,0,1,1,3,2,2,0,2,1,2,0
2,0,0,0,0,4,1,3,2,2,0,2,1,2,0
3,0,0,0,0,4,1,3,2,2,0,2,1,5,0
4,0,0,0,0,1,1,3,2,2,0,2,1,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,3,6,2,2,4,0,5,3,0,4,2,1,6,3
379,3,6,2,2,1,3,1,3,0,4,2,1,7,3
380,3,6,2,2,1,3,0,3,0,4,2,1,6,3
381,3,6,2,2,1,3,1,1,0,3,0,0,5,3


In [14]:
# add suffix to columns 
label_encoded_df = MyCustDataSciLib.add_suffix_to_columns(label_encoded_df,'_label_encoded')
label_encoded_df

Unnamed: 0,stage_4cat_label_encoded,t_label_encoded,response_3cat_label_encoded,n_label_encoded,physical_examination_label_encoded,response_label_encoded,adenopathy_label_encoded,pathology_label_encoded,risk_label_encoded,stage_label_encoded,thyroid_function_label_encoded,thyroid_function_3cat_label_encoded,age_group_label_encoded,t_4cat_label_encoded
0,0,0,1,0,3,2,3,2,2,0,2,1,1,0
1,0,0,0,0,1,1,3,2,2,0,2,1,2,0
2,0,0,0,0,4,1,3,2,2,0,2,1,2,0
3,0,0,0,0,4,1,3,2,2,0,2,1,5,0
4,0,0,0,0,1,1,3,2,2,0,2,1,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,3,6,2,2,4,0,5,3,0,4,2,1,6,3
379,3,6,2,2,1,3,1,3,0,4,2,1,7,3
380,3,6,2,2,1,3,0,3,0,4,2,1,6,3
381,3,6,2,2,1,3,1,1,0,3,0,0,5,3


# Concat new encoded df

In [15]:
combined_encoded_df = pd.concat([binary_dummy_encoded_df, one_hot_encoded_df, label_encoded_df], axis=1)
combined_encoded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 88 columns):
 #   Column                                                            Non-Null Count  Dtype
---  ------                                                            --------------  -----
 0   gender_m_dummy_encoded                                            383 non-null    bool 
 1   smoking_yes_dummy_encoded                                         383 non-null    bool 
 2   hx_smoking_yes_dummy_encoded                                      383 non-null    bool 
 3   hx_radiotherapy_yes_dummy_encoded                                 383 non-null    bool 
 4   thyroid_function_2cat_normal_dummy_encoded                        383 non-null    bool 
 5   focality_uni-focal_dummy_encoded                                  383 non-null    bool 
 6   m_m1_dummy_encoded                                                383 non-null    bool 
 7   stage_2cat_late_dummy_encoded                        

# Train Test Split

In [21]:
train_df, test_df = train_test_split(combined_encoded_df, test_size=0.2, random_state=42)

In [22]:
train_df

Unnamed: 0,gender_m_dummy_encoded,smoking_yes_dummy_encoded,hx_smoking_yes_dummy_encoded,hx_radiotherapy_yes_dummy_encoded,thyroid_function_2cat_normal_dummy_encoded,focality_uni-focal_dummy_encoded,m_m1_dummy_encoded,stage_2cat_late_dummy_encoded,recurrence_yes_dummy_encoded,kmode_cluster_3cat_one_hot_encoded,...,physical_examination_label_encoded,response_label_encoded,adenopathy_label_encoded,pathology_label_encoded,risk_label_encoded,stage_label_encoded,thyroid_function_label_encoded,thyroid_function_3cat_label_encoded,age_group_label_encoded,t_4cat_label_encoded
165,False,False,False,False,True,False,False,False,False,1,...,1,1,3,3,2,0,2,1,3,1
321,False,False,False,False,True,False,False,False,True,2,...,1,3,5,3,1,1,2,1,5,2
220,False,False,False,False,True,True,False,False,False,1,...,1,2,3,3,2,0,2,1,1,1
94,False,False,False,False,True,False,False,False,False,1,...,3,1,3,1,2,0,2,1,2,1
232,False,False,False,False,True,True,False,False,True,2,...,3,3,2,3,1,0,2,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,False,False,False,False,True,True,False,False,False,0,...,3,1,3,3,2,0,2,1,5,0
106,False,False,False,False,True,True,False,False,False,1,...,1,1,3,3,2,0,2,1,1,1
270,False,False,False,False,True,True,False,False,False,0,...,3,2,3,3,2,0,2,1,2,2
348,False,False,False,False,False,False,False,False,True,2,...,1,3,1,3,1,1,4,2,4,2


In [23]:
test_df

Unnamed: 0,gender_m_dummy_encoded,smoking_yes_dummy_encoded,hx_smoking_yes_dummy_encoded,hx_radiotherapy_yes_dummy_encoded,thyroid_function_2cat_normal_dummy_encoded,focality_uni-focal_dummy_encoded,m_m1_dummy_encoded,stage_2cat_late_dummy_encoded,recurrence_yes_dummy_encoded,kmode_cluster_3cat_one_hot_encoded,...,physical_examination_label_encoded,response_label_encoded,adenopathy_label_encoded,pathology_label_encoded,risk_label_encoded,stage_label_encoded,thyroid_function_label_encoded,thyroid_function_3cat_label_encoded,age_group_label_encoded,t_4cat_label_encoded
268,False,False,False,False,True,True,False,False,False,0,...,3,1,3,3,2,0,2,1,2,2
250,False,False,False,False,False,True,False,False,False,0,...,2,1,5,0,2,0,4,2,2,2
318,False,False,False,False,True,False,False,False,True,2,...,1,3,0,3,1,0,2,1,2,2
331,False,False,False,False,True,True,False,False,True,2,...,3,3,3,0,1,0,2,1,4,2
56,False,False,True,False,True,True,False,False,False,1,...,4,1,3,3,2,0,2,1,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104,False,False,False,False,False,True,False,False,False,1,...,4,1,5,3,2,0,1,0,2,1
167,False,False,False,False,True,True,False,False,False,1,...,3,1,3,3,2,0,2,1,4,1
63,False,False,False,False,True,True,False,False,False,0,...,1,2,3,3,2,0,2,1,4,0
233,False,False,False,False,True,True,False,False,True,2,...,4,3,5,3,1,0,2,1,2,1


# Export Clean Data

In [24]:
# explore the entire csv
combined_encoded_df.to_csv('4_Thyroid_Disease_FeatureEngineer.csv', index=False)

In [25]:
# export the train and test split of out total csv 

train_df.to_csv('4_Thyroid_Disease_FeatureEngineer_train.csv', index=False)

test_df.to_csv('4_Thyroid_Disease_FeatureEngineer_test.csv', index=False)