# Predictive Thyroid Disease Project - Feature Engineering

To make a predictive model for thyroid cancer patients to see how likely after treatment their cancer will reoccur.

# Initial Set Up

In [1]:
# General Data Manipulation
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
import datetime
import math

# Import my custom library 
import MyCustDataSciLib as MyCustDataSciLib

In [2]:
# Load your cleaned CSV file
df = pd.read_csv('3_Thyroid_Disease_EDA.csv') # Change it to Thyroid_Disease_EDA

In [3]:
df.head()

Unnamed: 0,age,age_group,gender,smoking,hx_smoking,hx_radiotherapy,thyroid_function,thyroid_function_3cat,thyroid_function_2cat,physical_examination,...,n,m,stage,stage_4cat,stage_2cat,response,response_3cat,recurrence,kmode_cluster,kmode_cluster_3cat
0,27,20s,f,no,no,no,euthyroid,normal,normal,single_nodular_goiter-left,...,n0,m0,i,i,early,indeterminate,indeterminate,no,1,1
1,34,30s,f,no,yes,no,euthyroid,normal,normal,multinodular_goiter,...,n0,m0,i,i,early,excellent,excellent,no,1,1
2,30,30s,f,no,no,no,euthyroid,normal,normal,single_nodular_goiter-right,...,n0,m0,i,i,early,excellent,excellent,no,1,1
3,62,60s,f,no,no,no,euthyroid,normal,normal,single_nodular_goiter-right,...,n0,m0,i,i,early,excellent,excellent,no,1,1
4,62,60s,f,no,no,no,euthyroid,normal,normal,multinodular_goiter,...,n0,m0,i,i,early,excellent,excellent,no,1,1


In [4]:
# check the columns data type before edit
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   age                    383 non-null    int64 
 1   age_group              383 non-null    object
 2   gender                 383 non-null    object
 3   smoking                383 non-null    object
 4   hx_smoking             383 non-null    object
 5   hx_radiotherapy        383 non-null    object
 6   thyroid_function       383 non-null    object
 7   thyroid_function_3cat  383 non-null    object
 8   thyroid_function_2cat  383 non-null    object
 9   physical_examination   383 non-null    object
 10  adenopathy             383 non-null    object
 11  pathology              383 non-null    object
 12  focality               383 non-null    object
 13  risk                   383 non-null    object
 14  t                      383 non-null    object
 15  t_4cat                 

In [5]:
# again check for missing data, did it in Data Wrangling but double check if any EDA process messed it up.
df.isnull().sum()

age                      0
age_group                0
gender                   0
smoking                  0
hx_smoking               0
hx_radiotherapy          0
thyroid_function         0
thyroid_function_3cat    0
thyroid_function_2cat    0
physical_examination     0
adenopathy               0
pathology                0
focality                 0
risk                     0
t                        0
t_4cat                   0
n                        0
m                        0
stage                    0
stage_4cat               0
stage_2cat               0
response                 0
response_3cat            0
recurrence               0
kmode_cluster            0
kmode_cluster_3cat       0
dtype: int64

# Encoding Set Up

In [6]:
# check for binary columns 
binary_column_info, binary_column_names = MyCustDataSciLib.get_binary_columns(df, 'all')
binary_column_names

['gender',
 'smoking',
 'hx_smoking',
 'hx_radiotherapy',
 'thyroid_function_2cat',
 'focality',
 'm',
 'stage_2cat',
 'recurrence']

In [7]:
# make a list of non binary columns
all_columns = set(df.columns)
binary_columns_set = set(binary_column_names)
non_binary_column_names = list(all_columns - binary_columns_set)

non_binary_column_names

['age',
 'n',
 'response',
 'adenopathy',
 't',
 'age_group',
 'risk',
 'response_3cat',
 'pathology',
 'thyroid_function_3cat',
 'thyroid_function',
 'kmode_cluster',
 'kmode_cluster_3cat',
 't_4cat',
 'stage_4cat',
 'physical_examination',
 'stage']

# Dummy Encoding

binary values into bools 0s and 1s

In [8]:
# pd.get_dummies will convert and replace your column, also set drop_first=True for dummy encoding
df = pd.get_dummies(df, columns=binary_column_names, drop_first=True)
df

Unnamed: 0,age,age_group,thyroid_function,thyroid_function_3cat,physical_examination,adenopathy,pathology,risk,t,t_4cat,...,kmode_cluster_3cat,gender_m,smoking_yes,hx_smoking_yes,hx_radiotherapy_yes,thyroid_function_2cat_normal,focality_uni-focal,m_m1,stage_2cat_late,recurrence_yes
0,27,20s,euthyroid,normal,single_nodular_goiter-left,no,micropapillary,low,t1a,t1,...,1,False,False,False,False,True,True,False,False,False
1,34,30s,euthyroid,normal,multinodular_goiter,no,micropapillary,low,t1a,t1,...,1,False,False,True,False,True,True,False,False,False
2,30,30s,euthyroid,normal,single_nodular_goiter-right,no,micropapillary,low,t1a,t1,...,1,False,False,False,False,True,True,False,False,False
3,62,60s,euthyroid,normal,single_nodular_goiter-right,no,micropapillary,low,t1a,t1,...,1,False,False,False,False,True,True,False,False,False
4,62,60s,euthyroid,normal,multinodular_goiter,no,micropapillary,low,t1a,t1,...,1,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,72,70s,euthyroid,normal,single_nodular_goiter-right,right,papillary,high,t4b,t4,...,2,True,True,True,True,True,True,True,True,True
379,81,80s,euthyroid,normal,multinodular_goiter,extensive,papillary,high,t4b,t4,...,2,True,True,False,True,True,False,True,True,True
380,72,70s,euthyroid,normal,multinodular_goiter,bilateral,papillary,high,t4b,t4,...,2,True,True,True,False,True,False,True,True,True
381,61,60s,clinical_hyperthyroidism,clinical,multinodular_goiter,extensive,hurthel_cell,high,t4b,t4,...,2,True,True,True,True,False,False,False,True,True


In [9]:
# see all the binary columns has their dtypes converted to bools
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 26 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   age                           383 non-null    int64 
 1   age_group                     383 non-null    object
 2   thyroid_function              383 non-null    object
 3   thyroid_function_3cat         383 non-null    object
 4   physical_examination          383 non-null    object
 5   adenopathy                    383 non-null    object
 6   pathology                     383 non-null    object
 7   risk                          383 non-null    object
 8   t                             383 non-null    object
 9   t_4cat                        383 non-null    object
 10  n                             383 non-null    object
 11  stage                         383 non-null    object
 12  stage_4cat                    383 non-null    object
 13  response            

# One Hot Encoding

Each category in a column will be it's own category


In [10]:
# pd.get_dummies set drop_first=True for one hot encoding
df = pd.get_dummies(df, columns=non_binary_column_names, drop_first=True)
df

Unnamed: 0,gender_m,smoking_yes,hx_smoking_yes,hx_radiotherapy_yes,thyroid_function_2cat_normal,focality_uni-focal,m_m1,stage_2cat_late,recurrence_yes,age_17,...,stage_4cat_iii,stage_4cat_iv,physical_examination_multinodular_goiter,physical_examination_normal,physical_examination_single_nodular_goiter-left,physical_examination_single_nodular_goiter-right,stage_ii,stage_iii,stage_iva,stage_ivb
0,False,False,False,False,True,True,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
1,False,False,True,False,True,True,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
2,False,False,False,False,True,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
3,False,False,False,False,True,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
4,False,False,False,False,True,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,True,True,True,True,True,True,True,True,True,False,...,False,True,False,False,False,True,False,False,False,True
379,True,True,False,True,True,False,True,True,True,False,...,False,True,True,False,False,False,False,False,False,True
380,True,True,True,False,True,False,True,True,True,False,...,False,True,True,False,False,False,False,False,False,True
381,True,True,True,True,False,False,False,True,True,False,...,False,True,True,False,False,False,False,False,True,False


# Export Clean Data

In [13]:
# double check one last time before exporting
df.head()

Unnamed: 0,gender_m,smoking_yes,hx_smoking_yes,hx_radiotherapy_yes,thyroid_function_2cat_normal,focality_uni-focal,m_m1,stage_2cat_late,recurrence_yes,age_17,...,stage_4cat_iii,stage_4cat_iv,physical_examination_multinodular_goiter,physical_examination_normal,physical_examination_single_nodular_goiter-left,physical_examination_single_nodular_goiter-right,stage_ii,stage_iii,stage_iva,stage_ivb
0,False,False,False,False,True,True,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
1,False,False,True,False,True,True,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
2,False,False,False,False,True,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
3,False,False,False,False,True,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
4,False,False,False,False,True,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False


In [14]:
# double check one last time before exporting
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Columns: 127 entries, gender_m to stage_ivb
dtypes: bool(127)
memory usage: 47.6 KB


In [15]:
# Save the cleaned DataFrame to a new CSV file
df.to_csv('4_Thyroid_Disease_FeatureEngineer.csv', index=False)

# Decision

The main thing is to use 'recurrence' column as the main labeling feature for the modeling process.
