# Predictive Thyroid Disease Project - Feature Engineering

To make a predictive model if the thyroid cancer patient is likely to respond to the treatment or not.

# Initial Set Up

In [1]:
# General Data Manipulation
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
import datetime
import math

# Import my custom library 
import MyCustDataSciLib as MyCustDataSciLib

In [2]:
# Load your cleaned CSV file
df = pd.read_csv('3_Thyroid_Disease_EDA.csv') # Change it to Thyroid_Disease_EDA

In [3]:
df.head()

Unnamed: 0,age,age_group,gender,smoking,hx_smoking,hx_radiotherapy,thyroid_function,thyroid_function_3cat,thyroid_function_2cat,physical_examination,...,t_4cat,n,m,stage,stage_4cat,stage_2cat,response,response_3cat,recurrence,kmode_cluster
0,27,20s,f,no,no,no,euthyroid,normal,normal,single_nodular_goiter-left,...,t1,n0,m0,i,i,early,indeterminate,indeterminate,no,0
1,34,30s,f,no,yes,no,euthyroid,normal,normal,multinodular_goiter,...,t1,n0,m0,i,i,early,excellent,excellent,no,0
2,30,30s,f,no,no,no,euthyroid,normal,normal,single_nodular_goiter-right,...,t1,n0,m0,i,i,early,excellent,excellent,no,0
3,62,60s,f,no,no,no,euthyroid,normal,normal,single_nodular_goiter-right,...,t1,n0,m0,i,i,early,excellent,excellent,no,0
4,62,60s,f,no,no,no,euthyroid,normal,normal,multinodular_goiter,...,t1,n0,m0,i,i,early,excellent,excellent,no,0


In [4]:
# check the columns data type before edit
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   age                    383 non-null    int64 
 1   age_group              383 non-null    object
 2   gender                 383 non-null    object
 3   smoking                383 non-null    object
 4   hx_smoking             383 non-null    object
 5   hx_radiotherapy        383 non-null    object
 6   thyroid_function       383 non-null    object
 7   thyroid_function_3cat  383 non-null    object
 8   thyroid_function_2cat  383 non-null    object
 9   physical_examination   383 non-null    object
 10  adenopathy             383 non-null    object
 11  pathology              383 non-null    object
 12  focality               383 non-null    object
 13  risk                   383 non-null    object
 14  t                      383 non-null    object
 15  t_4cat                 

# Check for missing data

Did it in Data Wrangling but double check if any EDA process messed it up.

# Change binary values into 0s and 1s

In [5]:
# check which columns are possible candidates to be converted into binary 0s and 1s
binary_column_info, binary_column_names = MyCustDataSciLib.get_binary_columns(df, 'all')
binary_column_info

[('gender', 'f', 'm'),
 ('smoking', 'no', 'yes'),
 ('hx_smoking', 'no', 'yes'),
 ('hx_radiotherapy', 'no', 'yes'),
 ('thyroid_function_2cat', 'normal', 'diseased'),
 ('focality', 'uni-focal', 'multi-focal'),
 ('m', 'm0', 'm1'),
 ('stage_2cat', 'early', 'late'),
 ('recurrence', 'no', 'yes')]

In [6]:
# Convert these 'yes' 'no columns into binary '1s' '0s' as to make future data science tasks easier
    # reminder
        # 2nd parameter is col name
        # 3rd parameter is the value to be converted to 1
        # 4th parameter is the value to be converted to 0

df, incompatible_rows_info, incompatible_rows = MyCustDataSciLib.convert_column_to_binary(df, 'gender', 'm', 'f')
df, incompatible_rows_info, incompatible_rows = MyCustDataSciLib.convert_column_to_binary(df, 'smoking', 'yes', 'no')
df, incompatible_rows_info, incompatible_rows = MyCustDataSciLib.convert_column_to_binary(df, 'hx_smoking', 'yes', 'no')
df, incompatible_rows_info, incompatible_rows = MyCustDataSciLib.convert_column_to_binary(df, 'hx_radiotherapy', 'yes', 'no')
df, incompatible_rows_info, incompatible_rows = MyCustDataSciLib.convert_column_to_binary(df, 'thyroid_function_2cat', 'diseased', 'normal')
df, incompatible_rows_info, incompatible_rows = MyCustDataSciLib.convert_column_to_binary(df, 'focality', 'uni-focal', 'multi-focal')
df, incompatible_rows_info, incompatible_rows = MyCustDataSciLib.convert_column_to_binary(df, 'm', 'm1', 'm0')
df, incompatible_rows_info, incompatible_rows = MyCustDataSciLib.convert_column_to_binary(df, 'stage_2cat', 'late', 'early')
df, incompatible_rows_info, incompatible_rows = MyCustDataSciLib.convert_column_to_binary(df, 'recurrence', 'yes', 'no')

# testing the function
print()
df, incompatible_rows_info, incompatible_rows = MyCustDataSciLib.convert_column_to_binary(df, 'adenopathy', 'yes', 'no')
    # applied function on 'Adenopathy' as want to test out the function 

gender column has been fully converted
smoking column has been fully converted
hx_smoking column has been fully converted
hx_radiotherapy column has been fully converted
thyroid_function_2cat column has been fully converted
focality column has been fully converted
m column has been fully converted
stage_2cat column has been fully converted
recurrence column has been fully converted

adenopathy column has 106 incompatible rows


In [7]:
binary_column_info, binary_column_names = MyCustDataSciLib.get_binary_columns(df, 'all')
binary_column_info

[('gender', '0', '1'),
 ('smoking', '0', '1'),
 ('hx_smoking', '0', '1'),
 ('hx_radiotherapy', '0', '1'),
 ('thyroid_function_2cat', '0', '1'),
 ('focality', '1', '0'),
 ('m', '0', '1'),
 ('stage_2cat', '0', '1'),
 ('recurrence', '0', '1')]

In [8]:
 # see all the binary columns has their dtypes converted to int32
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   age                    383 non-null    int64 
 1   age_group              383 non-null    object
 2   gender                 383 non-null    int32 
 3   smoking                383 non-null    int32 
 4   hx_smoking             383 non-null    int32 
 5   hx_radiotherapy        383 non-null    int32 
 6   thyroid_function       383 non-null    object
 7   thyroid_function_3cat  383 non-null    object
 8   thyroid_function_2cat  383 non-null    int32 
 9   physical_examination   383 non-null    object
 10  adenopathy             383 non-null    object
 11  pathology              383 non-null    object
 12  focality               383 non-null    int32 
 13  risk                   383 non-null    object
 14  t                      383 non-null    object
 15  t_4cat                 

# Export Clean Data

In [None]:
# double check one last time before exporting
df.head()

In [None]:
# double check one last time before exporting
df.info()

In [None]:
# Save the cleaned DataFrame to a new CSV file
df.to_csv('4_Thyroid_Disease_FeatureEngineer.csv', index=False)

# Decision

The main thing is to use 'recurrence' column as the main labeling feature for the modeling process.
