# Predictive Thyroid Disease Project - Feature Engineering

Goal: build a predictive model of whether a thyroid cancer patient is likely to respond to treatment or not.

# Initial Set Up

In [2]:
# General Data Manipulation
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
import datetime
import math

# Import my custom library 
import MyCustDataSciLib as MyCustDataSciLib

In [3]:
# Load the cleaned CSV file into the working DataFrame `df`.
df = pd.read_csv('Thyroid_Disease_DataWrangled.csv') # TODO: change the input to Thyroid_Disease_EDA

In [4]:
# Preview the first rows of the loaded data.
# NOTE(review): the saved preview lists columns such as 'Unnamed: 0' and 'age_group'
# that do not appear in the 17-column df.info() output further down, and the execution
# counts restart afterwards — this output looks stale; confirm with Restart Kernel -> Run All.
df.head()

Unnamed: 0,age,age_group,gender,smoking,hx_smoking,hx_radiotherapy,thyroid_function,thyroid_function_3cat,thyroid_function_2cat,physical_examination,...,t,t_4cat,n,m,stage,stage_4cat,stage_2cat,response,response_3cat,recurrence
0,27,20s,f,no,no,no,euthyroid,normal,normal,single_nodular_goiter-left,...,t1a,t1,n0,m0,i,i,early,indeterminate,indeterminate,no
1,34,30s,f,no,yes,no,euthyroid,normal,normal,multinodular_goiter,...,t1a,t1,n0,m0,i,i,early,excellent,excellent,no
2,30,30s,f,no,no,no,euthyroid,normal,normal,single_nodular_goiter-right,...,t1a,t1,n0,m0,i,i,early,excellent,excellent,no
3,62,60s,f,no,no,no,euthyroid,normal,normal,single_nodular_goiter-right,...,t1a,t1,n0,m0,i,i,early,excellent,excellent,no
4,62,60s,f,no,no,no,euthyroid,normal,normal,multinodular_goiter,...,t1a,t1,n0,m0,i,i,early,excellent,excellent,no


# Change binary values into 0s and 1s

In [3]:
# Check each column's dtype and non-null count before any conversion is applied.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   age                   383 non-null    int64 
 1   gender                383 non-null    object
 2   smoking               383 non-null    object
 3   hx_smoking            383 non-null    object
 4   hx_radiotherapy       383 non-null    object
 5   thyroid_function      383 non-null    object
 6   physical_examination  383 non-null    object
 7   adenopathy            383 non-null    object
 8   pathology             383 non-null    object
 9   focality              383 non-null    object
 10  risk                  383 non-null    object
 11  t                     383 non-null    object
 12  n                     383 non-null    object
 13  m                     383 non-null    object
 14  stage                 383 non-null    object
 15  response              383 non-null    ob

In [4]:
# Check which columns are candidates for conversion into binary 0s and 1s.
# NOTE(review): each returned tuple appears to be (column name, one value, the other value);
# the meaning of the 'all' argument and of the tuple order is defined in MyCustDataSciLib — confirm there.
binary_column_info, binary_column_names = MyCustDataSciLib.get_binary_columns(df, 'all')
binary_column_info

[('gender', 'f', 'm'),
 ('smoking', 'no', 'yes'),
 ('hx_smoking', 'no', 'yes'),
 ('hx_radiotherapy', 'no', 'yes'),
 ('focality', 'uni-focal', 'multi-focal'),
 ('m', 'm0', 'm1'),
 ('recurrence', 'no', 'yes')]

In [5]:
# Encode the two-valued columns as 0/1 to make later data science tasks easier.
# Reminder — convert_column_to_binary positional arguments:
#   1st: the DataFrame
#   2nd: the column name
#   3rd: the value to be converted to 1
#   4th: the value to be converted to 0
#
# Each entry below is (column name, value -> 1, value -> 0).
# NOTE(review): 'focality' maps 'uni-focal' to 1 and 'multi-focal' to 0, while the
# yes/no columns and 'm' map the presence of the condition to 1 — confirm this
# polarity is intended before modelling.
binary_encodings = [
    ('gender', 'm', 'f'),
    ('smoking', 'yes', 'no'),
    ('hx_smoking', 'yes', 'no'),
    ('hx_radiotherapy', 'yes', 'no'),
    ('focality', 'uni-focal', 'multi-focal'),
    ('m', 'm1', 'm0'),
    ('recurrence', 'yes', 'no'),
]

# One call per column, in the same order as before, so the printed report is unchanged.
for column_name, value_for_one, value_for_zero in binary_encodings:
    df, incompatible_rows_info, incompatible_rows = MyCustDataSciLib.convert_column_to_binary(
        df, column_name, value_for_one, value_for_zero
    )

# Testing the function on 'adenopathy', which is reported as having incompatible rows
# (values that are neither 'yes' nor 'no').
# The test runs on a copy and its returned frame is kept under a separate name, so this
# experiment cannot alter the working `df` that the following cells rely on.
print()
adenopathy_test_df, incompatible_rows_info, incompatible_rows = MyCustDataSciLib.convert_column_to_binary(
    df.copy(), 'adenopathy', 'yes', 'no'
)

gender column has been fully converted
smoking column has been fully converted
hx_smoking column has been fully converted
hx_radiotherapy column has been fully converted
focality column has been fully converted
m column has been fully converted
recurrence column has been fully converted

adenopathy column has 106 incompatible rows


In [6]:
# Re-run the binary-column check after the conversion to inspect the values each converted column now holds.
binary_column_info, binary_column_names = MyCustDataSciLib.get_binary_columns(df, 'all')
binary_column_info

[('gender', '0', '1'),
 ('smoking', '0', '1'),
 ('hx_smoking', '0', '1'),
 ('hx_radiotherapy', '0', '1'),
 ('focality', '1', '0'),
 ('m', '0', '1'),
 ('recurrence', '0', '1')]

In [7]:
 # Confirm that all the converted binary columns now have an integer dtype (int32 in the saved output).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   age                   383 non-null    int64 
 1   gender                383 non-null    int32 
 2   smoking               383 non-null    int32 
 3   hx_smoking            383 non-null    int32 
 4   hx_radiotherapy       383 non-null    int32 
 5   thyroid_function      383 non-null    object
 6   physical_examination  383 non-null    object
 7   adenopathy            383 non-null    object
 8   pathology             383 non-null    object
 9   focality              383 non-null    int32 
 10  risk                  383 non-null    object
 11  t                     383 non-null    object
 12  n                     383 non-null    object
 13  m                     383 non-null    int32 
 14  stage                 383 non-null    object
 15  response              383 non-null    ob

# Converting the "thyroid_function" column

In [10]:
# Show every distinct value of 'thyroid_function' together with its row count.
MyCustDataSciLib.print_unique_values_summary(df, ['thyroid_function'])

Unique values and counts for column: 'thyroid_function'
              thyroid_function  count
0                    euthyroid    332
1     clinical_hyperthyroidism     20
2   subclinical_hypothyroidism     14
3      clinical_hypothyroidism     12
4  subclinical_hyperthyroidism      5
----------------------------------------
FUNCTION FINISHED, detected no columns with only 1 unique values which is good.


In [11]:
# The main objective is to build a model that can predict whether a patient has thyroid disease or not.
# The thyroid_function column is that indicator, but it has multiple values, and many of these values are just different forms of thyroid disease.
# I considered building a model to predict the different types of thyroid disease, but some of those classes have very few rows, as observed from the unique-value counts of that column checked in the Data Wrangling step.
# thyroid function meaning
    # Euthyroid - Normal thyroid function. No signs of hyperthyroidism or hypothyroidism.
    # Hyperthyroidism - Thyroid produces excess thyroid hormones, leading to symptoms like weight loss, rapid heart rate, and anxiety.
    # Hypothyroidism - Thyroid does not produce enough hormones, leading to symptoms like fatigue, weight gain, and depression.
    # Clinical - The condition is symptomatic and clearly impacts the patient.
    # Subclinical - The condition is asymptomatic or mild, and abnormalities are only detected through lab tests (e.g., TSH or T3/T4 hormone levels); it may develop into the clinical form.

In [12]:
# Collapse every abnormal thyroid state into 1 and keep 'euthyroid' as 0,
# turning 'thyroid_function' into a single disease / no-disease indicator.
abnormal_thyroid_states = [
    'clinical_hyperthyroidism',
    'subclinical_hypothyroidism',
    'clinical_hypothyroidism',
    'subclinical_hyperthyroidism',
]
conversion_result = MyCustDataSciLib.convert_column_to_binary(
    df, 'thyroid_function', abnormal_thyroid_states, 'euthyroid'
)
df, incompatible_rows_info, incompatible_rows = conversion_result

thyroid_function column has been fully converted


In [13]:
MyCustDataSciLib.print_unique_values_summary(df, ['thyroid_function'])
# As the output shows, all the different types of thyroid disease have been aggregated into the single value 1.

Unique values and counts for column: 'thyroid_function'
   thyroid_function  count
0                 0    332
1                 1     51
----------------------------------------
FUNCTION FINISHED, detected no columns with only 1 unique values which is good.
