# Predictive Thyroid Disease Project - Feature Engineering

The goal is to build a predictive model that takes a person's characteristics as input and predicts whether they are likely to have thyroid disease. The model is considered successful only if its accuracy is above 95%.

# Initial Set Up

In [1]:
# General Data Manipulation
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
import datetime
import math

# Import my custom library 
import MyCustDataSciLib as MyCustDataSciLib

In [2]:
# Read the cleaned CSV file into the working DataFrame.
# TODO: change the input file to Thyroid_Disease_EDA.
df = pd.read_csv(
    filepath_or_buffer='Thyroid_Disease_DataWrangled.csv',
)

# Change binary values into 0s and 1s

In [3]:
# Inspect each column's non-null count and dtype before any binary conversion is applied
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   age                   383 non-null    int64 
 1   gender                383 non-null    object
 2   smoking               383 non-null    object
 3   hx_smoking            383 non-null    object
 4   hx_radiotherapy       383 non-null    object
 5   thyroid_function      383 non-null    object
 6   physical_examination  383 non-null    object
 7   adenopathy            383 non-null    object
 8   pathology             383 non-null    object
 9   focality              383 non-null    object
 10  risk                  383 non-null    object
 11  t                     383 non-null    object
 12  n                     383 non-null    object
 13  m                     383 non-null    object
 14  stage                 383 non-null    object
 15  response              383 non-null    ob

In [4]:
# Check which columns are possible candidates to be converted into binary 0s and 1s.
# The call returns two values; binary_column_info is displayed as the cell output
# (one tuple per candidate column, followed by that column's two values).
# NOTE(review): 'all' presumably asks the helper to scan every column — confirm against MyCustDataSciLib.
binary_column_info, binary_column_names = MyCustDataSciLib.get_binary_columns(df, 'all')
binary_column_info

[('gender', 'f', 'm'),
 ('smoking', 'no', 'yes'),
 ('hx_smoking', 'no', 'yes'),
 ('hx_radiotherapy', 'no', 'yes'),
 ('focality', 'uni-focal', 'multi-focal'),
 ('m', 'm0', 'm1'),
 ('recurrence', 'no', 'yes')]

In [5]:
# Turn the two-valued text columns into 1/0 flags to make future data science tasks easier.
# Each spec is (column name, value converted to 1, value converted to 0), i.e. the
# 2nd, 3rd and 4th arguments of convert_column_to_binary.
binary_encoding_specs = (
    ('gender', 'm', 'f'),
    ('smoking', 'yes', 'no'),
    ('hx_smoking', 'yes', 'no'),
    ('hx_radiotherapy', 'yes', 'no'),
    # NOTE(review): 'uni-focal' is the value converted to 1 here — confirm this polarity is intended.
    ('focality', 'uni-focal', 'multi-focal'),
    ('m', 'm1', 'm0'),
    ('recurrence', 'yes', 'no'),
)

for spec_column, spec_one, spec_zero in binary_encoding_specs:
    # Every call rebinds df and overwrites the two incompatible-row results,
    # so only the values from the most recent call remain afterwards.
    df, incompatible_rows_info, incompatible_rows = MyCustDataSciLib.convert_column_to_binary(
        df, spec_column, spec_one, spec_zero
    )

# Empty line in the printed output to separate the conversions above from the trial run below.
print()

# Trial run on 'adenopathy', done only to test out the function.
# NOTE(review): the returned frame is still assigned back to df — confirm that is harmless
# for a column the function reports as having incompatible rows.
df, incompatible_rows_info, incompatible_rows = MyCustDataSciLib.convert_column_to_binary(
    df, 'adenopathy', 'yes', 'no'
)

gender column has been fully converted
smoking column has been fully converted
hx_smoking column has been fully converted
hx_radiotherapy column has been fully converted
focality column has been fully converted
m column has been fully converted
recurrence column has been fully converted

adenopathy column has 106 incompatible rows


In [6]:
# Run the binary-column scan again, after the conversion cell, to see how the
# converted columns are now reported.
binary_column_info, binary_column_names = MyCustDataSciLib.get_binary_columns(df, 'all')
binary_column_info

[('gender', '0', '1'),
 ('smoking', '0', '1'),
 ('hx_smoking', '0', '1'),
 ('hx_radiotherapy', '0', '1'),
 ('focality', '1', '0'),
 ('m', '0', '1'),
 ('recurrence', '0', '1')]

In [7]:
 # Check the dtypes again: all the converted binary columns should now be int32
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   age                   383 non-null    int64 
 1   gender                383 non-null    int32 
 2   smoking               383 non-null    int32 
 3   hx_smoking            383 non-null    int32 
 4   hx_radiotherapy       383 non-null    int32 
 5   thyroid_function      383 non-null    object
 6   physical_examination  383 non-null    object
 7   adenopathy            383 non-null    object
 8   pathology             383 non-null    object
 9   focality              383 non-null    int32 
 10  risk                  383 non-null    object
 11  t                     383 non-null    object
 12  n                     383 non-null    object
 13  m                     383 non-null    int32 
 14  stage                 383 non-null    object
 15  response              383 non-null    ob