# Predictive Thyroid Disease Project - Modeling

To make a predictive model for thyroid cancer patients to see how likely after treatment their cancer will reoccur.

'Response to treatment' is considered if the cancer comes back or not

# Initial Set Up

In [1]:
# General Data Manipulation
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight
from collections import Counter


# Import my custom library 
import MyCustDataSciLib as MyCustDataSciLib

In [2]:
# Load training CSV file
df_train = pd.read_csv('4_Thyroid_Disease_FeatureEngineer_train.csv')

df_test = pd.read_csv('4_Thyroid_Disease_FeatureEngineer_test.csv') 

In [3]:
df_train.head()

Unnamed: 0,gender_m_dummy_encoded,smoking_yes_dummy_encoded,hx_smoking_yes_dummy_encoded,hx_radiotherapy_yes_dummy_encoded,thyroid_function_2cat_normal_dummy_encoded,focality_uni-focal_dummy_encoded,m_m1_dummy_encoded,stage_2cat_late_dummy_encoded,recurrence_yes_dummy_encoded,kmode_cluster_3cat_one_hot_encoded,...,physical_examination_label_encoded,response_label_encoded,adenopathy_label_encoded,pathology_label_encoded,risk_label_encoded,stage_label_encoded,thyroid_function_label_encoded,thyroid_function_3cat_label_encoded,age_group_label_encoded,t_4cat_label_encoded
0,False,False,False,False,True,False,False,False,False,1,...,1,1,3,3,2,0,2,1,3,1
1,False,False,False,False,True,False,False,False,True,2,...,1,3,5,3,1,1,2,1,5,2
2,False,False,False,False,True,True,False,False,False,1,...,1,2,3,3,2,0,2,1,1,1
3,False,False,False,False,True,False,False,False,False,1,...,3,1,3,1,2,0,2,1,2,1
4,False,False,False,False,True,True,False,False,True,2,...,3,3,2,3,1,0,2,1,2,1


In [4]:
df_test.head()

Unnamed: 0,gender_m_dummy_encoded,smoking_yes_dummy_encoded,hx_smoking_yes_dummy_encoded,hx_radiotherapy_yes_dummy_encoded,thyroid_function_2cat_normal_dummy_encoded,focality_uni-focal_dummy_encoded,m_m1_dummy_encoded,stage_2cat_late_dummy_encoded,recurrence_yes_dummy_encoded,kmode_cluster_3cat_one_hot_encoded,...,physical_examination_label_encoded,response_label_encoded,adenopathy_label_encoded,pathology_label_encoded,risk_label_encoded,stage_label_encoded,thyroid_function_label_encoded,thyroid_function_3cat_label_encoded,age_group_label_encoded,t_4cat_label_encoded
0,False,False,False,False,True,True,False,False,False,0,...,3,1,3,3,2,0,2,1,2,2
1,False,False,False,False,False,True,False,False,False,0,...,2,1,5,0,2,0,4,2,2,2
2,False,False,False,False,True,False,False,False,True,2,...,1,3,0,3,1,0,2,1,2,2
3,False,False,False,False,True,True,False,False,True,2,...,3,3,3,0,1,0,2,1,4,2
4,False,False,True,False,True,True,False,False,False,1,...,4,1,3,3,2,0,2,1,3,0


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 88 columns):
 #   Column                                                            Non-Null Count  Dtype
---  ------                                                            --------------  -----
 0   gender_m_dummy_encoded                                            306 non-null    bool 
 1   smoking_yes_dummy_encoded                                         306 non-null    bool 
 2   hx_smoking_yes_dummy_encoded                                      306 non-null    bool 
 3   hx_radiotherapy_yes_dummy_encoded                                 306 non-null    bool 
 4   thyroid_function_2cat_normal_dummy_encoded                        306 non-null    bool 
 5   focality_uni-focal_dummy_encoded                                  306 non-null    bool 
 6   m_m1_dummy_encoded                                                306 non-null    bool 
 7   stage_2cat_late_dummy_encoded                        

In [6]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 88 columns):
 #   Column                                                            Non-Null Count  Dtype
---  ------                                                            --------------  -----
 0   gender_m_dummy_encoded                                            77 non-null     bool 
 1   smoking_yes_dummy_encoded                                         77 non-null     bool 
 2   hx_smoking_yes_dummy_encoded                                      77 non-null     bool 
 3   hx_radiotherapy_yes_dummy_encoded                                 77 non-null     bool 
 4   thyroid_function_2cat_normal_dummy_encoded                        77 non-null     bool 
 5   focality_uni-focal_dummy_encoded                                  77 non-null     bool 
 6   m_m1_dummy_encoded                                                77 non-null     bool 
 7   stage_2cat_late_dummy_encoded                          

In [7]:
df_train.shape

(306, 88)

In [8]:
df_test.shape

(77, 88)

In [9]:
# Define features and target
X_train = df_train.drop('recurrence_yes_dummy_encoded', axis=1)
y_train = df_train['recurrence_yes_dummy_encoded']
X_test = df_test.drop('recurrence_yes_dummy_encoded', axis=1)
y_test = df_test['recurrence_yes_dummy_encoded']

# Address Data Imbalance

In [10]:
# check if balanced 
Counter(X_train)
Counter(y_train)

Counter({False: 217, True: 89})

In [11]:
# Address data imbalance with SMOTE
smote = SMOTE()
X_train_SMOTE, y_train_SMOTE = smote.fit_resample(X_train, y_train)

In [12]:
# check if balanced
Counter(X_train_SMOTE)
Counter(y_train_SMOTE)

Counter({False: 217, True: 217})

# Model knowledge

Reminder

Precision
- True Positives. Avoiding false alarms. It evaluates the quality of positive predictions made by the model
- Higher precision to recall ratio requires making stricter predictions, which may result in missing some positives (lower recall).

Recall
- Ensuring that no actual positives are missed. It evaluates the coverage of actual positives by the model.
- Higher recall to precision ration requires casting a wider net, which may result in more false positives (lower precision).

F1-Score
- an algorithm to balance Precision and Recall into one value

Support
- Number of true instances

accuracy
- correct prediction / total prediction


# Model 1 - Decision Tree

In [13]:
# Decision Tree 
dt = DecisionTreeClassifier(max_depth=10, min_samples_split=5, min_samples_leaf=2, class_weight='balanced')
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
print("Decision Tree:")
print(classification_report(y_test, y_pred_dt))

Decision Tree:
              precision    recall  f1-score   support

       False       0.98      0.93      0.96        58
        True       0.82      0.95      0.88        19

    accuracy                           0.94        77
   macro avg       0.90      0.94      0.92        77
weighted avg       0.94      0.94      0.94        77



In [14]:
# The model performs very well overall, with high precision, recall, and F1-scores for both classes.
# The slight imbalance in class distribution (58 False vs. 19 True) might make the metrics slightly biased towards the majority class, but this doesn’t seem significant here.

' \nThe model performs very well overall, with high precision, recall, and F1-scores for both classes.\nhe slight imbalance in class distribution (58 False vs. 19 True) might make the metrics slightly biased towards the majority class, but this doesn’t seem significant here.\n'

In [15]:
# Decision Tree with SMOTE to oversample the minority class
dt = DecisionTreeClassifier(max_depth=10, min_samples_split=5, min_samples_leaf=2)
dt.fit(X_train_SMOTE, y_train_SMOTE)
y_pred_dt = dt.predict(X_test)
print("Decision Tree:")
print(classification_report(y_test, y_pred_dt))

Decision Tree:
              precision    recall  f1-score   support

       False       0.96      0.95      0.96        58
        True       0.85      0.89      0.87        19

    accuracy                           0.94        77
   macro avg       0.91      0.92      0.91        77
weighted avg       0.94      0.94      0.94        77



In [16]:
# The model performs even better with the synthetic balanced data.

' \nThe model performs even better with the synthetic balanced data.\n'

# Model 2 - Random Forest

In [17]:
# Random Forest
rf = RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_split=5, min_samples_leaf=2, class_weight='balanced', random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest:")
print(classification_report(y_test, y_pred_rf))

Random Forest:
              precision    recall  f1-score   support

       False       0.98      0.98      0.98        58
        True       0.95      0.95      0.95        19

    accuracy                           0.97        77
   macro avg       0.97      0.97      0.97        77
weighted avg       0.97      0.97      0.97        77



In [None]:
# nice score overall and really like the precision recall ratio on both the false and true.

In [18]:
# Random Forest, giving more weight to the minority class using class_weight='balanced'
rf = RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_split=5, min_samples_leaf=2, class_weight='balanced', random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest:")
print(classification_report(y_test, y_pred_rf))

Random Forest:
              precision    recall  f1-score   support

       False       0.98      0.98      0.98        58
        True       0.95      0.95      0.95        19

    accuracy                           0.97        77
   macro avg       0.97      0.97      0.97        77
weighted avg       0.97      0.97      0.97        77



In [None]:
# Setting class weight to balanced practically has no change overall. Ensemble method decision trees are really great as even natively can handle imbalance data well.

# Model 3 - Gradient Boosting

In [19]:
# Gradient Boosting
gb = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=5, subsample=0.8)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)
print("Gradient Boosting:") 
print(classification_report(y_test, y_pred_gb))

Gradient Boosting:
              precision    recall  f1-score   support

       False       0.97      0.98      0.97        58
        True       0.94      0.89      0.92        19

    accuracy                           0.96        77
   macro avg       0.96      0.94      0.95        77
weighted avg       0.96      0.96      0.96        77



In [None]:
# pretty good scores but I wonder why for the true answers the ratio of precision and recall is high.

In [20]:
# Gradient Boosting, using SMOTE synthetic data to oversample the minority class
gb = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=5, subsample=0.8)
gb.fit(X_train_SMOTE, y_train_SMOTE)
y_pred_gb = gb.predict(X_test)
print("Gradient Boosting:") 
print(classification_report(y_test, y_pred_gb))

Gradient Boosting:
              precision    recall  f1-score   support

       False       0.98      0.98      0.98        58
        True       0.95      0.95      0.95        19

    accuracy                           0.97        77
   macro avg       0.97      0.97      0.97        77
weighted avg       0.97      0.97      0.97        77



In [None]:
# the SMOTE balanced data definitely improved performance overall while balancing out the precision and recall ratios. 

# Final Decision

Accuracy for all models are above 95%.

Decide to go with the Random Forest Model as seems to have the a really high performance while balancign out the precision and recall ratios for both True and False answers, and did this even on the imbalanced data. 

Also the random forest method is future proof as easily scalable do to it's parallel nature with parallel hardware.