In [5]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from ydata_profiling import ProfileReport

import warnings
warnings.filterwarnings("ignore")

import pickle
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import (
          ConfusionMatrixDisplay, confusion_matrix, classification_report, 
          accuracy_score,precision_score,recall_score,roc_auc_score,roc_curve,f1_score)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier


In [3]:
# Environment sanity check: show the path of the Python interpreter in use.
print(sys.executable)


d:\SWS\envs\ME\python.exe


In [6]:
# Load the raw dataset from the project-relative CSV and preview the first rows.
df = pd.read_csv('../dataset/data.csv')
df.head()


Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
0,27,F,No,No,No,Euthyroid,Single nodular goiter-left,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Indeterminate,No
1,34,F,No,Yes,No,Euthyroid,Multinodular goiter,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
2,30,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
3,62,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
4,62,F,No,No,No,Euthyroid,Multinodular goiter,No,Micropapillary,Multi-Focal,Low,T1a,N0,M0,I,Excellent,No


In [7]:
# (rows, columns) of the raw frame, before any cleaning.
df.shape


(383, 17)

In [8]:
# Column names exactly as read from the CSV. The encoding cell further down
# refers to them verbatim (including the 'Hx Radiothreapy' spelling).
df.columns


Index(['Age', 'Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy',
       'Thyroid Function', 'Physical Examination', 'Adenopathy', 'Pathology',
       'Focality', 'Risk', 'T', 'N', 'M', 'Stage', 'Response', 'Recurred'],
      dtype='object')

In [9]:
# Inspect dtypes to see which columns are non-numeric and need encoding.
df.dtypes


Age                      int64
Gender                  object
Smoking                 object
Hx Smoking              object
Hx Radiothreapy         object
Thyroid Function        object
Physical Examination    object
Adenopathy              object
Pathology               object
Focality                object
Risk                    object
T                       object
N                       object
M                       object
Stage                   object
Response                object
Recurred                object
dtype: object

# Data Preprocessing
## EDA (Exploratory Data Analysis)

In [10]:
# Build an automated profiling report of the raw data and export it as HTML.
# The output folder is created first so the export does not depend on the
# directory already existing (e.g. on a fresh checkout).
report_path = '../output/graph/data_profile.html'
os.makedirs(os.path.dirname(report_path), exist_ok=True)
profile = ProfileReport(df)
profile.to_file(output_file=report_path)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 17/17 [00:00<00:00, 24.18it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
# Show rows that are exact repeats of an earlier row (first occurrences are not listed).
df[df.duplicated()]


Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
32,36,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
38,40,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
40,51,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
66,35,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Papillary,Uni-Focal,Low,T1b,N0,M0,I,Excellent,No
69,51,F,No,No,No,Euthyroid,Single nodular goiter-left,No,Papillary,Uni-Focal,Low,T1b,N0,M0,I,Excellent,No
77,29,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Papillary,Uni-Focal,Low,T1b,N0,M0,I,Excellent,No
120,37,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Papillary,Uni-Focal,Low,T2,N0,M0,I,Excellent,No
121,26,F,No,No,No,Euthyroid,Multinodular goiter,No,Papillary,Uni-Focal,Low,T2,N0,M0,I,Excellent,No
123,28,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Papillary,Uni-Focal,Low,T2,N0,M0,I,Excellent,No
132,32,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Papillary,Uni-Focal,Low,T2,N0,M0,I,Excellent,No


In [12]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# NOTE: the surviving rows keep their original index labels, so the index has
# gaps from here on; anything aligned by label must account for that.
df = df.drop_duplicates()


In [13]:
# (rows, columns) after de-duplication.
df.shape


(364, 17)

In [14]:
# Encoder for the raw feature frame:
#   * nominal columns -> one-hot encoded, first level of each dropped
#   * ordered columns -> integer ranks following the category order given below
#   * every other column (Age) -> passed through unchanged
# The Stage categories are listed in clinical order (I < II < III < IVA < IVB)
# so that the encoded rank is monotonic in disease stage.
# The transformer name 'catagorical' is kept as-is because it prefixes the
# generated feature names.
transformer = ColumnTransformer(
    transformers=[
        (
            'catagorical',
            OneHotEncoder(drop='first'),
            ['Gender', 'Thyroid Function', 'Physical Examination', 'Adenopathy', 'Pathology'],
        ),
        (
            'ordinal',
            OrdinalEncoder(categories=[
                ['No', 'Yes'],                                       # Smoking
                ['No', 'Yes'],                                       # Hx Smoking
                ['No', 'Yes'],                                       # Hx Radiothreapy
                ['Uni-Focal', 'Multi-Focal'],                        # Focality
                ['Low', 'Intermediate', 'High'],                     # Risk
                ['T1a', 'T1b', 'T2', 'T3a', 'T3b', 'T4a', 'T4b'],    # T
                ['N0', 'N1a', 'N1b'],                                # N
                ['M0', 'M1'],                                        # M
                ['I', 'II', 'III', 'IVA', 'IVB'],                    # Stage
                ['Excellent', 'Indeterminate',
                 'Biochemical Incomplete', 'Structural Incomplete'],  # Response
            ]),
            ['Smoking', 'Hx Smoking', 'Hx Radiothreapy', 'Focality', 'Risk',
             'T', 'N', 'M', 'Stage', 'Response'],
        ),
    ],
    remainder='passthrough',
)


In [15]:
# Feature frame: every column except the target.
X = df.drop(columns="Recurred")


In [16]:
# Binary target: "No" -> 0, "Yes" -> 1. The Series keeps df's index labels.
y = df['Recurred'].map({"No": 0 , "Yes":1})


In [17]:
# Fit the encoders on the feature frame and encode it in one step.
# NOTE(review): this runs before the train/test split, so the encoders are
# fitted on all rows, including those later used for testing.
X_transformed = transformer.fit_transform(X)


In [18]:
# List the generated feature names (each prefixed with its transformer name)
# to check the column order of X_transformed.
transformed_feature_names = transformer.get_feature_names_out(input_features=X.columns)
print(transformed_feature_names)


['catagorical__Gender_M'
 'catagorical__Thyroid Function_Clinical Hypothyroidism'
 'catagorical__Thyroid Function_Euthyroid'
 'catagorical__Thyroid Function_Subclinical Hyperthyroidism'
 'catagorical__Thyroid Function_Subclinical Hypothyroidism'
 'catagorical__Physical Examination_Multinodular goiter'
 'catagorical__Physical Examination_Normal'
 'catagorical__Physical Examination_Single nodular goiter-left'
 'catagorical__Physical Examination_Single nodular goiter-right'
 'catagorical__Adenopathy_Extensive' 'catagorical__Adenopathy_Left'
 'catagorical__Adenopathy_No' 'catagorical__Adenopathy_Posterior'
 'catagorical__Adenopathy_Right' 'catagorical__Pathology_Hurthel cell'
 'catagorical__Pathology_Micropapillary'
 'catagorical__Pathology_Papillary' 'ordinal__Smoking'
 'ordinal__Hx Smoking' 'ordinal__Hx Radiothreapy' 'ordinal__Focality'
 'ordinal__Risk' 'ordinal__T' 'ordinal__N' 'ordinal__M' 'ordinal__Stage'
 'ordinal__Response' 'remainder__Age']


In [19]:
# Human-friendly column names for the encoded matrix, in the same order as the
# transformer's output: one-hot columns, then ordinal columns, then Age.
transformed_cols = [
    # --- one-hot encoded ---
    'Gender',
    'Thyroid Function_Clinical Hypothyroidism',
    'Thyroid Function_Euthyroid',
    'Thyroid Function_Subclinical Hyperthyroidism',
    'Thyroid Function_Subclinical Hypothyroidism',
    'Physical Examination_Multinodular goiter',
    'Physical Examination_Normal',
    'Physical Examination_Single nodular goiter-left',
    'Physical Examination_Single nodular goiter-right',
    'Adenopathy_Extensive',
    'Adenopathy_Left',
    'Adenopathy_No',
    'Adenopathy_Posterior',
    'Adenopathy_Right',
    'Pathology_Hurthel cell',
    'Pathology_Micropapillary',
    'Pathology_Papillary',
    # --- ordinal encoded ---
    'Smoking',
    'Hx Smoking',
    'Hx Radiothreapy',
    'Focality',
    'Risk',
    'T',
    'N',
    'M',
    'Stage',
    'Response',
    # --- passthrough ---
    'Age',
]


In [20]:
# Wrap the encoded matrix in a DataFrame and attach the target.
# The frame is built on y's index: column assignment aligns on index labels,
# and y still carries the gapped labels left by drop_duplicates. With a default
# 0..n-1 index the targets would be matched to the wrong rows or become NaN.
transformed_df = pd.DataFrame(data=X_transformed, columns=transformed_cols, index=y.index)
transformed_df['Recurred'] = y
transformed_df.head()


Unnamed: 0,Gender,Thyroid Function_Clinical Hypothyroidism,Thyroid Function_Euthyroid,Thyroid Function_Subclinical Hyperthyroidism,Thyroid Function_Subclinical Hypothyroidism,Physical Examination_Multinodular goiter,Physical Examination_Normal,Physical Examination_Single nodular goiter-left,Physical Examination_Single nodular goiter-right,Adenopathy_Extensive,...,Hx Radiothreapy,Focality,Risk,T,N,M,Stage,Response,Age,Recurred
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,27.0,0.0
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.0,0.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,62.0,0.0


In [21]:
# Hold out 20% of the rows, then halve that hold-out into test and validation
# sets (roughly 80/10/10 overall). Both splits live in one cell and the
# intermediate hold-out has its own name, so re-running never re-splits an
# already-halved test set. Seeds are unchanged, so the resulting splits are
# the same as before.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X_transformed, y, test_size=0.20, random_state=1
)
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=1
)


# Model Training

In [64]:
# Standardising scaler; it is fitted on the training split in the next cell.
scaler = StandardScaler()


In [65]:
# Learn the scaling parameters from the training split only, then apply the
# same fitted scaler to every split.
scaler.fit(X_train)
X_train_scaled, X_test_scaled, X_val_scaled = (
    scaler.transform(split) for split in (X_train, X_test, X_val)
)


In [66]:
# Decision tree with default hyperparameters. The seed is fixed (same value as
# the data splits) so the fitted tree, and the metrics below, are reproducible
# across kernel restarts.
dt = DecisionTreeClassifier(random_state=1)


In [67]:
# Train the tree on the scaled training split.
dt.fit(X_train_scaled,y_train)


In [68]:
# Class-label predictions of the in-memory tree on the scaled test split.
y_pred_dt = dt.predict(X_test_scaled)


In [None]:
# Save the fitted decision tree. The target folder is created first so the
# write does not fail when ../output/models does not exist yet.
# NOTE(review): only the tree is pickled; the fitted scaler and column
# transformer are not, so they must be recreated to score new data.
model_path = '../output/models/decision_tree_model_0.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as file:
    pickle.dump(dt, file)



In [None]:
# Reload the pickled tree from disk to check that the saved file round-trips.
# pickle.load can execute arbitrary code, so only load files this notebook wrote.
with open('../output/models/decision_tree_model_0.pkl', 'rb') as file:
    loaded_model = pickle.load(file)


In [32]:

# Class-label predictions of the reloaded tree on the scaled test split.
predictions = loaded_model.predict(X_test_scaled)


In [28]:
def evaluate_model(y_test, y_pred, y_score=None):
    """Print accuracy, precision, recall, ROC AUC and F1 for a binary classifier.

    All metrics are printed as percentages with two decimals.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    y_pred : array-like
        Predicted class labels.
    y_score : array-like, optional
        Continuous scores or positive-class probabilities, e.g.
        ``model.predict_proba(X)[:, 1]``. When given, the AUC is computed from
        these scores. When omitted, the AUC is computed from ``y_pred`` as
        before, which evaluates the ROC curve at a single threshold only.

    Returns
    -------
    None
    """
    # Fall back to the hard labels so existing two-argument calls are unchanged.
    auc_input = y_pred if y_score is None else y_score
    metrics = [
        ("Accuracy", accuracy_score(y_test, y_pred)),
        ("Precision", precision_score(y_test, y_pred)),
        ("Recall", recall_score(y_test, y_pred)),
        ("AUC value", roc_auc_score(y_test, auc_input)),
        ("F1 score", f1_score(y_test, y_pred)),
    ]
    for label, value in metrics:
        print("%s of the model is: %.2f" % (label, value * 100), "%")


In [62]:
# Test-split metrics for the model reloaded from disk.
evaluate_model(y_test, predictions)


Accuracy of the model is: 88.89 %
Precision of the model is: 83.33 %
Recall of the model is: 83.33 %
AUC value of the model is: 87.50 %
F1 score of the model is: 83.33 %


In [69]:
# Test-split metrics for the in-memory tree (for comparison with the reloaded model).
evaluate_model(y_test,y_pred_dt)


Accuracy of the model is: 88.89 %
Precision of the model is: 83.33 %
Recall of the model is: 83.33 %
AUC value of the model is: 87.50 %
F1 score of the model is: 83.33 %


In [70]:
# Per-class precision/recall/F1 and support on the test split.
print(classification_report(y_test,y_pred_dt))


              precision    recall  f1-score   support

           0       0.92      0.92      0.92        24
           1       0.83      0.83      0.83        12

    accuracy                           0.89        36
   macro avg       0.88      0.88      0.88        36
weighted avg       0.89      0.89      0.89        36



In [71]:
# Display names for the two target classes, in encoded order (0 = "No", 1 = "Yes").
labels = ['No recurrence', 'Recurrence']


In [73]:
# Confusion matrix on the test split, with the axes labelled by class name
# rather than the raw 0/1 codes. The trailing ';' hides the object repr.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_dt, display_labels=labels);


<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x264d0071f10>