In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report


from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV

import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import GradientBoostingClassifier


In [20]:
df_big = pd.read_csv("data/dataset_med.csv")


In [21]:
df = df_big.sample(n=5000, random_state=42).reset_index(False)
df.head()

Unnamed: 0,index,id,age,gender,country,diagnosis_date,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
0,773684,773685,37.0,Male,Lithuania,2015-09-30,Stage II,No,Current Smoker,34.5,241,0,0,0,0,Surgery,2017-05-16,0
1,278119,278120,63.0,Female,Hungary,2024-04-01,Stage III,No,Passive Smoker,22.2,162,1,1,0,0,Combined,2025-12-10,0
2,810422,810423,63.0,Female,Belgium,2015-05-08,Stage III,No,Former Smoker,22.8,230,0,0,1,0,Combined,2016-11-23,1
3,443587,443588,71.0,Male,Denmark,2014-10-05,Stage II,No,Never Smoked,32.1,293,0,0,0,0,Chemotherapy,2016-06-19,1
4,701478,701479,45.0,Female,Cyprus,2015-07-05,Stage I,No,Current Smoker,29.0,173,1,0,0,0,Surgery,2017-01-31,0


In [22]:
df['start_date']= pd.to_datetime(df['diagnosis_date'])
df['end_date']= pd.to_datetime(df['end_treatment_date'])

df['treatment_duration'] = df['end_date'] - df['start_date']


In [23]:
df.drop(['diagnosis_date','end_treatment_date','start_date','end_date', 'country'], axis=1, inplace=True)


In [24]:
df.head()

Unnamed: 0,index,id,age,gender,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,survived,treatment_duration
0,773684,773685,37.0,Male,Stage II,No,Current Smoker,34.5,241,0,0,0,0,Surgery,0,594 days
1,278119,278120,63.0,Female,Stage III,No,Passive Smoker,22.2,162,1,1,0,0,Combined,0,618 days
2,810422,810423,63.0,Female,Stage III,No,Former Smoker,22.8,230,0,0,1,0,Combined,1,565 days
3,443587,443588,71.0,Male,Stage II,No,Never Smoked,32.1,293,0,0,0,0,Chemotherapy,1,623 days
4,701478,701479,45.0,Female,Stage I,No,Current Smoker,29.0,173,1,0,0,0,Surgery,0,576 days


In [25]:
df['treatment_duration']=df['treatment_duration'].dt.days

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   index               5000 non-null   int64  
 1   id                  5000 non-null   int64  
 2   age                 5000 non-null   float64
 3   gender              5000 non-null   object 
 4   cancer_stage        5000 non-null   object 
 5   family_history      5000 non-null   object 
 6   smoking_status      5000 non-null   object 
 7   bmi                 5000 non-null   float64
 8   cholesterol_level   5000 non-null   int64  
 9   hypertension        5000 non-null   int64  
 10  asthma              5000 non-null   int64  
 11  cirrhosis           5000 non-null   int64  
 12  other_cancer        5000 non-null   int64  
 13  treatment_type      5000 non-null   object 
 14  survived            5000 non-null   int64  
 15  treatment_duration  5000 non-null   int64  
dtypes: flo

In [27]:
df.head()

Unnamed: 0,index,id,age,gender,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,survived,treatment_duration
0,773684,773685,37.0,Male,Stage II,No,Current Smoker,34.5,241,0,0,0,0,Surgery,0,594
1,278119,278120,63.0,Female,Stage III,No,Passive Smoker,22.2,162,1,1,0,0,Combined,0,618
2,810422,810423,63.0,Female,Stage III,No,Former Smoker,22.8,230,0,0,1,0,Combined,1,565
3,443587,443588,71.0,Male,Stage II,No,Never Smoked,32.1,293,0,0,0,0,Chemotherapy,1,623
4,701478,701479,45.0,Female,Stage I,No,Current Smoker,29.0,173,1,0,0,0,Surgery,0,576


In [28]:
y = df['survived']  #target variable

In [29]:
df.drop('survived',axis=1, inplace=True) #training data features

In [30]:
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split




In [31]:


num_features = ['age', 'bmi', 'cholesterol_level']
cat_features = ['gender', 'cancer_stage', 'family_history', 'smoking_status','treatment_type']

print(num_features)
print(cat_features)

['age', 'bmi', 'cholesterol_level']
['gender', 'cancer_stage', 'family_history', 'smoking_status', 'treatment_type']


In [32]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ], remainder='passthrough'
)



In [33]:


#y = preprocessor.transform(y)

X = df.drop(columns = ['id', 'treatment_duration'], axis =1)
X.shape 
#y.shape

(5000, 13)

In [34]:
y.value_counts()

class_weight = {0:1, 1:3}

In [39]:
from sklearn.model_selection import train_test_split
from imblearn.pipeline import Pipeline as ImbPipeline

sm = SMOTETomek(sampling_strategy=0.75)


X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y)


X_train.shape, X_test.shape

X_train = preprocessor.fit_transform(X_train)

X_test = preprocessor.transform(X_test)

X_train, y_train = sm.fit_resample(X_train,y_train)

y_train.value_counts()

survived
0    2191
1    1465
Name: count, dtype: int64

In [40]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted)
    recall = recall_score(true, predicted)
    f1 = f1_score(true, predicted)
    confusion_Matrix = confusion_matrix(true, predicted)
    return accuracy, precision, recall, f1 , confusion_Matrix
    


In [53]:
from sklearn.calibration import CalibratedClassifierCV

lc = LogisticRegression()
calibrated_model = CalibratedClassifierCV(lc, method='isotonic', cv=5)

calibrated_model.fit(X_train, y_train)
y_proba = calibrated_model.predict_proba(X_test)[:, 1]

from sklearn.metrics import roc_auc_score
roc_auc = roc_auc_score(y_test, y_proba)

print(roc_auc)

0.5028228355520151


In [None]:
models = {
    "Logistic Regression": LogisticRegression(class_weight=class_weight),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(class_weight=class_weight),
    "Random Forest": RandomForestClassifier(class_weight=class_weight),
    "Support Vector Classifier": SVC(probability=True),
    "XGBoost Classifier": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "CatBoost Classifier": CatBoostClassifier(verbose=False),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "GradientBoost Classifier": GradientBoostingClassifier()
}

#cv = KFold(5,random_state=None, shuffle=False)
params = {
    "Logistic Regression": {
        'classifier__penalty': ['l2'],
        'classifier__C': [0.01, 0.1, 1, 10],
        'classifier__solver': ['lbfgs', 'liblinear'],
        'classifier__class_weight': ['balanced', {0: 1, 1: 3}, {0: 1, 1: 5}]
    },
    "K-Nearest Neighbors": {
        'classifier__n_neighbors': [3, 5, 7],
        'classifier__weights': ['uniform', 'distance']
    },
    "Decision Tree": {
        'classifier__max_depth': [3, 5, 10, None],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__class_weight': ['balanced', {0: 1, 1: 3}, {0: 1, 1: 5}]
    },
    "Random Forest": {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [5, 10],
        'classifier__min_samples_split': [2, 5],
        'classifier__class_weight': ['balanced', {0: 1, 1: 3}]
    },
    "Support Vector Classifier": {
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['rbf', 'linear'],
        'classifier__class_weight': ['balanced']
    },
    "XGBoost Classifier": {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [3, 5],
        'classifier__learning_rate': [0.01, 0.1],
        'classifier__scale_pos_weight': [1, 3, 5]
    },
    "CatBoost Classifier": {
        'classifier__iterations': [100],
        'classifier__depth': [4, 6],
        'classifier__learning_rate': [0.01, 0.1],
        'classifier__scale_pos_weight': [1, 3]
    },
    "AdaBoost Classifier": {
        'classifier__n_estimators': [50, 100],
        'classifier__learning_rate': [0.01, 0.1]
    },
    "GradientBoost Classifier": {
        'classifier__n_estimators': [100],
        'classifier__max_depth': [3, 5],
        'classifier__learning_rate': [0.01, 0.1],
        'classifier__subsample': [0.8, 1.0]
    }
}



model_list = []
f1_list = []



for name, model in  models.items():
    # model = list(models.values())[i]
    # model_name = list(models.keys())[i]
    pipe = ImbPipeline(steps=[
        ("preprocessor" , preprocessor ),
        ("Smote", SMOTETomek(sampling_strategy=0.75)),
        ("classifier", model)
    ])
    clf = GridSearchCV(pipe, params[name], scoring='f1_macro', cv=StratifiedKFold(5) )
    clf.fit(X_train,y_train)

    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)

    train_accuracy, train_precision, train_recall, train_f1, train_conf = evaluate_model(y_train, y_train_pred)
    test_accuracy, test_precision, test_recall, test_f1, test_conf = evaluate_model(y_test, y_test_pred)


    print(name)  # Print model name
    model_list.append(name)

    print("confusion Mat Train: ",train_conf)
    print("confusion Mat Test: ",test_conf)
    print("*"*35)
    print("f1 train: ",train_f1)
    print("f1 test: ",test_f1)

    # print('Model performance for Training set')
    # print("- Accuracy: {:.4f}".format(train_accuracy))
    # print("- Precision: {:.4f}".format(train_precision))
    # print("- Recall: {:.4f}".format(train_recall))
    # print("- F1 Score: {:.4f}".format(train_f1))

    # print('----------------------------------')

    # print('Model performance for Test set')
    # print("- Accuracy: {:.4f}".format(test_accuracy))
    # print("- Precision: {:.4f}".format(test_precision))
    # print("- Recall: {:.4f}".format(test_recall))
    # print("- F1 Score: {:.4f}".format(test_f1))

    f1_list.append(test_f1) 
    
    print('='*35)
    print('\n')


Logistic Regression
confusion Mat Train:  [[2901    0]
 [ 849    0]]
confusion Mat Test:  [[967   0]
 [283   0]]
***********************************
f1 train:  0.0
f1 test:  0.0


K-Nearest Neighbors
confusion Mat Train:  [[2222  679]
 [ 384  465]]
confusion Mat Test:  [[644 323]
 [196  87]]
***********************************
f1 train:  0.46663321625689913
f1 test:  0.2510822510822511


Decision Tree
confusion Mat Train:  [[2229  672]
 [ 611  238]]
confusion Mat Test:  [[732 235]
 [218  65]]
***********************************
f1 train:  0.2706083001705515
f1 test:  0.22298456260720412


Random Forest
confusion Mat Train:  [[2731  170]
 [ 344  505]]
confusion Mat Test:  [[857 110]
 [245  38]]
***********************************
f1 train:  0.6627296587926509
f1 test:  0.1763341067285383




In [20]:
# needs restructuring since the data is big to train on local