In [85]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report


from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')


In [66]:
df = pd.read_csv("data/dataset_med.csv")


In [67]:
df.head()

Unnamed: 0,id,age,gender,country,diagnosis_date,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
0,1,64.0,Male,Sweden,2016-04-05,Stage I,Yes,Passive Smoker,29.4,199,0,0,1,0,Chemotherapy,2017-09-10,0
1,2,50.0,Female,Netherlands,2023-04-20,Stage III,Yes,Passive Smoker,41.2,280,1,1,0,0,Surgery,2024-06-17,1
2,3,65.0,Female,Hungary,2023-04-05,Stage III,Yes,Former Smoker,44.0,268,1,1,0,0,Combined,2024-04-09,0
3,4,51.0,Female,Belgium,2016-02-05,Stage I,No,Passive Smoker,43.0,241,1,1,0,0,Chemotherapy,2017-04-23,0
4,5,37.0,Male,Luxembourg,2023-11-29,Stage I,No,Passive Smoker,19.7,178,0,0,0,0,Combined,2025-01-08,0


In [68]:
df['start_date']= pd.to_datetime(df['diagnosis_date'])
df['end_date']= pd.to_datetime(df['end_treatment_date'])

df['treatment_duration'] = df['end_date'] - df['start_date']


In [69]:
df.drop(['diagnosis_date','end_treatment_date','start_date','end_date'], axis=1, inplace=True)


In [70]:
X = df.drop(columns =['country', 'id'], axis =1)

In [71]:
X.head()

Unnamed: 0,age,gender,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,survived,treatment_duration
0,64.0,Male,Stage I,Yes,Passive Smoker,29.4,199,0,0,1,0,Chemotherapy,0,523 days
1,50.0,Female,Stage III,Yes,Passive Smoker,41.2,280,1,1,0,0,Surgery,1,424 days
2,65.0,Female,Stage III,Yes,Former Smoker,44.0,268,1,1,0,0,Combined,0,370 days
3,51.0,Female,Stage I,No,Passive Smoker,43.0,241,1,1,0,0,Chemotherapy,0,443 days
4,37.0,Male,Stage I,No,Passive Smoker,19.7,178,0,0,0,0,Combined,0,406 days


In [72]:
X['treatment_duration']=X['treatment_duration'].dt.days

In [73]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 890000 entries, 0 to 889999
Data columns (total 14 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   age                 890000 non-null  float64
 1   gender              890000 non-null  object 
 2   cancer_stage        890000 non-null  object 
 3   family_history      890000 non-null  object 
 4   smoking_status      890000 non-null  object 
 5   bmi                 890000 non-null  float64
 6   cholesterol_level   890000 non-null  int64  
 7   hypertension        890000 non-null  int64  
 8   asthma              890000 non-null  int64  
 9   cirrhosis           890000 non-null  int64  
 10  other_cancer        890000 non-null  int64  
 11  treatment_type      890000 non-null  object 
 12  survived            890000 non-null  int64  
 13  treatment_duration  890000 non-null  int64  
dtypes: float64(2), int64(7), object(5)
memory usage: 95.1+ MB


In [74]:
X.head()

Unnamed: 0,age,gender,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,survived,treatment_duration
0,64.0,Male,Stage I,Yes,Passive Smoker,29.4,199,0,0,1,0,Chemotherapy,0,523
1,50.0,Female,Stage III,Yes,Passive Smoker,41.2,280,1,1,0,0,Surgery,1,424
2,65.0,Female,Stage III,Yes,Former Smoker,44.0,268,1,1,0,0,Combined,0,370
3,51.0,Female,Stage I,No,Passive Smoker,43.0,241,1,1,0,0,Chemotherapy,0,443
4,37.0,Male,Stage I,No,Passive Smoker,19.7,178,0,0,0,0,Combined,0,406


In [75]:
y = X['survived']  #target variable

In [76]:
X.drop('survived',axis=1, inplace=True) #training data features

In [80]:


num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

print(num_features)
print(cat_features)

Index(['age', 'bmi', 'cholesterol_level', 'hypertension', 'asthma',
       'cirrhosis', 'other_cancer', 'treatment_duration'],
      dtype='object')
Index(['gender', 'cancer_stage', 'family_history', 'smoking_status',
       'treatment_type'],
      dtype='object')


In [81]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)



In [82]:
X = preprocessor.fit_transform(X)


X.shape

(890000, 24)

In [87]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.005,random_state=42)
X_train.shape, X_test.shape

((885550, 24), (4450, 24))

In [88]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted)
    recall = recall_score(true, predicted)
    f1 = f1_score(true, predicted)
    return accuracy, precision, recall, f1
    


In [None]:
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Classifier": SVC(probability=True),
    "XGBoost Classifier": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "CatBoost Classifier": CatBoostClassifier(verbose=False),
    "AdaBoost Classifier": AdaBoostClassifier()
}

model_list = []
f1_list = []



for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_accuracy, train_precision, train_recall, train_f1 = evaluate_model(y_train, y_train_pred)
    test_accuracy, test_precision, test_recall, test_f1 = evaluate_model(y_test, y_test_pred)


    print(list(models.keys())[i])  # Print model name
    model_list.append(list(models.keys())[i])

    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(train_accuracy))
    print("- Precision: {:.4f}".format(train_precision))
    print("- Recall: {:.4f}".format(train_recall))
    print("- F1 Score: {:.4f}".format(train_f1))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Accuracy: {:.4f}".format(test_accuracy))
    print("- Precision: {:.4f}".format(test_precision))
    print("- Recall: {:.4f}".format(test_recall))
    print("- F1 Score: {:.4f}".format(test_f1))

    f1_list.append(test_f1) 
    
    print('='*35)
    print('\n')


Logistic Regression
Model performance for Training set
- Accuracy: 0.7798
- Precision: 0.0000
- Recall: 0.0000
- F1 Score: 0.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.7708
- Precision: 0.0000
- Recall: 0.0000
- F1 Score: 0.0000


