In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report


from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')


In [2]:
df_big = pd.read_csv("data/dataset_med.csv")


In [3]:
df = df_big.sample(n=5000, random_state=42).reset_index(False)
df.head()

Unnamed: 0,index,id,age,gender,country,diagnosis_date,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
0,773684,773685,37.0,Male,Lithuania,2015-09-30,Stage II,No,Current Smoker,34.5,241,0,0,0,0,Surgery,2017-05-16,0
1,278119,278120,63.0,Female,Hungary,2024-04-01,Stage III,No,Passive Smoker,22.2,162,1,1,0,0,Combined,2025-12-10,0
2,810422,810423,63.0,Female,Belgium,2015-05-08,Stage III,No,Former Smoker,22.8,230,0,0,1,0,Combined,2016-11-23,1
3,443587,443588,71.0,Male,Denmark,2014-10-05,Stage II,No,Never Smoked,32.1,293,0,0,0,0,Chemotherapy,2016-06-19,1
4,701478,701479,45.0,Female,Cyprus,2015-07-05,Stage I,No,Current Smoker,29.0,173,1,0,0,0,Surgery,2017-01-31,0


In [4]:
df['start_date']= pd.to_datetime(df['diagnosis_date'])
df['end_date']= pd.to_datetime(df['end_treatment_date'])

df['treatment_duration'] = df['end_date'] - df['start_date']


In [5]:
df.drop(['diagnosis_date','end_treatment_date','start_date','end_date', 'country'], axis=1, inplace=True)


In [6]:
df.head()

Unnamed: 0,index,id,age,gender,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,survived,treatment_duration
0,773684,773685,37.0,Male,Stage II,No,Current Smoker,34.5,241,0,0,0,0,Surgery,0,594 days
1,278119,278120,63.0,Female,Stage III,No,Passive Smoker,22.2,162,1,1,0,0,Combined,0,618 days
2,810422,810423,63.0,Female,Stage III,No,Former Smoker,22.8,230,0,0,1,0,Combined,1,565 days
3,443587,443588,71.0,Male,Stage II,No,Never Smoked,32.1,293,0,0,0,0,Chemotherapy,1,623 days
4,701478,701479,45.0,Female,Stage I,No,Current Smoker,29.0,173,1,0,0,0,Surgery,0,576 days


In [7]:
df['treatment_duration']=df['treatment_duration'].dt.days

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   index               5000 non-null   int64  
 1   id                  5000 non-null   int64  
 2   age                 5000 non-null   float64
 3   gender              5000 non-null   object 
 4   cancer_stage        5000 non-null   object 
 5   family_history      5000 non-null   object 
 6   smoking_status      5000 non-null   object 
 7   bmi                 5000 non-null   float64
 8   cholesterol_level   5000 non-null   int64  
 9   hypertension        5000 non-null   int64  
 10  asthma              5000 non-null   int64  
 11  cirrhosis           5000 non-null   int64  
 12  other_cancer        5000 non-null   int64  
 13  treatment_type      5000 non-null   object 
 14  survived            5000 non-null   int64  
 15  treatment_duration  5000 non-null   int64  
dtypes: flo

In [9]:
df.head()

Unnamed: 0,index,id,age,gender,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,survived,treatment_duration
0,773684,773685,37.0,Male,Stage II,No,Current Smoker,34.5,241,0,0,0,0,Surgery,0,594
1,278119,278120,63.0,Female,Stage III,No,Passive Smoker,22.2,162,1,1,0,0,Combined,0,618
2,810422,810423,63.0,Female,Stage III,No,Former Smoker,22.8,230,0,0,1,0,Combined,1,565
3,443587,443588,71.0,Male,Stage II,No,Never Smoked,32.1,293,0,0,0,0,Chemotherapy,1,623
4,701478,701479,45.0,Female,Stage I,No,Current Smoker,29.0,173,1,0,0,0,Surgery,0,576


In [10]:
y = df['survived']  #target variable

In [11]:
df.drop('survived',axis=1, inplace=True) #training data features

In [12]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split




In [13]:


num_features = df.select_dtypes(exclude="object").columns
cat_features = df.select_dtypes(include="object").columns

print(num_features)
print(cat_features)

Index(['index', 'id', 'age', 'bmi', 'cholesterol_level', 'hypertension',
       'asthma', 'cirrhosis', 'other_cancer', 'treatment_duration'],
      dtype='object')
Index(['gender', 'cancer_stage', 'family_history', 'smoking_status',
       'treatment_type'],
      dtype='object')


In [14]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ], remainder="drop"
)



In [16]:


#y = preprocessor.transform(y)

X = df
X.shape 
#y.shape

(5000, 15)

In [17]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y)



X_train.shape, X_test.shape

X_train = preprocessor.fit_transform(X_train)

X_test = preprocessor.transform(X_test)

X_train, y_train = SMOTE(random_state=42).fit_resample(X_train,y_train)

In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted)
    recall = recall_score(true, predicted)
    f1 = f1_score(true, predicted)
    return accuracy, precision, recall, f1
    


In [19]:
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Classifier": SVC(probability=True),
    "XGBoost Classifier": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "CatBoost Classifier": CatBoostClassifier(verbose=False),
    "AdaBoost Classifier": AdaBoostClassifier()
}

model_list = []
f1_list = []



for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_accuracy, train_precision, train_recall, train_f1 = evaluate_model(y_train, y_train_pred)
    test_accuracy, test_precision, test_recall, test_f1 = evaluate_model(y_test, y_test_pred)


    print(list(models.keys())[i])  # Print model name
    model_list.append(list(models.keys())[i])

    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(train_accuracy))
    print("- Precision: {:.4f}".format(train_precision))
    print("- Recall: {:.4f}".format(train_recall))
    print("- F1 Score: {:.4f}".format(train_f1))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Accuracy: {:.4f}".format(test_accuracy))
    print("- Precision: {:.4f}".format(test_precision))
    print("- Recall: {:.4f}".format(test_recall))
    print("- F1 Score: {:.4f}".format(test_f1))

    f1_list.append(test_f1) 
    
    print('='*35)
    print('\n')


Logistic Regression
Model performance for Training set
- Accuracy: 0.5219
- Precision: 0.5221
- Recall: 0.5181
- F1 Score: 0.5201
----------------------------------
Model performance for Test set
- Accuracy: 0.4904
- Precision: 0.2190
- Recall: 0.4876
- F1 Score: 0.3023


K-Nearest Neighbors
Model performance for Training set
- Accuracy: 0.8180
- Precision: 0.7381
- Recall: 0.9859
- F1 Score: 0.8442
----------------------------------
Model performance for Test set
- Accuracy: 0.5272
- Precision: 0.2459
- Recall: 0.5265
- F1 Score: 0.3352


Decision Tree
Model performance for Training set
- Accuracy: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- F1 Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.6416
- Precision: 0.2259
- Recall: 0.2403
- F1 Score: 0.2329


Random Forest
Model performance for Training set
- Accuracy: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- F1 Score: 1.0000
----------------------------------
Model performance for Test s

In [None]:
# needs restructuring since the data is big to train on local

In [None]:
df.reducesize(0.2)

AttributeError: 'DataFrame' object has no attribute 'reducesize'