In [48]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, auc, precision_score, recall_score, f1_score, confusion_matrix
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier

In [104]:
df = pd.read_csv("heart_3datasets_section4_comimp_for_modeling.csv")
df.loc[df['target'] == 2, 'target'] = 1

In [105]:
X = df.drop('target', axis=1)
y = df['target']

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3091 entries, 0 to 3090
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         3091 non-null   float64
 1   trestbps    3091 non-null   float64
 2   chol        3091 non-null   float64
 3   thalach     3091 non-null   float64
 4   oldpeak     3091 non-null   float64
 5   ThalCA_PC1  3091 non-null   float64
 6   ThalCA_PC2  3091 non-null   float64
 7   cp          3091 non-null   float64
 8   exang       3091 non-null   float64
 9   fbs         3091 non-null   float64
 10  restecg     3091 non-null   float64
 11  sex         3091 non-null   float64
 12  slope       3091 non-null   float64
 13  target      3091 non-null   float64
dtypes: float64(14)
memory usage: 338.2 KB


In [106]:
X_trainval, X_test, y_trainval, y_test = train_test_split(X,y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_trainval,y_trainval, test_size=0.2, random_state=42, stratify=y_trainval)


# uncomment this for dataset phase 4
# cat_feats = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope']

# uncomment this for dataset phase 1,2
# cat_feats = ['sex', 'Chest pain type', 'fasting blood pressure', 'resting ecg', 'exercise angina', 'ST slope']

In [107]:
base_model = ExtraTreesClassifier(random_state=42)

from imblearn.over_sampling import SMOTENC, SMOTE

smote_enc = SMOTENC(random_state=42, categorical_features=cat_feats)
smote = SMOTE(random_state=42)
# smote_enc = SMOTE(random_state=42)

X_train, y_train = smote_enc.fit_resample(X_train, y_train)



In [108]:
base_model.fit(X_train, y_train)

In [109]:
y_pred = base_model.predict(X_test)

In [110]:
print('F1 score: ', f1_score(y_pred, y_test))
print('Accuracy score: ', accuracy_score(y_pred, y_test))
print('Precision score: ', precision_score(y_pred, y_test))
print('Recall score: ', recall_score(y_pred, y_test))
print('roc-auc', roc_auc_score(y_pred, y_test))
print(classification_report(y_pred, y_test))

F1 score:  0.8973607038123167
Accuracy score:  0.8869143780290791
Precision score:  0.884393063583815
Recall score:  0.9107142857142857
roc-auc 0.88468576476527
              precision    recall  f1-score   support

         0.0       0.89      0.86      0.87       283
         1.0       0.88      0.91      0.90       336

    accuracy                           0.89       619
   macro avg       0.89      0.88      0.89       619
weighted avg       0.89      0.89      0.89       619



In [66]:
base_model.get_params()

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [95]:
y_train.value_counts()

target
1    1009
0    1009
Name: count, dtype: int64

In [113]:
import optuna
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score, KFold
from sklearn.datasets import load_iris
from sklearn.exceptions import FitFailedWarning
import warnings

# --- 2. Define the Objective Function for Optuna ---
def objective(trial):
    """
    Defines the search space and the metric to optimize.
    'trial' is an Optuna object suggesting hyperparameter values.
    """
    
    # --- Define the Hyperparameter Search Space ---
    # We use 'suggest_' methods to define the range for each parameter
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
    }

    
    # --- Initialize and Evaluate the Model ---
    model = ExtraTreesClassifier(**params, random_state=42)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    return accuracy_score(y_pred, y_val)


# --- 3. Run the Optimization ---

# Suppress warnings during optimization (optional)
warnings.filterwarnings('ignore', category=FitFailedWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# Create a study object and specify direction ("maximize" accuracy)
study = optuna.create_study(direction='maximize')

# Start the optimization. 'n_trials' is the number of combinations to test.
study.optimize(objective, n_trials=100, show_progress_bar=True)

# --- 4. Get the Results ---
warnings.filterwarnings('default') # Re-enable warnings

print("\n" + "="*30)
print("Optuna Optimization Finished.")
print(f"Best trial accuracy: {study.best_value:.4f}")
print("Best hyperparameters found:")
print(study.best_params)

# --- 5. Train Final Model (Optional) ---
# You can now train the final model using the best parameters
best_params = study.best_params
final_model = ExtraTreesClassifier(**best_params, random_state=42)
# smote_enc = SMOTE(random_state=42)
# X_trainval, y_trainval = smote_enc.fit_resample(X_trainval, y_trainval)
final_model.fit(X_trainval, y_trainval)
print("\nFinal model trained with best parameters.")

[I 2025-09-21 23:51:31,104] A new study created in memory with name: no-name-16e36308-371f-44d1-baba-ca041a7d3d9f


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-09-21 23:51:31,902] Trial 0 finished with value: 0.8363636363636363 and parameters: {'n_estimators': 897, 'max_depth': 12, 'min_samples_split': 12, 'min_samples_leaf': 2, 'max_features': 'log2', 'criterion': 'entropy'}. Best is trial 0 with value: 0.8363636363636363.
[I 2025-09-21 23:51:32,241] Trial 1 finished with value: 0.806060606060606 and parameters: {'n_estimators': 486, 'max_depth': 24, 'min_samples_split': 6, 'min_samples_leaf': 15, 'max_features': 'sqrt', 'criterion': 'gini'}. Best is trial 0 with value: 0.8363636363636363.
[I 2025-09-21 23:51:32,476] Trial 2 finished with value: 0.795959595959596 and parameters: {'n_estimators': 342, 'max_depth': 21, 'min_samples_split': 10, 'min_samples_leaf': 18, 'max_features': 'log2', 'criterion': 'gini'}. Best is trial 0 with value: 0.8363636363636363.
[I 2025-09-21 23:51:32,991] Trial 3 finished with value: 0.806060606060606 and parameters: {'n_estimators': 464, 'max_depth': 6, 'min_samples_split': 18, 'min_samples_leaf': 7, 'm

In [114]:
y_pred = final_model.predict(X_test)
print('F1 score: ', f1_score(y_pred, y_test))
print('Accuracy score: ', accuracy_score(y_pred, y_test))
print('Precision score: ', precision_score(y_pred, y_test))
print('Recall score: ', recall_score(y_pred, y_test))
print('roc-auc', roc_auc_score(y_pred, y_test))
print(classification_report(y_pred, y_test))

F1 score:  0.9025487256371814
Accuracy score:  0.8949919224555735
Precision score:  0.869942196531792
Recall score:  0.9376947040498442
roc-auc 0.8933439963202241
              precision    recall  f1-score   support

         0.0       0.93      0.85      0.89       298
         1.0       0.87      0.94      0.90       321

    accuracy                           0.89       619
   macro avg       0.90      0.89      0.89       619
weighted avg       0.90      0.89      0.89       619



In [100]:
another_model = ExtraTreesClassifier(**best_params, random_state=42)
another_model.fit(X_trainval, y_trainval)
predict = another_model.predict(X_test)
print('F1 score: ', f1_score(predict, y_test))
print('Accuracy score: ', accuracy_score(predict, y_test))
print('Precision score: ', precision_score(predict, y_test))
print('Recall score: ', recall_score(predict, y_test))
print('roc-auc', roc_auc_score(predict, y_test))
print(classification_report(predict, y_test))

F1 score:  0.9058441558441559
Accuracy score:  0.9006849315068494
Precision score:  0.8829113924050633
Recall score:  0.93
roc-auc 0.8998591549295776
              precision    recall  f1-score   support

           0       0.92      0.87      0.89       284
           1       0.88      0.93      0.91       300

    accuracy                           0.90       584
   macro avg       0.90      0.90      0.90       584
weighted avg       0.90      0.90      0.90       584

