In [2]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, auc, precision_score, recall_score, f1_score, confusion_matrix
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [32]:
df = pd.read_csv("heart_3datasets_thal_ca_MODIFIED_2PCA.csv")
df.loc[df['target'] == 2, 'target'] = 1

In [33]:
X = df.drop('target', axis=1)
y = df['target']

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ThalCA_PC1  2919 non-null   float64
 1   ThalCA_PC2  2919 non-null   float64
 2   age         2919 non-null   float64
 3   chol        2919 non-null   float64
 4   cp          2919 non-null   int64  
 5   exang       2919 non-null   int64  
 6   fbs         2919 non-null   int64  
 7   oldpeak     2919 non-null   float64
 8   restecg     2919 non-null   int64  
 9   sex         2919 non-null   int64  
 10  slope       2919 non-null   int64  
 11  thalach     2919 non-null   float64
 12  trestbps    2919 non-null   float64
 13  target      2919 non-null   int64  
dtypes: float64(7), int64(7)
memory usage: 319.4 KB


In [34]:
X_trainval, X_test, y_trainval, y_test = train_test_split(X,y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_trainval,y_trainval, test_size=0.2, random_state=42, stratify=y_trainval)

#phase 3 comment cat_features
#use this for phase 4
cat_feats = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

#use this for phase 1,2
# cat_feats = ['sex', 'Chest pain type', 'fasting blood pressure', 'resting ecg', 'exercise angina', 'ST slope']

In [35]:
base_model = RandomForestClassifier(random_state=42)
from imblearn.over_sampling import SMOTENC, SMOTE
# smote_enc = SMOTENC(random_state=42, categorical_features=cat_feats)
smote_enc = SMOTE(random_state=42)

X_trainval, y_trainval = smote_enc.fit_resample(X_trainval, y_trainval)


In [36]:
base_model.fit(X_trainval, y_trainval)
y_pred = base_model.predict(X_test)
print('F1 score: ', f1_score(y_pred, y_test))
print('Accuracy score: ', accuracy_score(y_pred, y_test))
print('Precision score: ', precision_score(y_pred, y_test))
print('Recall score: ', recall_score(y_pred, y_test))
print('roc-auc', roc_auc_score(y_pred, y_test))
print(classification_report(y_pred, y_test))

F1 score:  0.8881789137380192
Accuracy score:  0.8801369863013698
Precision score:  0.879746835443038
Recall score:  0.896774193548387
roc-auc 0.8790440310807629
              precision    recall  f1-score   support

           0       0.88      0.86      0.87       274
           1       0.88      0.90      0.89       310

    accuracy                           0.88       584
   macro avg       0.88      0.88      0.88       584
weighted avg       0.88      0.88      0.88       584



In [37]:
import optuna
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris

# 1. Load your data
# 2. Define the objective function
def objective(trial):
    # --- Define the Hyperparameter Search Space ---
    # We suggest values for Optuna to try
    n_estimators = trial.suggest_int('n_estimators', 50, 1000)
    max_depth = trial.suggest_int('max_depth', 2, 32, log=True)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    
    # --- Create the Model ---
    # Pass the suggested hyperparameters to the model
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42
    )
    
    # --- Evaluate the Model ---
    # We use 3-fold cross-validation to get a robust score
    # You can also use a single validation split for faster tuning
    score = cross_val_score(model, X_trainval, y_trainval, n_jobs=-1, cv=5)
    accuracy = score.mean()
    
    # Optuna will try to maximize this return value
    return accuracy

# 3. Create a study object and run the optimization
# We want to maximize accuracy, so we set direction='maximize'
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100) # Run 100 trials

# --- 4. Get the best results ---
print("Best trial:")
trial = study.best_trial

print(f"  Value (Accuracy): {trial.value:.4f}")
print("  Best Hyperparameters: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# You can now train your final model with these parameters
best_params = study.best_params
final_model = RandomForestClassifier(**best_params, random_state=42)
final_model.fit(X_trainval, y_trainval)

[I 2025-09-22 01:41:41,287] A new study created in memory with name: no-name-ac77193d-e9a6-4e6d-bb34-011771d69e7e
[I 2025-09-22 01:41:43,047] Trial 0 finished with value: 0.729988213107025 and parameters: {'n_estimators': 800, 'max_depth': 2, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.729988213107025.
[I 2025-09-22 01:41:44,256] Trial 1 finished with value: 0.7886712242652837 and parameters: {'n_estimators': 522, 'max_depth': 5, 'min_samples_split': 10, 'min_samples_leaf': 11, 'max_features': 'log2'}. Best is trial 1 with value: 0.7886712242652837.
[I 2025-09-22 01:41:45,299] Trial 2 finished with value: 0.798985541411284 and parameters: {'n_estimators': 290, 'max_depth': 5, 'min_samples_split': 13, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 2 with value: 0.798985541411284.
[I 2025-09-22 01:41:46,524] Trial 3 finished with value: 0.764088480276599 and parameters: {'n_estimators': 323, 'max_depth': 3, 'min_sam

Best trial:
  Value (Accuracy): 0.8775
  Best Hyperparameters: 
    n_estimators: 778
    max_depth: 13
    min_samples_split: 2
    min_samples_leaf: 2
    max_features: sqrt


In [38]:
y_pred = final_model.predict(X_test)
print('F1 score: ', f1_score(y_pred, y_test))
print('Accuracy score: ', accuracy_score(y_pred, y_test))
print('Precision score: ', precision_score(y_pred, y_test))
print('Recall score: ', recall_score(y_pred, y_test))
print('roc-auc', roc_auc_score(y_pred, y_test))
print(classification_report(y_pred, y_test))

F1 score:  0.886435331230284
Accuracy score:  0.8767123287671232
Precision score:  0.8892405063291139
Recall score:  0.8836477987421384
roc-auc 0.8760344256868587
              precision    recall  f1-score   support

           0       0.86      0.87      0.87       266
           1       0.89      0.88      0.89       318

    accuracy                           0.88       584
   macro avg       0.88      0.88      0.88       584
weighted avg       0.88      0.88      0.88       584



In [39]:
study.best_params

{'n_estimators': 778,
 'max_depth': 13,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'sqrt'}