In [6]:
import optuna
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from catboost import CatBoostClassifier, Pool

In [7]:
files = [
    "processed.cleveland.data",
    "processed.hungarian.data",
    "processed.switzerland.data",
    "processed.va.data"
]

# Define column names (as per the 14 attributes mentioned earlier)
column_names = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target"
]

# Load and combine all processed files
combined_df = pd.concat(
    [pd.read_csv(file, header=None, names=column_names, na_values="?") for file in files],
    axis=0
)

# Save combined dataset to a new CSV
combined_df.to_csv("combined_heart_disease_data.csv", index=False)

# Preview the combined dataset
print(f"Combined dataset shape: {combined_df.shape}")
print(combined_df.head())


Combined dataset shape: (920, 14)
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1  67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2  67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3  37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4  41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   

   slope   ca  thal  target  
0    3.0  0.0   6.0       0  
1    2.0  3.0   3.0       2  
2    2.0  2.0   7.0       1  
3    3.0  0.0   3.0       0  
4    1.0  0.0   3.0       0  


In [8]:
combined_df['target'] = combined_df['target'].astype(int)

# Identify categorical columns (which CatBoost can handle directly)
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Fill missing values and convert to int properly
for col in categorical_cols:
    combined_df[col] = combined_df[col].fillna(combined_df[col].mode()[0]).apply(int)  # Ensures int type

# Split into features (X) and target (y)
X = combined_df.drop(columns=['target'])
y = combined_df['target']

# Train-Test-Validation Split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

print(f"Train Set: {X_train.shape}, Validation Set: {X_val.shape}, Test Set: {X_test.shape}")


Train Set: (644, 13), Validation Set: (138, 13), Test Set: (138, 13)


In [9]:
# Define the Optuna optimization function
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 200, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),  # Fix Optuna warning
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10),
        'border_count': trial.suggest_int('border_count', 32, 128),
        'loss_function': 'MultiClass',
    }

    # Train the model
    model = CatBoostClassifier(cat_features=categorical_cols, verbose=0, random_seed=42, **params)
    model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50)

    # Predict probabilities for validation set
    y_val_proba = model.predict_proba(X_val)

    # Compute multi-class AUC-ROC
    roc_auc = roc_auc_score(y_val, y_val_proba, multi_class='ovr')

    return roc_auc

# Run Optuna optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1000)

# Get the best hyperparameters
print("Best Hyperparameters:", study.best_params)

# Train the best model using the best parameters
best_params = study.best_params
best_model = CatBoostClassifier(cat_features=categorical_cols, verbose=100, random_seed=42, **best_params)
best_model.fit(X_train, y_train)

# Make Predictions (Probability Scores for Multi-Class)
y_pred_proba = best_model.predict_proba(X_test)

# Convert Probability Scores to Class Predictions
y_pred = y_pred_proba.argmax(axis=1)  # Pick the class with the highest probability


[I 2025-02-09 14:26:33,315] A new study created in memory with name: no-name-30cd1471-123a-4159-9db4-68548166742a
[I 2025-02-09 14:26:34,709] Trial 0 finished with value: 0.8478410321212729 and parameters: {'iterations': 964, 'depth': 5, 'learning_rate': 0.026231047003040926, 'l2_leaf_reg': 9, 'border_count': 71}. Best is trial 0 with value: 0.8478410321212729.
[I 2025-02-09 14:26:35,510] Trial 1 finished with value: 0.8262419287226261 and parameters: {'iterations': 665, 'depth': 8, 'learning_rate': 0.1537294085998941, 'l2_leaf_reg': 3, 'border_count': 96}. Best is trial 0 with value: 0.8478410321212729.
[I 2025-02-09 14:26:37,275] Trial 2 finished with value: 0.8473189350346783 and parameters: {'iterations': 571, 'depth': 6, 'learning_rate': 0.05180262397754698, 'l2_leaf_reg': 8, 'border_count': 74}. Best is trial 0 with value: 0.8478410321212729.
[I 2025-02-09 14:26:37,750] Trial 3 finished with value: 0.8356884414605392 and parameters: {'iterations': 514, 'depth': 5, 'learning_rate'

Best Hyperparameters: {'iterations': 424, 'depth': 4, 'learning_rate': 0.16347864643695342, 'l2_leaf_reg': 9, 'border_count': 92}
0:	learn: 1.5159647	total: 2.74ms	remaining: 1.16s
100:	learn: 0.8191053	total: 187ms	remaining: 598ms
200:	learn: 0.6964134	total: 367ms	remaining: 407ms
300:	learn: 0.6101472	total: 555ms	remaining: 227ms
400:	learn: 0.5442380	total: 740ms	remaining: 42.4ms
423:	learn: 0.5314654	total: 780ms	remaining: 0us


In [10]:
# Evaluate Model Performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')

# Print Results
print(f"\nModel Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall (Sensitivity): {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Multi-Class ROC-AUC Score: {roc_auc:.4f}")



Model Evaluation Metrics:
Accuracy: 0.5942
Precision: 0.5602
Recall (Sensitivity): 0.5942
F1-Score: 0.5718
Multi-Class ROC-AUC Score: 0.7860
