In [1]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from catboost import CatBoostClassifier, Pool

In [2]:
files = [
    "processed.cleveland.data",
    "processed.hungarian.data",
    "processed.switzerland.data",
    "processed.va.data"
]

# Define column names (as per the 14 attributes mentioned earlier)
column_names = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target"
]

# Load and combine all processed files
combined_df = pd.concat(
    [pd.read_csv(file, header=None, names=column_names, na_values="?") for file in files],
    axis=0
)

# Save combined dataset to a new CSV
combined_df.to_csv("combined_heart_disease_data.csv", index=False)

# Preview the combined dataset
print(f"Combined dataset shape: {combined_df.shape}")
print(combined_df.head())


Combined dataset shape: (920, 14)
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1  67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2  67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3  37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4  41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   

   slope   ca  thal  target  
0    3.0  0.0   6.0       0  
1    2.0  3.0   3.0       2  
2    2.0  2.0   7.0       1  
3    3.0  0.0   3.0       0  
4    1.0  0.0   3.0       0  


In [3]:
combined_df['target'] = combined_df['target'].astype(int)

# Identify categorical columns (which CatBoost can handle directly)
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Fill missing values and convert to int properly
for col in categorical_cols:
    combined_df[col] = combined_df[col].fillna(combined_df[col].mode()[0]).apply(int)  # Ensures int type

# Split into features (X) and target (y)
X = combined_df.drop(columns=['target'])
y = combined_df['target']

# Train-Test-Validation Split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

print(f"Train Set: {X_train.shape}, Validation Set: {X_val.shape}, Test Set: {X_test.shape}")


Train Set: (644, 13), Validation Set: (138, 13), Test Set: (138, 13)


In [4]:
param_grid = {
    'iterations': [500, 1000, 1500],  # Different numbers of boosting rounds
    'depth': [4, 6, 8, 10],  # Tree depth
    'learning_rate': [0.01, 0.05, 0.1],  # Learning rates to test
    'l2_leaf_reg': [1, 5, 10],  # L2 regularization strength
    'border_count': [32, 64, 128],  # Number of splits for numerical features
}

# Initialize CatBoostClassifier (without verbose to reduce log clutter)
catboost_model = CatBoostClassifier(cat_features=categorical_cols, loss_function="MultiClass", verbose=0, random_seed=42)

# Use GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=catboost_model, param_grid=param_grid, cv=5, scoring='roc_auc_ovr', n_jobs=-1, verbose=3)

# Fit GridSearchCV on training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters from GridSearchCV
best_params = grid_search.best_params_
print("🔹 Best Hyperparameters from GridSearchCV:", best_params)

# Train the best model using the optimal parameters
best_model = CatBoostClassifier(cat_features=categorical_cols, loss_function="MultiClass", verbose=100, random_seed=42, **best_params)
best_model.fit(X_train, y_train)

# Make Predictions (Probability Scores for Multi-Class)
y_pred_proba = best_model.predict_proba(X_test)

# Convert Probability Scores to Class Predictions
y_pred = y_pred_proba.argmax(axis=1)  # Pick the class with the highest probability

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
[CV 1/5] END border_count=32, depth=4, iterations=500, l2_leaf_reg=1, learning_rate=0.01;, score=0.780 total time=   4.1s
[CV 4/5] END border_count=32, depth=4, iterations=500, l2_leaf_reg=1, learning_rate=0.01;, score=0.795 total time=   4.1s
[CV 5/5] END border_count=32, depth=4, iterations=500, l2_leaf_reg=1, learning_rate=0.01;, score=0.783 total time=   4.2s
[CV 1/5] END border_count=32, depth=4, iterations=500, l2_leaf_reg=1, learning_rate=0.05;, score=0.789 total time=   4.2s
[CV 2/5] END border_count=32, depth=4, iterations=500, l2_leaf_reg=1, learning_rate=0.01;, score=0.790 total time=   4.3s
[CV 3/5] END border_count=32, depth=4, iterations=500, l2_leaf_reg=1, learning_rate=0.01;, score=0.782 total time=   4.4s
[CV 2/5] END border_count=32, depth=4, iterations=500, l2_leaf_reg=1, learning_rate=0.05;, score=0.811 total time=   4.5s
[CV 4/5] END border_count=32, depth=4, iterations=500, l2_leaf_reg=1, learning_rat

In [5]:
# Evaluate Model Performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')

# Print Results
print(f"\n🔹 Model Evaluation Metrics:")
print(f"✅ Accuracy: {accuracy:.4f}")
print(f"✅ Precision: {precision:.4f}")
print(f"✅ Recall (Sensitivity): {recall:.4f}")
print(f"✅ F1-Score: {f1:.4f}")
print(f"✅ Multi-Class ROC-AUC Score: {roc_auc:.4f}")


🔹 Model Evaluation Metrics:
✅ Accuracy: 0.5580
✅ Precision: 0.4666
✅ Recall (Sensitivity): 0.5580
✅ F1-Score: 0.5061
✅ Multi-Class ROC-AUC Score: 0.8040


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
