In [15]:
import optuna
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from catboost import CatBoostClassifier, Pool

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
files = [
    "processed.cleveland.data",
    "processed.hungarian.data",
    "processed.switzerland.data",
    "processed.va.data"
]

# Define column names (as per the 14 attributes mentioned earlier)
column_names = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target"
]

# Load and combine all processed files
combined_df = pd.concat(
    [pd.read_csv(file, header=None, names=column_names, na_values="?") for file in files],
    axis=0
)

# Save combined dataset to a new CSV
combined_df.to_csv("combined_heart_disease_data.csv", index=False)

# Preview the combined dataset
print(f"Combined dataset shape: {combined_df.shape}")
print(combined_df.head())


Combined dataset shape: (920, 14)
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1  67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2  67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3  37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4  41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   

   slope   ca  thal  target  
0    3.0  0.0   6.0       0  
1    2.0  3.0   3.0       2  
2    2.0  2.0   7.0       1  
3    3.0  0.0   3.0       0  
4    1.0  0.0   3.0       0  


In [17]:
combined_df['target'] = combined_df['target'].astype(int)

# Identify categorical columns (which CatBoost can handle directly)
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Fill missing values and convert to int properly
for col in categorical_cols:
    combined_df.loc[:, col] = combined_df[col].fillna(combined_df[col].mode()[0]).astype(int)

# Split into features (X) and target (y)
X = combined_df.drop(columns=['target'])
y = combined_df['target']

# Train-Test-Validation Split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

print(f"Train Set: {X_train.shape}, Validation Set: {X_val.shape}, Test Set: {X_test.shape}")


Train Set: (644, 13), Validation Set: (138, 13), Test Set: (138, 13)


In [18]:
# Define the Optuna optimization function
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 200, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10),
        'border_count': trial.suggest_int('border_count', 32, 128),
        'loss_function': 'MultiClass',  # Specify multi-class classification
    }

    # Create and train the model
    model = CatBoostClassifier(cat_features=categorical_cols, verbose=0, random_seed=42, **params)
    model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50)

    # Predict probabilities for validation set
    y_val_proba = model.predict_proba(X_val)

    # Compute multi-class AUC-ROC
    roc_auc = roc_auc_score(y_val, y_val_proba, multi_class='ovr')

    return roc_auc  # Optimize for the highest AUC-ROC score

# Run Optuna optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Get the best hyperparameters
print("Best Hyperparameters:", study.best_params)

# Train the best model using the best parameters
best_params = study.best_params
best_model = CatBoostClassifier(cat_features=categorical_cols, verbose=100, random_seed=42, **best_params)
best_model.fit(X_train, y_train)

# Make Predictions (Probability Scores for Multi-Class)
y_pred_proba = best_model.predict_proba(X_test)

# Convert Probability Scores to Class Predictions
y_pred = y_pred_proba.argmax(axis=1)  # Pick the class with the highest probability


[I 2025-02-09 14:13:50,604] A new study created in memory with name: no-name-0d9b943e-bc4b-409e-9294-3de3a5ed2f1d
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
[W 2025-02-09 14:13:50,607] Trial 0 failed with parameters: {'iterations': 340, 'depth': 8, 'learning_rate': 0.020489724231611838, 'l2_leaf_reg': 6, 'border_count': 59} because of the following error: CatBoostError('Invalid type for cat_feature[non-default value idx=0,feature_idx=1]=1.0 : cat_features must be integer or string, real number values and NaN values should be converted to string.').
Traceback (most recent call last):
  File "_catboost.pyx", line 2613, in _catboost.get_cat_factor_bytes_representation
  File "_catboost.pyx", line 2128, in _catboost.get_id_object_bytes_string_representation
_catboost.CatBoostError: bad object for id: 1.0

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions

CatBoostError: Invalid type for cat_feature[non-default value idx=0,feature_idx=1]=1.0 : cat_features must be integer or string, real number values and NaN values should be converted to string.