# SVC Model for OASIS-2 Label Classification

# In [None]:
#-----------------------------------------------------------------------------------------------------------------
# Import Libraries
#-----------------------------------------------------------------------------------------------------------------
import os
import random
import logging
import pickle
import sys
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import GroupShuffleSplit, GroupKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, precision_recall_curve,
    auc, confusion_matrix
)
from sklearn.inspection import permutation_importance
from imblearn.combine import SMOTETomek
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from IPython.display import clear_output

#-----------------------------------------------------------------------------------------------------------------
# Configuration object for hyperparameters and file paths
#-----------------------------------------------------------------------------------------------------------------
class Config:
    """Container for the file locations and search hyperparameters used below.

    All values are plain class attributes; the output paths are derived from
    ``BASE_PATH`` when the class body is executed.
    """

    # File paths
    # Placeholder directory: replace with the folder holding the processed data.
    BASE_PATH = ("PATH_TO_PREPROCESSED_OASIS2_DATASET")
    PROCESSED_DATA_PATH = os.path.join(BASE_PATH, 'processed_dataset.csv')
    MODEL_SAVE_PATH = os.path.join(BASE_PATH, 'best_svm_model.pkl')
    SEARCH_RESULTS_PATH = os.path.join(BASE_PATH, 'hyperparameter_search_results_svm.csv')
    FEATURE_IMPORTANCES_PATH = os.path.join(BASE_PATH, 'feature_importances_svm.csv')
    FEATURE_IMPORTANCES_PLOT_PATH = os.path.join(BASE_PATH, 'feature_importances_svm_plot.png')
    TRAIN_PREDICTIONS_PATH = os.path.join(BASE_PATH, 'train_dataset_with_predictions_svm.csv')
    TEST_PREDICTIONS_PATH = os.path.join(BASE_PATH, 'test_dataset_with_predictions_svm.csv')
    METRICS_SAVE_PATH = os.path.join(BASE_PATH, 'model_evaluation_metrics_svm.csv')
    CV_RESULTS_PATH = os.path.join(BASE_PATH, 'cross_validation_results_svm.csv')
    LABEL_ENCODER_PATH = os.path.join(BASE_PATH, 'label_encoder.pkl')
    FEATURE_NAMES_PATH = os.path.join(BASE_PATH, 'feature_names.csv')

    # Hyperparameters
    TEST_SIZE = 0.3
    RANDOM_STATE = 42
    C_RANGE = (0.05, 10)
    KERNEL_OPTIONS = ['linear', 'rbf', 'poly', 'sigmoid']
    DEGREE_RANGE = (2, 7)
    GAMMA_OPTIONS = ['scale', 'auto']
    COEF0_RANGE = (0, 10)
    # (min, max) ROC AUC bounds. These were left as empty placeholders, which
    # is a SyntaxError; the permissive defaults below keep the module
    # importable. The hyperparameter search reads element [0] as the minimum
    # acceptable AUC.
    # TODO: set project-specific thresholds before relying on "criteria met".
    ACCEPTABLE_TRAIN_AUC_RANGE = (0.0, 1.0)
    ACCEPTABLE_TEST_AUC_RANGE = (0.0, 1.0)
    MAX_TRIALS = 200
    MIN_RECALL_TEST = 0.90
    CV_FOLDS = 5

#-----------------------------------------------------------------------------------------------------------------
# Load, Preprocess & Split the Data
#-----------------------------------------------------------------------------------------------------------------
def load_data(file_path):
    """Read the processed dataset CSV at ``file_path`` into a DataFrame.

    The path is logged before reading and the resulting shape afterwards.
    Any exception raised while reading is logged and re-raised unchanged.
    """
    logger.info(f"Loading data from {file_path}")
    try:
        frame = pd.read_csv(file_path)
        logger.info(f"Data loaded successfully with shape {frame.shape}")
    except Exception as err:
        logger.error(f"Failed to load data: {err}")
        raise
    return frame

def preprocess_data(df):
    """Split ``df`` into the feature frame, the target and the group ids.

    Returns:
        A ``(X, y, groups)`` tuple where ``X`` is ``df`` without the columns
        listed below, ``y`` is the ``'Target'`` column and ``groups`` is the
        ``'Subject ID'`` column.
    """
    logger.info("Preprocessing data")

    # Columns removed from the feature matrix (target, identifiers and the
    # visit/CDR columns).
    excluded_columns = [
        'Target', 'Subject ID', 'Visit', 'Time_Since_Last_Visit',
        'CDR', 'CDR_Change', 'Cumulative_CDR_Change',
    ]
    features = df.drop(columns=excluded_columns)

    target = df['Target']
    subject_ids = df['Subject ID']
    return features, target, subject_ids

def split_data(X, y, groups, test_size, random_state):
    """Make one group-aware train/test split with ``GroupShuffleSplit``.

    Returns:
        ``(X_train, X_test, y_train, y_test, groups_train, groups_test,
        train_idx, test_idx)`` where the index arrays are the positional row
        indices produced by the splitter.
    """
    logger.info("Splitting data into training and testing sets")
    splitter = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)

    # n_splits=1, so the generator yields exactly one (train, test) pair.
    train_idx, test_idx = next(splitter.split(X, y, groups=groups))

    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]
    groups_train = groups.iloc[train_idx]
    groups_test = groups.iloc[test_idx]

    logger.info(f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")
    return X_train, X_test, y_train, y_test, groups_train, groups_test, train_idx, test_idx

#-----------------------------------------------------------------------------------------------------------------
# Resampling to Handle Class Imbalance
#-----------------------------------------------------------------------------------------------------------------
def compute_class_weights(y):
    """Return a ``{class: weight}`` dict of 'balanced' class weights for ``y``."""
    logger.info("Computing class weights")
    labels = np.unique(y)
    weights = compute_class_weight('balanced', classes=labels, y=y)
    class_weights_dict = {label: weight for label, weight in zip(labels, weights)}
    logger.info(f"Class weights: {class_weights_dict}")
    return class_weights_dict

def apply_smote_tomek(X_train, y_train, random_state):
    """Resample the training data with ``SMOTETomek`` and return ``(X, y)``."""
    logger.info("Applying SMOTE with Tomek Links to the training data")
    resampler = SMOTETomek(random_state=random_state)
    resampled = resampler.fit_resample(X_train, y_train)
    X_resampled, y_resampled = resampled
    logger.info(f"Resampled training data shape: {X_resampled.shape}")
    return X_resampled, y_resampled

#-----------------------------------------------------------------------------------------------------------------
# Random Search to Find Hyperparameters
#-----------------------------------------------------------------------------------------------------------------
def random_hyperparameter_search(
    X_train, y_train, X_test, y_test, class_weights, config
):
    """Randomly sample SVC hyperparameters and keep the best fitted pipeline.

    Each trial builds a ``ColumnTransformer`` + ``SVC(probability=True)``
    pipeline, fits it on the training data and scores it on both the training
    and the test data (one-vs-one ROC AUC and weighted recall).

    Selection rule: a trial that satisfies the acceptance criteria (train AUC
    and test AUC at or above the lower bounds in ``config`` and test recall at
    or above ``config.MIN_RECALL_TEST``) always beats one that does not; among
    trials with the same status the higher test ROC AUC wins.

    Args:
        X_train: Training features (DataFrame; column names are used to decide
            which columns are scaled).
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.
        class_weights: Value passed to ``SVC(class_weight=...)``.
        config: Object exposing ``MAX_TRIALS``, ``RANDOM_STATE``, the sampling
            ranges/options, the acceptable AUC ranges and ``MIN_RECALL_TEST``.

    Returns:
        ``(best_model, best_params, results_df)``. ``best_model`` and
        ``best_params`` are ``None`` when no trial produced a selectable
        model; ``results_df`` has one row per successfully fitted trial.

    NOTE(review): the test set is used for model selection here, so the test
    metrics of the selected model are optimistically biased.
    """
    logger.info("Starting random hyperparameter search with SVM in Pipeline")

    # Dedicated seeded generator: the draws are reproducible for a given
    # RANDOM_STATE and do not depend on the state of the global `random` module.
    rng = random.Random(config.RANDOM_STATE)

    best_model = None
    best_params = None
    best_auc = 0
    best_meets_criteria = False
    results = []
    train_aucs = []
    test_aucs = []
    train_recalls = []
    test_recalls = []

    # Define columns to scale and pass through. Only passthrough columns that
    # actually exist are selected, so a missing one does not make every trial
    # fail inside the ColumnTransformer.
    passthrough_candidates = ['M/F_M', 'EDUC', 'SES']
    columns_to_scale = [col for col in X_train.columns if col not in passthrough_candidates]
    columns_to_passthrough = [col for col in passthrough_candidates if col in X_train.columns]

    for trial in range(1, config.MAX_TRIALS + 1):
        # Randomly sample hyperparameters
        C = rng.uniform(config.C_RANGE[0], config.C_RANGE[1])
        kernel = rng.choice(config.KERNEL_OPTIONS)

        if kernel == 'poly':
            degree = rng.randint(config.DEGREE_RANGE[0], config.DEGREE_RANGE[1])
        else:
            degree = 3

        if kernel in ['rbf', 'poly', 'sigmoid']:
            # Either one of the named options or a freshly drawn numeric gamma
            gamma = rng.choice(config.GAMMA_OPTIONS + [rng.uniform(0.001, 1.0)])
        else:
            gamma = 'scale'

        if kernel in ['poly', 'sigmoid']:
            coef0 = rng.uniform(config.COEF0_RANGE[0], config.COEF0_RANGE[1])
        else:
            coef0 = 0

        # Define the pipeline with SVC (probability=True)
        model = Pipeline([
            ('preprocessor', ColumnTransformer(
                transformers=[
                    ('scale', StandardScaler(), columns_to_scale),
                    ('passthrough', 'passthrough', columns_to_passthrough)
                ]
            )),
            ('svc', SVC(
                C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0,
                class_weight=class_weights, random_state=config.RANDOM_STATE, probability=True
            ))
        ])

        # Train the model
        try:
            model.fit(X_train, y_train)
        except Exception as e:
            logger.error(f"Failed to train model on trial #{trial}: {e}")
            continue

        # Evaluate on training data
        y_train_pred = model.predict(X_train)
        y_train_pred_proba = model.predict_proba(X_train)
        train_auc = roc_auc_score(y_train, y_train_pred_proba, multi_class='ovo')
        train_recall = recall_score(y_train, y_train_pred, average='weighted', zero_division=0)

        # Evaluate on test data
        y_test_pred = model.predict(X_test)
        y_test_pred_proba = model.predict_proba(X_test)
        test_auc = roc_auc_score(y_test, y_test_pred_proba, multi_class='ovo')
        test_recall = recall_score(y_test, y_test_pred, average='weighted', zero_division=0)

        # Clear previous output and display the ranges seen in EARLIER trials.
        # The current trial is appended afterwards so the "1-{trial-1}" label
        # is accurate; the emptiness check covers earlier trials that failed.
        # NOTE: the printed "Minority Recall" values are weighted-average recall.
        clear_output(wait=True)
        print(f"Trial #{trial}")
        if train_aucs:
            print(f"Train ROC AUC Range (Trials 1-{trial-1}): "
                  f"[{min(train_aucs):.4f}, {max(train_aucs):.4f}]")
            print(f"Test ROC AUC Range (Trials 1-{trial-1}): "
                  f"[{min(test_aucs):.4f}, {max(test_aucs):.4f}]")
            print(f"Train Minority Recall Range (Trials 1-{trial-1}): "
                  f"[{min(train_recalls):.4f}, {max(train_recalls):.4f}]")
            print(f"Test Minority Recall Range (Trials 1-{trial-1}): "
                  f"[{min(test_recalls):.4f}, {max(test_recalls):.4f}]")

        # Store metrics
        train_aucs.append(train_auc)
        test_aucs.append(test_auc)
        train_recalls.append(train_recall)
        test_recalls.append(test_recall)

        # Check the acceptance criteria for THIS trial
        meets_criteria = (
            train_auc >= config.ACCEPTABLE_TRAIN_AUC_RANGE[0]
            and test_auc >= config.ACCEPTABLE_TEST_AUC_RANGE[0]
            and test_recall >= config.MIN_RECALL_TEST
        )
        if meets_criteria:
            print(f"[✓] Criteria met — Train ROC AUC: {train_auc:.4f}, "
                  f"Test ROC AUC: {test_auc:.4f}, "
                  f"Train Minority Recall: {train_recall:.4f}, "
                  f"Test Minority Recall: {test_recall:.4f}")

        # A criteria-meeting trial outranks a non-meeting one; otherwise the
        # higher test AUC wins. The flag stored in best_params therefore
        # always describes the model that is actually returned.
        if meets_criteria != best_meets_criteria:
            is_better = meets_criteria
        else:
            is_better = test_auc > best_auc

        if is_better:
            best_auc = test_auc
            best_model = model
            best_meets_criteria = meets_criteria
            best_params = {
                'C': C,
                'kernel': kernel,
                'degree': degree,
                'gamma': gamma,
                'coef0': coef0,
                'train_auc': train_auc,
                'test_auc': test_auc,
                'train_recall': train_recall,
                'test_recall': test_recall,
                'criteria_met': meets_criteria
            }

        # Store the results
        results.append({
            'Trial': trial,
            'C': C,
            'kernel': kernel,
            'degree': degree,
            'gamma': gamma,
            'coef0': coef0,
            'Train ROC AUC': train_auc,
            'Test ROC AUC': test_auc,
            'Train Recall': train_recall,
            'Test Recall': test_recall
        })

        if trial % 100 == 0:
            # max(test_aucs) rather than best_auc: the selected model may have
            # a lower AUC than a trial that did not meet the criteria.
            logger.info(f"Trial {trial}: Highest Test ROC AUC so far = {max(test_aucs):.4f}")

    # Save results
    results_df = pd.DataFrame(results)
    logger.info("Random hyperparameter search completed.")
    if best_meets_criteria:
        print(f"[✓] Model meeting criteria found (Test ROC AUC = {best_auc:.4f}).")
    else:
        print(f"[✗] No model met the specified criteria, but the best model (Test ROC AUC = {best_auc:.4f}) was selected.")
    return best_model, best_params, results_df

#-----------------------------------------------------------------------------------------------------------------
# Model Evaluation & Feature Importance
#-----------------------------------------------------------------------------------------------------------------
def _split_metrics(y_true, y_pred, y_pred_proba):
    """Build the metric dictionary for one data split (train or test)."""
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_true, y_pred, average='weighted', zero_division=0),
        'f1_score': f1_score(y_true, y_pred, average='weighted', zero_division=0),
        'roc_auc': roc_auc_score(y_true, y_pred_proba, multi_class='ovo'),
        'classification_report': classification_report(y_true, y_pred, output_dict=True, zero_division=0),
        'confusion_matrix': confusion_matrix(y_true, y_pred)
    }


def evaluate_model(best_model, X_train_eval, y_train_eval, X_test, y_test, label_encoder_group, config):
    """Evaluate the best model and return evaluation metrics.

    Computes accuracy, weighted precision/recall/F1, one-vs-one ROC AUC, the
    classification report, the confusion matrix and the mean per-class PR AUC
    for both splits. It then computes permutation importances on the test
    split and writes them to ``config.FEATURE_IMPORTANCES_PATH`` (CSV) and
    ``config.FEATURE_IMPORTANCES_PLOT_PATH`` (bar plot of the top 20).

    Args:
        best_model: Fitted estimator exposing ``predict`` / ``predict_proba``.
        X_train_eval, y_train_eval: Training features and labels.
        X_test, y_test: Test features and labels.
        label_encoder_group: Not used by this function; kept so existing
            callers keep working.
        config: Object exposing the two output paths and ``RANDOM_STATE``.

    Returns:
        ``(train_metrics, test_metrics, y_train_pred, y_train_pred_proba,
        y_test_pred, y_test_pred_proba)``.

    Raises:
        Exception: re-raised after logging when saving the CSV or plot fails.
    """
    logger.info("Evaluating the best SVM model")

    # Predictions on training data
    y_train_pred = best_model.predict(X_train_eval)
    y_train_pred_proba = best_model.predict_proba(X_train_eval)

    # Predictions on test data
    y_test_pred = best_model.predict(X_test)
    y_test_pred_proba = best_model.predict_proba(X_test)

    # Evaluation metrics for both splits
    train_metrics = _split_metrics(y_train_eval, y_train_pred, y_train_pred_proba)
    test_metrics = _split_metrics(y_test, y_test_pred, y_test_pred_proba)

    # Calculate Precision-Recall AUC
    train_pr_auc = {}
    test_pr_auc = {}
    for i in np.unique(y_train_eval):
        # The label value doubles as the probability column index.
        # NOTE(review): this assumes labels are encoded 0..k-1 in the same
        # order as the model's classes — confirm against the caller.
        train_precision_i, train_recall_i, _ = precision_recall_curve(y_train_eval == i, y_train_pred_proba[:, i])
        train_pr_auc[i] = auc(train_recall_i, train_precision_i)
        test_precision_i, test_recall_i, _ = precision_recall_curve(y_test == i, y_test_pred_proba[:, i])
        test_pr_auc[i] = auc(test_recall_i, test_precision_i)

    # Average PR AUC
    train_metrics['pr_auc'] = np.mean(list(train_pr_auc.values()))
    test_metrics['pr_auc'] = np.mean(list(test_pr_auc.values()))

    # Log evaluation metrics
    logger.info(f"Training Accuracy: {train_metrics['accuracy']:.4f}")
    logger.info(f"Testing Accuracy: {test_metrics['accuracy']:.4f}")
    logger.info("Train Classification Report:")
    logger.info(train_metrics['classification_report'])
    logger.info("Test Classification Report:")
    logger.info(test_metrics['classification_report'])
    logger.info(f"Train ROC AUC: {train_metrics['roc_auc']:.4f}")
    logger.info(f"Test ROC AUC: {test_metrics['roc_auc']:.4f}")
    logger.info(f"Train PR AUC: {train_metrics['pr_auc']:.4f}")
    logger.info(f"Test PR AUC: {test_metrics['pr_auc']:.4f}")

    # Permutation Importance Calculation (seeded from the shared config value
    # instead of a second hard-coded constant)
    logger.info("Calculating Permutation Importance for SVC model")
    perm_importance = permutation_importance(
        best_model, X_test, y_test, scoring='accuracy', n_repeats=10,
        random_state=config.RANDOM_STATE
    )

    feature_importances = pd.DataFrame({
        'Feature': X_test.columns,
        'Importance': perm_importance.importances_mean
    }).sort_values(by='Importance', ascending=False)

    # Save Permutation Importances to CSV
    try:
        feature_importances.to_csv(config.FEATURE_IMPORTANCES_PATH, index=False)
        logger.info(f"Permutation importances saved to {config.FEATURE_IMPORTANCES_PATH}")
    except Exception as e:
        logger.error(f"Failed to save permutation importances: {e}")
        raise

    # Plot and save feature importances. The figure is closed in `finally`
    # so it is released even when plotting or saving fails.
    fig = plt.figure(figsize=(12, 8))
    try:
        sns.barplot(x='Importance', y='Feature', data=feature_importances.head(20))
        plt.title('Top 20 Permutation Feature Importances')
        plt.tight_layout()
        try:
            plt.savefig(config.FEATURE_IMPORTANCES_PLOT_PATH)
            logger.info(f"Permutation importances plot saved to {config.FEATURE_IMPORTANCES_PLOT_PATH}")
        except Exception as e:
            logger.error(f"Failed to save permutation importances plot: {e}")
            raise
    finally:
        plt.close(fig)

    return train_metrics, test_metrics, y_train_pred, y_train_pred_proba, y_test_pred, y_test_pred_proba

#-----------------------------------------------------------------------------------------------------------------
# Cross-Validation 
#-----------------------------------------------------------------------------------------------------------------
def cross_validate_model(best_model, X, y, groups, label_encoder_group, config, n_splits=5):
    """Perform group-aware cross-validation using the best model's settings.

    For every ``GroupKFold`` fold the training part is resampled with
    SMOTE + Tomek links, a fresh clone of ``best_model`` is given class
    weights computed on the resampled labels and fitted, and the validation
    part is scored. Per-fold scores are written to ``config.CV_RESULTS_PATH``.

    Args:
        best_model: Pipeline containing a step named ``'svc'``. It is cloned
            for every fold, so the caller's fitted model is left untouched.
        X: Feature DataFrame (indexed positionally via ``.iloc``).
        y: Labels; converted to a NumPy array so fold indices are positional.
        groups: Group ids passed to ``GroupKFold.split``.
        label_encoder_group: Not used by this function; kept so existing
            callers keep working.
        config: Object exposing ``RANDOM_STATE`` and ``CV_RESULTS_PATH``.
        n_splits: Number of folds.

    Returns:
        Dict mapping each metric name to its NaN-ignoring mean over the folds
        that were fitted successfully.

    Raises:
        Exception: re-raised after logging when the results CSV cannot be
            written.
    """
    # Local import keeps this block self-contained; scikit-learn is already a
    # dependency of this file.
    from sklearn.base import clone

    logger.info("Performing group-aware cross-validation with Pipeline")
    gkf = GroupKFold(n_splits=n_splits)
    y = np.asarray(y)
    cv_scores = {
        'accuracy': [],
        'precision': [],
        'recall': [],
        'f1_score': [],
        'roc_auc': [],
        'pr_auc': []
    }
    # Real fold numbers of the folds that produced scores (folds whose fit
    # fails are skipped and must not shift the numbering).
    completed_folds = []

    for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=groups), 1):
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y[train_idx], y[val_idx]

        # Apply SMOTE with Tomek Links
        X_train_resampled, y_train_resampled = apply_smote_tomek(X_train_fold, y_train_fold, config.RANDOM_STATE)

        # Fresh unfitted copy with class weights for this fold
        fold_model = clone(best_model)
        fold_model.set_params(svc__class_weight=compute_class_weights(y_train_resampled))

        # Fit the model
        try:
            fold_model.fit(X_train_resampled, y_train_resampled)
        except Exception as e:
            logger.error(f"Failed to train model in fold {fold}: {e}")
            continue

        # Evaluate on validation fold
        y_val_pred = fold_model.predict(X_val_fold)
        y_val_pred_proba = fold_model.predict_proba(X_val_fold)

        # predict_proba columns follow the fitted model's classes_, which are
        # the labels it was trained on — map label -> column from there.
        model_classes = fold_model.classes_
        column_of = {cls: col for col, cls in enumerate(model_classes)}

        # Compute metrics
        cv_scores['accuracy'].append(accuracy_score(y_val_fold, y_val_pred))
        cv_scores['precision'].append(precision_score(y_val_fold, y_val_pred, average='weighted', zero_division=0))
        cv_scores['recall'].append(recall_score(y_val_fold, y_val_pred, average='weighted', zero_division=0))
        cv_scores['f1_score'].append(f1_score(y_val_fold, y_val_pred, average='weighted', zero_division=0))

        # Compute ROC AUC, handling missing classes
        unique_classes = np.unique(y_val_fold)
        if len(unique_classes) < 2:
            logger.warning(f"Fold {fold} has only {len(unique_classes)} class(es). Skipping ROC AUC calculation.")
            cv_scores['roc_auc'].append(np.nan)
        else:
            try:
                if len(model_classes) == 2:
                    # Binary problem: roc_auc_score expects the 1-D score of
                    # the greater label.
                    fold_auc = roc_auc_score(y_val_fold, y_val_pred_proba[:, 1])
                else:
                    # Keep the full probability matrix (rows must sum to 1)
                    # and declare the column labels; classes absent from the
                    # fold are then handled by roc_auc_score itself.
                    fold_auc = roc_auc_score(
                        y_val_fold, y_val_pred_proba, multi_class='ovo', labels=model_classes
                    )
                cv_scores['roc_auc'].append(fold_auc)
            except Exception as e:
                logger.error(f"Failed to compute ROC AUC for fold {fold}: {e}")
                cv_scores['roc_auc'].append(np.nan)

        # PR AUC for each class present in the fold
        pr_auc_fold = []
        for cls in unique_classes:
            col = column_of.get(cls)
            if col is None:
                logger.warning(f"Fold {fold}: class {cls} was not seen during training. Skipping its PR AUC.")
                continue
            precision_i, recall_i, _ = precision_recall_curve(y_val_fold == cls, y_val_pred_proba[:, col])
            pr_auc_fold.append(auc(recall_i, precision_i))
        cv_scores['pr_auc'].append(np.mean(pr_auc_fold) if pr_auc_fold else np.nan)

        completed_folds.append(fold)
        logger.info(f"Fold {fold} - ROC AUC: {cv_scores['roc_auc'][-1]:.4f}, Recall: {cv_scores['recall'][-1]:.4f}")

    # Compute average metrics, ignoring NaN values (NaN when no fold completed)
    avg_metrics = {
        key: (np.nanmean(values) if values else np.nan)
        for key, values in cv_scores.items()
    }
    logger.info("Cross-validation results:")
    for metric, value in avg_metrics.items():
        logger.info(f"Average {metric}: {value:.4f}")

    # Save CV results
    cv_results_df = pd.DataFrame(cv_scores)
    cv_results_df['Fold'] = completed_folds
    try:
        cv_results_df.to_csv(config.CV_RESULTS_PATH, index=False)
        logger.info(f"Cross-validation results saved to {config.CV_RESULTS_PATH}")
    except Exception as e:
        logger.error(f"Failed to save cross-validation results: {e}")
        raise

    return avg_metrics

#-----------------------------------------------------------------------------------------------------------------
# Save Model & Results
#-----------------------------------------------------------------------------------------------------------------
def save_model(model, file_path):
    """Pickle ``model`` to ``file_path``; log and re-raise on failure."""
    logger.info(f"Saving model to {file_path}")
    try:
        with open(file_path, 'wb') as handle:
            pickle.dump(model, handle)
        logger.info("Model saved successfully.")
    except Exception as err:
        logger.error(f"Failed to save model: {err}")
        raise

def save_results(results_df, file_path):
    """Write the search results frame to ``file_path`` as CSV (no index).

    Failures are logged and re-raised.
    """
    logger.info(f"Saving hyperparameter search results to {file_path}")
    try:
        results_df.to_csv(file_path, index=False)
        logger.info("Hyperparameter search results saved successfully.")
    except Exception as err:
        logger.error(f"Failed to save results: {err}")
        raise

#-----------------------------------------------------------------------------------------------------------------
# Save Predictions
#-----------------------------------------------------------------------------------------------------------------
def save_predictions(X_train_eval, y_train_eval, y_train_pred, y_train_pred_proba, groups_train, train_idx,
                     X_test, y_test, y_test_pred, y_test_pred_proba, groups_test, test_idx,
                     df_processed, label_encoder_group, config):
    """Write the train and test features with their predictions to CSV.

    Each output frame is a copy of the feature frame extended with
    ``'Subject ID'``, ``'Visit'`` (looked up positionally in
    ``df_processed``), ``'Target'``, ``'Predicted'`` and one
    ``'Predicted_Prob_<class>'`` column per entry of
    ``label_encoder_group.classes_``. The frames go to
    ``config.TRAIN_PREDICTIONS_PATH`` and ``config.TEST_PREDICTIONS_PATH``;
    a failed write is logged and re-raised.
    """
    logger.info("Saving predictions to CSV files")

    def _with_predictions(features, subject_ids, row_positions, targets, predicted, probabilities):
        # Copy so the caller's feature frame is not modified.
        frame = features.copy()
        frame['Subject ID'] = subject_ids.values
        frame['Visit'] = df_processed['Visit'].iloc[row_positions].values
        frame['Target'] = targets
        frame['Predicted'] = predicted
        # Predicted probabilities, one column per class label
        for column, class_label in enumerate(label_encoder_group.classes_):
            frame[f'Predicted_Prob_{class_label}'] = probabilities[:, column]
        return frame

    # Both frames are built before anything is written.
    train_data = _with_predictions(
        X_train_eval, groups_train, train_idx, y_train_eval, y_train_pred, y_train_pred_proba
    )
    test_data = _with_predictions(
        X_test, groups_test, test_idx, y_test, y_test_pred, y_test_pred_proba
    )

    # Save to CSV: training first, then test
    outputs = (
        (train_data, config.TRAIN_PREDICTIONS_PATH, "Training", "training"),
        (test_data, config.TEST_PREDICTIONS_PATH, "Test", "test"),
    )
    for frame, destination, title, lowercase in outputs:
        try:
            frame.to_csv(destination, index=False)
            logger.info(f"{title} predictions saved to {destination}")
        except Exception as e:
            logger.error(f"Failed to save {lowercase} predictions: {e}")
            raise

#-----------------------------------------------------------------------------------------------------------------
# Save Label Encoder and Feature Names
#-----------------------------------------------------------------------------------------------------------------
def save_label_encoder(label_encoder, file_path):
    """Pickle ``label_encoder`` to ``file_path``; log and re-raise on failure."""
    logger.info(f"Saving LabelEncoder to {file_path}")
    try:
        with open(file_path, 'wb') as handle:
            pickle.dump(label_encoder, handle)
        logger.info("LabelEncoder saved successfully.")
    except Exception as err:
        logger.error(f"Failed to save LabelEncoder: {err}")
        raise

def save_feature_names(feature_names, file_path):
    """Write ``feature_names`` as a one-column (``'Feature'``) CSV without index.

    Failures are logged and re-raised.
    """
    logger.info(f"Saving feature names to {file_path}")
    try:
        names_frame = pd.DataFrame({'Feature': feature_names})
        names_frame.to_csv(file_path, index=False)
        logger.info("Feature names saved successfully.")
    except Exception as err:
        logger.error(f"Failed to save feature names: {err}")
        raise

#-----------------------------------------------------------------------------------------------------------------
# Save Evaluation Metrics
#-----------------------------------------------------------------------------------------------------------------
def save_evaluation_metrics(train_metrics, test_metrics, cv_metrics, config):
    """Write a three-row metric summary (train, test, CV average) to CSV.

    The table is saved to ``config.METRICS_SAVE_PATH`` without an index;
    a failed write is logged and re-raised.
    """
    logger.info("Saving evaluation metrics")

    # Output column name -> key in each metrics dictionary, in column order
    column_keys = (
        ('Accuracy', 'accuracy'),
        ('Precision', 'precision'),
        ('Recall', 'recall'),
        ('F1 Score', 'f1_score'),
        ('ROC AUC', 'roc_auc'),
        ('PR AUC', 'pr_auc'),
    )
    sources = (train_metrics, test_metrics, cv_metrics)

    metrics_data = {'Dataset': ['Training', 'Test', 'Cross-Validation (Avg)']}
    for column, key in column_keys:
        metrics_data[column] = [source[key] for source in sources]

    metrics_df = pd.DataFrame(metrics_data)
    try:
        metrics_df.to_csv(config.METRICS_SAVE_PATH, index=False)
        logger.info(f"Evaluation metrics saved to {config.METRICS_SAVE_PATH}")
    except Exception as err:
        logger.error(f"Failed to save evaluation metrics: {err}")
        raise

#-----------------------------------------------------------------------------------------------------------------
# Main
#-----------------------------------------------------------------------------------------------------------------
def main():
    """Run the full pipeline: load, split, resample, search, evaluate, save.

    Steps, in order: load the processed CSV, separate features/target/groups,
    label-encode the target, make a group-aware train/test split, resample the
    training part with SMOTE + Tomek links, run the random hyperparameter
    search, then (if a model was found) save the model and search results,
    evaluate, cross-validate, save predictions, encoder, feature names and
    metrics, and print a short summary.
    """
    # Every function in this file logs through a module-level `logger`, but
    # the file itself never creates one, so the first log call would raise
    # NameError. Create it here only if nothing else has defined it already.
    global logger
    if 'logger' not in globals():
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        logger = logging.getLogger(__name__)

    # Initialize configuration
    config = Config()

    # Load data
    df_processed = load_data(config.PROCESSED_DATA_PATH)

    # Preprocess data
    X, y, groups = preprocess_data(df_processed)

    # Encode labels for the full dataset
    label_encoder_group = LabelEncoder()
    y_encoded = label_encoder_group.fit_transform(y)

    # Split data
    X_train, X_test, y_train, y_test, groups_train, groups_test, train_idx, test_idx = split_data(
        X, y, groups, config.TEST_SIZE, config.RANDOM_STATE
    )

    # Encode training and test labels
    y_train_encoded = label_encoder_group.transform(y_train)
    y_test_encoded = label_encoder_group.transform(y_test)

    # Apply SMOTE with Tomek Links to the training data
    X_train_resampled, y_train_resampled = apply_smote_tomek(X_train, y_train_encoded, config.RANDOM_STATE)

    # Compute class weights after resampling
    class_weights = compute_class_weights(y_train_resampled)

    # Run hyperparameter search
    best_model, best_params, search_results = random_hyperparameter_search(
        X_train_resampled, y_train_resampled, X_test, y_test_encoded, class_weights, config
    )

    if best_model is not None:
        logger.info(f"Best Model Parameters: {best_params}")

        # Save the best model
        save_model(best_model, config.MODEL_SAVE_PATH)

        # Save hyperparameter search results
        save_results(search_results, config.SEARCH_RESULTS_PATH)

        # Evaluate the best model on the original (non-resampled) training
        # split and on the test split
        train_metrics, test_metrics, y_train_pred, y_train_pred_proba, y_test_pred, y_test_pred_proba = evaluate_model(
            best_model, X_train, y_train_encoded, X_test, y_test_encoded, label_encoder_group, config
        )

        # Perform cross-validation
        cv_metrics = cross_validate_model(
            best_model, X, y_encoded, groups, label_encoder_group, config, n_splits=config.CV_FOLDS
        )

        # Save predictions
        save_predictions(
            X_train, y_train_encoded, y_train_pred, y_train_pred_proba, groups_train, train_idx,
            X_test, y_test_encoded, y_test_pred, y_test_pred_proba, groups_test, test_idx,
            df_processed, label_encoder_group, config
        )

        # Save LabelEncoder
        save_label_encoder(label_encoder_group, config.LABEL_ENCODER_PATH)

        # Save feature names
        save_feature_names(X_train.columns, config.FEATURE_NAMES_PATH)

        # Save evaluation metrics
        save_evaluation_metrics(train_metrics, test_metrics, cv_metrics, config)

        # Print final metrics
        print("Best Model Metrics (Single Split):")
        print(f"Train ROC AUC: {train_metrics['roc_auc']:.4f}")
        print(f"Test ROC AUC: {test_metrics['roc_auc']:.4f}")
        print(f"Train Recall: {train_metrics['recall']:.4f}")
        print(f"Test Recall: {test_metrics['recall']:.4f}")
        print(f"\nCross-Validation Metrics (Average over {config.CV_FOLDS} folds):")
        for metric, value in cv_metrics.items():
            print(f"Average {metric}: {value:.4f}")
    else:
        logger.warning("No valid models were trained during hyperparameter search.")

if __name__ == "__main__":
    main()