In [None]:

def _predict_labels(model, X, y):
    """Return ``model.predict`` output, passing ``y`` only when predict() takes it."""
    import inspect  # local import: the notebook's import cell is not visible here

    try:
        signature = inspect.signature(model.predict)
    except (TypeError, ValueError):
        # No introspectable signature: use the conventional one-argument call.
        return model.predict(X)

    # Keep the legacy two-argument call for models whose predict() declares y.
    if 'y' in signature.parameters:
        return model.predict(X, y)

    try:
        signature.bind(X)
    except TypeError:
        # predict() has another required argument; fall back to the legacy call.
        return model.predict(X, y)

    # sklearn-style predict(X): passing y here would raise or be misread as
    # an unrelated second parameter.
    return model.predict(X)


def _positive_class_scores(model, X):
    """Return one score per sample from ``predict_proba``, or None if unavailable."""
    try:
        proba = np.asarray(model.predict_proba(X))
    except Exception:
        # Deliberate best effort: a model without usable probabilities simply
        # gets no ROC AUC instead of aborting the whole evaluation.
        return None

    if proba.ndim == 2:
        n_columns = proba.shape[1]
        if n_columns == 2:
            # (n_samples, 2) layout: the second column belongs to the greater
            # class label, which roc_auc_score treats as the positive class.
            return proba[:, 1]
        if n_columns == 1:
            return proba[:, 0]
        # More than two columns: there is no single score column to use.
        return None
    return proba.reshape(-1)


def evaluate_model(model, X, y, metrics=None, average='binary', verbose=True):
    """
    Evaluate a trained classifier with common metrics and an optional plot.

    Parameters
    ----------
    model : object
        Trained model with predict() and optionally predict_proba().
        predict() is called with X only, unless its signature declares ``y``
        or cannot be called with X alone, in which case (X, y) is passed.
    X : pd.DataFrame or np.ndarray
        Test features.
    y : pd.Series or np.ndarray
        True labels.
    metrics : list[str], optional
        Which metrics to compute (default: all).
        Options: ['accuracy','precision','recall','f1','roc_auc']
    average : str, default='binary'
        Averaging method forwarded to precision/recall/F1. The default
        'binary' is meant for two-class targets; choose e.g. 'macro',
        'micro' or 'weighted' when y has more than two classes.
    verbose : bool, default=True
        Whether to print the summary and classification report and show the
        confusion-matrix plot.

    Returns
    -------
    results : dict
        One entry per requested metric that could be computed, plus
        'confusion_matrix' (a DataFrame). 'roc_auc' is present only when it
        was requested, predict_proba() yielded per-sample scores and y
        contains exactly two classes.
    """

    if metrics is None:
        metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

    y_pred = np.asarray(_predict_labels(model, X, y)).reshape(-1)
    y_prob = _positive_class_scores(model, X)

    true_labels = np.unique(y)

    results = {}
    if 'accuracy' in metrics:
        results['accuracy'] = accuracy_score(y, y_pred)
    if 'precision' in metrics:
        results['precision'] = precision_score(y, y_pred, average=average, zero_division=0)
    if 'recall' in metrics:
        results['recall'] = recall_score(y, y_pred, average=average, zero_division=0)
    if 'f1' in metrics:
        results['f1'] = f1_score(y, y_pred, average=average, zero_division=0)
    if 'roc_auc' in metrics and y_prob is not None and len(true_labels) == 2:
        results['roc_auc'] = roc_auc_score(y, y_prob)

    # The matrix must also cover classes that only appear in the predictions;
    # otherwise its shape would not match the row/column labels below.
    labels = true_labels
    predicted_only = np.setdiff1d(y_pred, true_labels)
    if predicted_only.size:
        labels = np.union1d(true_labels, predicted_only)

    cm = confusion_matrix(y, y_pred, labels=labels)
    cm_df = pd.DataFrame(cm,
                         index=[f'True_{c}' for c in labels],
                         columns=[f'Pred_{c}' for c in labels])
    results['confusion_matrix'] = cm_df

    if verbose:
        print("📊 Model Evaluation Summary")
        print(pd.Series(results).drop('confusion_matrix', errors='ignore'))
        print("\nConfusion Matrix:\n", cm_df)
        print("\nClassification Report:")
        print(classification_report(y, y_pred, zero_division=0))

        plt.figure(figsize=(4,4))
        plt.imshow(cm, cmap='Blues', interpolation='nearest')
        plt.title('Confusion Matrix')
        plt.colorbar()
        # Label the axes with the class values so the cells are identifiable.
        tick_positions = np.arange(len(labels))
        plt.xticks(tick_positions, labels)
        plt.yticks(tick_positions, labels)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.show()

    return results

In [None]:
# # Assuming you already have: model.fit(X_train, y_train)
# eval_results = evaluate_model(model, X_test, y_test)

# # Access metrics directly
# print(eval_results['accuracy'])
# print(eval_results.get('roc_auc'))  # key is absent unless y is binary and predict_proba worked
# eval_results['confusion_matrix']