## Import Necessary Libraries

In [None]:
import os
import sys

import pandas as pd

import model_metrics
from model_tuner import loadObjects
from ucimlrepo import fetch_ucirepo
from eda_toolkit import ensure_directory

# Record the library and interpreter versions in the notebook output so a
# reader can tell which environment produced the results below.
print(f"Model Metrics version: {model_metrics.__version__}")

# `sys.version` is the documented way to read the interpreter version;
# `os.sys` only resolves because the `os` module imports `sys` internally.
print(f"Python version: {sys.version}")

## Set Model Path

In [None]:
print(f"Model Metrics version: {model_metrics.__version__}")
print(f"Model Metrics authors: {model_metrics.__author__} \n")

## Define base paths
## `base_path` is the parent of the current working directory, i.e. one level
## up from the folder this notebook is run from.
base_path = os.pardir

## Everything lives under `<base_path>/model_files`; the saved models sit in
## its `results` sub-folder. Each path component is passed separately so that
## `os.path.join` inserts the platform's own separator.
data_path = os.path.join(base_path, "model_files")
model_path = os.path.join(data_path, "results")
image_path_png = os.path.join(data_path, "images", "png_images")
image_path_svg = os.path.join(data_path, "images", "svg_images")

# Ensure the model and image output directories exist
ensure_directory(model_path)
ensure_directory(image_path_png)
ensure_directory(image_path_svg)

## Create a synthetic dataset

Let us create a synthetic dataset for classification using `make_classification` from `sklearn.datasets`.
This dataset will be used to train and evaluate multiple models.

In [None]:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Build a reproducible 1000-row, 10-feature toy classification problem
X, y = make_classification(n_samples=1000, n_features=10, random_state=42)

# Hold out 20% of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the two classifiers that the later examples compare
model1 = LogisticRegression(random_state=42)
model1.fit(X_train, y_train)

model2 = RandomForestClassifier(random_state=42)
model2.fit(X_train, y_train)

# Display names, listed in the same order as [model1, model2]
model_titles = ["Logistic Regression", "Random Forest"]

## Retrieve Predictions from Adult Income Dataset

In [None]:
# Download the Adult Income dataset from the UCI ML Repository (dataset id 2)
adult = fetch_ucirepo(id=2)

# Keep only the feature table (ucimlrepo exposes the data as pandas
# DataFrames); its categorical columns are joined onto the test split later.
X_ai = adult.data.features

# X.to_csv("../data/X.csv")

In [None]:
# Load the previously saved model objects (.pkl files) from `model_path`
model_lr = loadObjects(os.path.join(model_path, "LogisticRegression.pkl"))
model_dt = loadObjects(os.path.join(model_path, "DecisionTreeClassifier.pkl"))
model_rf = loadObjects(os.path.join(model_path, "RandomForestClassifier.pkl"))

# Test features/labels for the Adult Income models, stored as parquet files
# under `data_path`
X_test_ai = pd.read_parquet(os.path.join(data_path, "X_test.parquet"))
y_test_ai = pd.read_parquet(os.path.join(data_path, "y_test.parquet"))

### Set The Desired Naming Conventions

In [None]:
# Pull the underlying estimator out of each loaded object.
# NOTE(review): assumes every loaded object is a mapping with a "model" entry
# that exposes `.estimator` — confirm against how the models were saved.
pipelines_or_models = [
    model_lr["model"].estimator,
    model_rf["model"].estimator,
    model_dt["model"].estimator,
]

# Model titles, listed in the same order as `pipelines_or_models`
model_titles_ai = [
    "Logistic Regression",
    "Random Forest Classifier",
    "Decision Tree Classifier",
]

In [None]:
# Attach selected categorical columns from the raw Adult Income feature table
# to the test features. `DataFrame.join` aligns rows on the index.
# NOTE(review): assumes X_test_ai kept the row index of the full dataset and
# does not already contain columns with these names — confirm.
X_test_2_ai = X_test_ai.join(
    X_ai[["sex", "race", "relationship", "occupation", "workclass", "education"]]
)

## Summarize model performance

### Binary Classification Example 1: Default Threshold

Here, we summarize the performance of multiple models using the default threshold of 0.5 for classification.

In [None]:
from model_metrics import summarize_model_performance

# Summarize both synthetic-data classifiers on the held-out split. No
# `custom_threshold` is passed, so the default classification threshold is
# used; `return_df=True` requests the summary as a DataFrame.
model_performance = summarize_model_performance(
    model=[model1, model2],
    model_title=model_titles,
    X=X_test,
    y=y_test,
    model_type="classification",
    return_df=True,
)

# Last expression in the cell, so the notebook renders the summary
model_performance

### Binary Classification Example 2: Custom Threshold

In this example, we revisit binary classification with the same two models: Logistic Regression and Random Forest, but adjust the classification threshold (`custom_threshold` input in this case) from the default 0.5 to 0.2. This change allows us to explore how lowering the threshold impacts model performance, potentially increasing sensitivity (recall) by classifying more instances as positive (1) at the expense of precision.

In [None]:
from model_metrics import summarize_model_performance

# Same two classifiers and test split as the default-threshold example; the
# only change is `custom_threshold=0.2`, which lowers the cut-off used to
# label an instance as positive.
model_performance = summarize_model_performance(
    model=[model1, model2],
    model_title=model_titles,
    X=X_test,
    y=y_test,
    model_type="classification",
    return_df=True,
    custom_threshold=0.2,
)

# Last expression in the cell, so the notebook renders the summary
model_performance

### Regression Model Examples w/ Diabetes Dataset

In the next few cells, we will demonstrate model performance summarization for regression models using the diabetes dataset from `sklearn.datasets`.
This dataset will be used to train and evaluate multiple regression models, and we will summarize their performance.

In [None]:
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load the diabetes data as a single DataFrame, then separate the predictors
# from the "target" column
diabetes = load_diabetes(as_frame=True)["frame"]
y_diabetes = diabetes["target"]
X_diabetes = diabetes.drop(columns=["target"])

# 80/20 train/test split with a fixed seed
(
    X_train_diabetes,
    X_test_diabetes,
    y_train_diabetes,
    y_test_diabetes,
) = train_test_split(X_diabetes, y_diabetes, test_size=0.2, random_state=42)

# Ordinary least squares, fitted directly on the unscaled features
linear_model = LinearRegression().fit(X_train_diabetes, y_train_diabetes)

# Random forest, also fitted on the unscaled features
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_diabetes, y_train_diabetes
)

# Ridge regression wrapped in a pipeline so the features are standardized
# before they reach the estimator
ridge_model = Pipeline(
    [("scaler", StandardScaler()), ("estimator", Ridge(alpha=1.0))]
).fit(X_train_diabetes, y_train_diabetes)

#### Regression Example 1: Summarize Model Performance: Linear, Ridge

In [None]:
from model_metrics import summarize_model_performance

# Compare the two linear models on the diabetes test split. `return_df=True`
# requests the summary as a DataFrame; `decimal_places=2` sets the rounding of
# the reported values.
regression_metrics = summarize_model_performance(
    model=[linear_model, ridge_model],
    model_title=["Linear Regression", "Ridge Regression"],
    X=X_test_diabetes,
    y=y_test_diabetes,
    model_type="regression",
    return_df=True,
    decimal_places=2,
)

# Last expression in the cell, so the notebook renders the summary
regression_metrics

The output below presents a detailed comparison of the performance and coefficients for two regression models: Linear Regression and Ridge Regression trained on the diabetes dataset. It includes overall metrics such as Mean Absolute Error (MAE), Mean Absolute Percentage Error (MAPE), Mean Squared Error (MSE), Root Mean Squared Error (RMSE), Explained Variance, and $R^{2}$ Score for each model, showing their predictive accuracy. Additionally, it lists the coefficients for each feature (e.g., age, bmi, s1–s6) in both models, highlighting how each variable contributes to the prediction.

#### Regression Example 2: Summarize Model Performance: Linear, Ridge, RF (w/ Feature Importance)

In this Regression Example 2, we extend the analysis by introducing a Random Forest Regressor alongside Linear Regression and Ridge Regression to demonstrate how a model with feature importances, rather than coefficients, impacts evaluation outcomes. The code uses the `summarize_model_performance` function from `model_metrics` to assess all three models on the diabetes dataset’s test set, ensuring the Random Forest’s feature importance-based predictions are reflected in the results while preserving the coefficient-based results of the other models, as shown in the subsequent table.

In [None]:
from model_metrics import summarize_model_performance

# Same call as the Linear/Ridge example, with the Random Forest regressor
# added as a third model; titles are listed in the same order as the models.
regression_metrics = summarize_model_performance(
    model=[linear_model, ridge_model, rf_model],
    model_title=["Linear Regression", "Ridge Regression", "Random Forest"],
    X=X_test_diabetes,
    y=y_test_diabetes,
    model_type="regression",
    return_df=True,
    decimal_places=2,
)

# Last expression in the cell, so the notebook renders the summary
regression_metrics

#### Regression Example 3: Summarize Model Performance (Adjusted $R^2$)

In some regression analyses, it is useful to report **Adjusted R²** in addition 
to standard error and variance metrics. Adjusted R² accounts for the number of 
predictors in the model and penalizes unnecessary complexity, making it more 
appropriate than R² when comparing models with different feature counts.

In [None]:
from model_metrics import summarize_model_performance

# Three-model regression summary with `include_adjusted_r2=True`, which asks
# for Adjusted R² to be reported alongside the other metrics.
regression_metrics = summarize_model_performance(
    model=[linear_model, ridge_model, rf_model],
    model_title=["Linear Regression", "Ridge Regression", "Random Forest"],
    X=X_test_diabetes,
    y=y_test_diabetes,
    model_type="regression",
    return_df=True,
    include_adjusted_r2=True,
    decimal_places=2,
)

# Last expression in the cell, so the notebook renders the summary
regression_metrics

#### Regression Example 4 - Summarize Model Performance (Overall Results)

In some scenarios, you may want to simplify the output by excluding variables, coefficients, and feature importances from the model results. This example demonstrates how to achieve that by setting `overall_only=True` in the `summarize_model_performance` function, producing a concise table that focuses on key metrics: model name, Mean Absolute Error (MAE), Mean Absolute Percentage Error (MAPE), Mean Squared Error (MSE), Root Mean Squared Error (RMSE), Explained Variance, and $R^2$ Score.

In [None]:
from model_metrics import summarize_model_performance

# `overall_only=True` restricts the output to the overall metrics, leaving out
# the per-variable coefficients and feature importances.
regression_metrics = summarize_model_performance(
    model=[linear_model, ridge_model, rf_model],
    model_title=["Linear Regression", "Ridge Regression", "Random Forest"],
    X=X_test_diabetes,
    y=y_test_diabetes,
    model_type="regression",
    overall_only=True,
    return_df=True,
    decimal_places=2,
)

# Last expression in the cell, so the notebook renders the summary
regression_metrics

#### Regression Example 5: Printed Table with Adjusted $R^2$

In this Regression Example 5, we illustrate how to generate a printed table that includes the Adjusted $R^2$ metric for regression models. By setting the `include_adjusted_r2` parameter to `True` in the `summarize_model_performance` function, we ensure that the output table provides a comprehensive view of model performance, including both $R^2$ and Adjusted $R^2$ values, which account for the number of predictors in the model. This allows for a more nuanced evaluation of model effectiveness, especially when comparing models with different numbers of features. Here, we toggle `return_df` to `False` (or simply do not pass it) to display the results directly as a printed table.

In [None]:
from model_metrics import summarize_model_performance

# `return_df` is deliberately not passed, so the summary is shown as a printed
# table instead of being returned as a DataFrame.
# `include_adjusted_r2=True` adds the Adjusted R² metric that this example is
# meant to demonstrate.
regression_metrics = summarize_model_performance(
    model=[linear_model, ridge_model, rf_model],
    model_title=["Linear Regression", "Ridge Regression", "Random Forest"],
    X=X_test_diabetes,
    y=y_test_diabetes,
    model_type="regression",
    include_adjusted_r2=True,
    decimal_places=2,
)

regression_metrics

## Residual Diagnostics

Residual diagnostics are essential tools for evaluating regression model performance beyond 
standard metrics like $R^2$ or RMSE. By examining the patterns in residuals: the differences 
between observed and predicted values, we can identify violations of modeling assumptions, 
detect systematic errors, and uncover opportunities for model improvement.

The `show_residual_diagnostics` function provides comprehensive visualization of residual 
patterns across multiple dimensions:

- **Residuals vs Fitted Values**: Assess homoscedasticity (constant variance) and identify non-linear patterns
- **Residuals vs Predictors**: Examine whether specific features are associated with systematic prediction errors
- **Q-Q Plots**: Evaluate whether residuals follow a normal distribution
- **Histogram of Residuals**: Visualize the distribution shape and identify outliers
- **Scale-Location Plots**: Detect heteroscedasticity (non-constant variance)

**What Good Residuals Look Like:**

- Randomly scattered around zero with no systematic patterns
- Constant spread across the range of fitted values (homoscedasticity)
- Approximately normally distributed (for inference and prediction intervals)
- No strong correlations with individual predictor variables

**What Bad Residuals Reveal:**

- **Funnel shapes** (heteroscedasticity): Variance increases/decreases with predicted values, suggesting transformations may be needed
- **Curved patterns**: Non-linear relationships that the model hasn't captured
- **Clusters or groups**: Systematic differences across subpopulations that may require interaction terms or stratified models
- **Heavy tails or skewness**: Outliers or violations of normality assumptions
- **Patterns vs predictors**: Missing interaction effects or non-linear relationships with specific features


### Residual Diagnostics Example 1: Comprehensive Residual Diagnostics Plots

This first example demonstrates a complete residual diagnostic analysis for a single 
regression model using `plot_type="all"`. This setting generates all available diagnostic 
visualizations in a single comprehensive display:

1. **Residuals vs Fitted Values**: Detects non-linearity, heteroscedasticity, and outliers
2. **Q-Q Plot**: Assesses normality of residuals
3. **Scale-Location Plot**: Evaluates homoscedasticity (constant variance)
4. **Residuals vs Leverage**: Identifies influential observations
5. **Histogram of Residuals**: Shows the distribution shape

We evaluate a Random Forest model trained on the diabetes dataset. The `n_clusters=3` 
parameter performs k-means clustering on the residuals to identify groups of observations 
with similar prediction error patterns. Setting `show_centroids=True` overlays cluster 
centers on the residual plots, styled with custom colors and markers via `centroid_kwgs`.

The `kmeans_rstate=222` parameter controls the random seed for k-means clustering, ensuring 
reproducible cluster assignments across repeated runs. By default, `kmeans_rstate` is set 
to 42, making clustering deterministic unless explicitly changed. This is important because 
k-means uses random initialization; different seeds can produce slightly different cluster 
assignments, especially when clusters overlap or are of similar size. Setting a fixed seed 
ensures that diagnostic plots remain consistent for documentation, presentations, and 
collaborative analysis.

To formally test for heteroscedasticity, we enable `heteroskedasticity_test="breusch_pagan"`. 
This optional parameter runs the Breusch-Pagan test, which evaluates whether residual 
variance is systematically related to predicted values. Test results, including the test 
statistic, *p*-value, and interpretation, are printed to the console. A significant result 
(*p* < 0.05) indicates heteroscedasticity, suggesting that predictions may be more reliable 
for certain ranges of the response variable than others.

Additional customization options include:

- `n_cols=2`: Arranges diagnostic plots in a 2-column grid layout
- `histogram_type="density"`: Displays residuals as a density plot rather than raw counts
- `decimal_places=2`: Controls precision of printed test statistics
- `tick_fontsize` and `label_fontsize`: Adjust text sizing for readability
- `save_plot=True` with image paths: Exports plots as PNG and SVG for reports

The function also returns a diagnostics dictionary containing residuals, fitted values, 
standardized residuals, and leverage statistics. This allows for programmatic access to 
diagnostic quantities for custom analyses or integration with `resid_diagnostics_to_dataframe` 
to convert results into a pandas DataFrame for further exploration.

In [None]:
# Test-set predictions from each fitted regressor; the residual diagnostics
# examples take predictions (`y_pred`) rather than the model objects.
linear_pred = linear_model.predict(X_test_diabetes)
rf_pred = rf_model.predict(X_test_diabetes)
ridge_pred = ridge_model.predict(X_test_diabetes)

In [None]:
from model_metrics import show_residual_diagnostics

# Full diagnostic suite (`plot_type="all"`) for the Random Forest predictions,
# laid out in two columns and saved to both image directories.
show_residual_diagnostics(
    y_pred=rf_pred,
    model_title=["Random Forest"],
    X=X_test_diabetes,
    y=y_test_diabetes,
    n_clusters=3,  # number of k-means clusters
    n_cols=2,
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    tick_fontsize=12,
    label_fontsize=14,
    plot_type="all",
    show_centroids=True,  # overlay the cluster centers on the plots
    # One color per cluster (three entries to match n_clusters=3)
    centroid_kwgs={"c": ["red", "blue", "green"], "marker": "X", "s": 50},
    heteroskedasticity_test="breusch_pagan",
    decimal_places=2,
    histogram_type="density",
    kmeans_rstate=222,  # fixed k-means seed so cluster assignments repeat
)

### Residual Diagnostics Example 2: Single Plot

This example demonstrates two key capabilities for focused residual analysis:

1. **Selective plot generation**: The `plot_type` parameter allows you to generate specific diagnostic plots rather than the full suite. Pass a single plot name as a string (e.g., `"fitted"`) or a list of plot names for multiple specific plots (e.g., `["fitted", "qq", "histogram"]`). This is useful when you need to examine particular model assumptions or create targeted visualizations for reports.

2. **LOWESS trend detection**: Setting `show_lowess=True` adds a locally weighted scatterplot smoothing (LOWESS) curve to residual plots. This non-parametric smoothing line reveals systematic patterns or trends in the residuals that might not be obvious from the scatter alone. If model assumptions hold, the LOWESS line should be roughly horizontal at y=0. Pronounced curves or trends indicate potential violations of linearity or suggest that the model is systematically over- or under-predicting in certain regions.

We focus on the Scale-Location plot (`plot_type="scale_location"`), which is particularly useful for detecting heteroscedasticity: the violation of the constant variance assumption. This plot displays the square root of standardized residuals against fitted values, making it easier to spot changes in residual spread across the prediction range. The LOWESS smoothing line, styled in red via `lowess_kwgs`, helps identify whether variance increases, decreases, or remains stable as predictions change.
The `heteroskedasticity_test="breusch_pagan"` parameter formally tests for 
heteroscedasticity. The Breusch-Pagan test evaluates whether residual variance 
is systematically related to the predictors or fitted values. Test results appear 
in the plot legend (if space permits) or can be displayed in a diagnostic table 
using `show_diagnostics_table=True`. A significant result (*p* < 0.05) provides 
statistical evidence of heteroscedasticity, which may require remedial measures 
such as variance-stabilizing transformations, weighted least squares, or robust 
standard errors.

In [None]:
from model_metrics import show_residual_diagnostics

# Single Scale-Location plot for the Random Forest predictions, with a LOWESS
# trend line drawn over custom-styled points.
# NOTE(review): `show_centroids`, `histogram_type`, and `kmeans_rstate` look
# carried over from Example 1 — confirm they have any effect here, since no
# `n_clusters` or `group_category` is passed and no histogram is drawn.
show_residual_diagnostics(
    y_pred=rf_pred,
    model_title=["Random Forest"],
    X=X_test_diabetes,
    y=y_test_diabetes,
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    tick_fontsize=12,
    label_fontsize=14,
    figsize=(10, 8),
    plot_type="scale_location",
    point_kwgs={"alpha": 0.9, "color": "blue", "edgecolor": "black", "s": 50},
    show_lowess=True,
    lowess_kwgs={"color": "red", "lw": 2},  # LOWESS line style: red, width 2
    show_centroids=True,
    heteroskedasticity_test="breusch_pagan",
    decimal_places=2,
    histogram_type="density",
    kmeans_rstate=222,
    suptitle="",  # empty string: no overall figure title
)

### Residual Diagnostics Example 3: Diagnostics Table Only

This example demonstrates how to generate a comprehensive residual diagnostics summary 
table **without displaying plots**. By setting `show_plots=False` and 
`show_diagnostics_table=True`, the function outputs only a tabular summary of key 
diagnostic statistics and heteroscedasticity test results.

The diagnostics table includes:

- **Residual statistics**: Mean, standard deviation, min, max, and quartiles
- **Standardized residual metrics**: Useful for identifying outliers ($|z| > 3$)
- **Heteroscedasticity test results**: When `heteroskedasticity_test` is specified

In this example, we set `heteroskedasticity_test="all"` to run **all available tests**:

- **Breusch-Pagan**: Tests whether residual variance depends on predicted values
- **White**: A more general test that doesn't assume a specific functional form
- **Goldfeld-Quandt**: Compares variance between two subsamples
- **Spearman**: Rank-correlation test for a monotonic relationship between residual magnitude and fitted values

Each test returns a test statistic, p-value, and interpretation. The `decimal_places=5` 
parameter ensures high precision in the printed output, which is useful for reporting 
results in research papers or technical documentation.

The `return_diagnostics=True` parameter returns a dictionary containing all diagnostic 
quantities (residuals, fitted values, standardized residuals, leverage, etc.) for 
programmatic access or conversion to a DataFrame using `resid_diagnostics_to_dataframe`.

**Note:** You can also display **both the table and plots simultaneously** by setting 
`show_plots=True` and `show_diagnostics_table=True` together. This provides a 
comprehensive view combining visual diagnostics with quantitative summaries, ideal for 
thorough model evaluation reports.

Additional parameters used:

- `plot_type="histogram"`: Specifies which plot type to generate (only relevant if `show_plots=True`)
- `n_clusters=3` and `show_centroids=True`: Configures k-means clustering (applied to returned diagnostics)
- `save_plot=True`: Would save plots if `show_plots=True`

In [None]:
from model_metrics import show_residual_diagnostics

# Table-only run: `show_plots=False` suppresses the figures while
# `show_diagnostics_table=True` prints the summary table. The plot-related
# arguments (plot_type, save_plot, fonts, ...) are kept so that switching
# `show_plots` back on reproduces the plots without further changes.
# `return_diagnostics=True` makes the call return the diagnostics dictionary,
# which the next example converts to a DataFrame.
diagnostics = show_residual_diagnostics(
    y_pred=rf_pred,
    model_title=["Random Forest"],
    X=X_test_diabetes,
    y=y_test_diabetes,
    n_clusters=3,
    n_cols=2,
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    tick_fontsize=12,
    label_fontsize=14,
    plot_type="histogram",
    show_centroids=True,
    centroid_kwgs={"c": ["red", "blue", "green"], "marker": "X", "s": 50},
    heteroskedasticity_test="all",  # run every available test
    legend_loc="upper right",
    show_diagnostics_table=True,
    return_diagnostics=True,
    show_plots=False,
    decimal_places=5,  # higher precision for the printed statistics
)

### Residual Diagnostics Example 4: Diagnostics to DataFrame

Building on Example 3, the diagnostics dictionary returned by `show_residual_diagnostics()` 
can be converted into a pandas DataFrame for programmatic analysis, reporting, or integration 
into automated pipelines. 

The `resid_diagnostics_to_dataframe()` helper function handles 
this conversion seamlessly, properly flattening nested structures like heteroskedasticity 
test results. Unlike the console table which only displays *p*-values and interpretations, 
the DataFrame provides complete test results including both the test statistics and 
*p*-values: useful for creating custom reports, academic papers, or detailed model 
documentation that requires full statistical disclosure.

In [None]:
from model_metrics import resid_diagnostics_to_dataframe

# Convert the diagnostics dictionary returned by the table-only example
# (`return_diagnostics=True`) into a pandas DataFrame and display it.
df = resid_diagnostics_to_dataframe(diagnostics)
df

### Residual Diagnostics Example 5: Grouped Analysis with Customization

A particularly powerful extension of residual diagnostics is stratification by categorical 
variables such as demographic groups, treatment arms, or geographic regions. By examining 
residuals separately for each subgroup, we can identify whether:

- Model performance is consistent across populations
- Systematic bias exists for specific groups
- Variance differs across subgroups (heteroscedasticity by group)
- Interaction effects between predictors and group membership are present

This is especially critical in applications where fairness and equity matter such as 
healthcare, lending, and social services, where models should not systematically  
under-predict or over-predict for protected or vulnerable populations.

This example demonstrates how to examine residual patterns across categorical subgroups 
using the `group_category` parameter. By stratifying residual diagnostics by a 
categorical variable: such as sex, age group, or treatment arm, we can identify whether 
model errors are consistent across subpopulations or if certain groups exhibit systematic 
bias or heteroscedasticity.

In this example, we evaluate the Random Forest model trained on the diabetes dataset. 
The `sex` variable in the original dataset is encoded numerically (positive/negative 
values), so we first transform it into interpretable categories ("Male" and "Female") 
before passing it to `show_residual_diagnostics`.
The `plot_type="predictors"` option generates residual plots for each predictor variable, 
with points color-coded by the categorical group. This allows us to visually assess whether:

- Residuals are centered around zero for both groups
- Variance is similar across groups (homoscedasticity)
- Any systematic patterns exist that might indicate interaction effects or model misspecification

When `show_centroids=True` is enabled, group centroids are overlaid on the plots to 
highlight the mean residual behavior for each subgroup. The `centroid_kwgs` parameter 
allows customization of these centroids with specific colors, markers, and sizes to 
distinguish between groups clearly.

This type of analysis is particularly valuable in healthcare and social science applications 
where fairness and equity are critical concerns. Identifying residual patterns by demographic 
variables can reveal whether a model's predictions are systematically biased against 
specific subpopulations, informing decisions about model refinement or the need for 
group-specific calibration.

In [None]:
# The 'sex' column is already categorical-like (coded as positive/negative values)
# Let's make it more interpretable
# Work on a copy so the frame the fitted models were evaluated on is left
# unchanged, then derive a readable label from the sign of the numeric `sex`
# code: positive values become "Male", everything else "Female".
# NOTE(review): which sign corresponds to which sex is an assumption of this
# notebook — confirm against the dataset documentation.
X_test_diab_copy = X_test_diabetes.copy()
X_test_diab_copy["sex_category"] = [
    "Male" if sex_code > 0 else "Female"
    for sex_code in X_test_diab_copy["sex"]
]

In [None]:
# Preview the copied test frame, now including the `sex_category` column
X_test_diab_copy

In [None]:
from model_metrics import show_residual_diagnostics

# Residuals-vs-predictor plots for the Random Forest predictions, with points
# grouped by `sex_category` and one centroid drawn per group.
show_residual_diagnostics(
    y_pred=rf_pred,
    model_title="Random Forest",
    # Only the plotted predictors plus the grouping column are passed in
    X=X_test_diab_copy[["age", "bmi", "sex_category"]],
    y=y_test_diabetes,
    plot_type="predictors",
    show_centroids=True,
    centroid_type="groups",  # centroids per group rather than per cluster
    group_category="sex_category",
    # Two entries each: one color per sex_category level
    centroid_kwgs={"c": ["red", "blue"], "marker": "X", "s": 50},
    group_kwgs={
        "color": ["#1f77b4", "#ff7f0e"],  # Custom hex colors
        "alpha": 0.8,
        "s": 60,
        "edgecolors": "black",
    },
    # legend_loc="bottom",
    heteroskedasticity_test="all",
    # figsize=(12, 8),
    tick_fontsize=14,
    label_fontsize=16,
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    suptitle="",  # empty string: no overall figure title
    decimal_places=2,
    kmeans_rstate=222,
    # Optional legend configuration, left disabled:
    # legend_kwgs={
    #     "show_groups": True,
    #     "show_centroids": False,
    #     "show_het_tests": True,
    #     "show_cooks": True,
    # },
    # legend_kwgs=False,
    legend_loc="bottom",
    xlim=(-0.15, 0.15),  # shared x-range for the predictor axes
)

### Residual Diagnostics Example 6: Multiple Models with Shared Axes

This example demonstrates how to compare residual diagnostics across **multiple models** with 
**standardized axis limits** for direct visual comparison. By passing a list of 
predictions and model names, you can evaluate multiple models side-by-side using any 
diagnostic plot type.

Key features demonstrated:

**Model Comparison Across Any Plot Type**
   While this example uses histograms (`plot_type="histogram"`), the same approach 
   works for **all diagnostic plot types**:
   
   - **Residuals vs Fitted** (`plot_type="fitted"`): Compare linearity assumptions
   - **Q-Q Plots** (`plot_type="qq"`): Compare normality of residuals
   - **Scale-Location** (`plot_type="scale_location"`): Compare homoscedasticity
   - **Leverage Plots** (`plot_type="leverage"`): Compare influential observations
   - **All plots** (`plot_type="all"`): Generate all 6 diagnostic plots for each model
   
   Simply change the `plot_type` parameter while keeping the same multi-model structure 
   (`y_pred=[pred1, pred2]`, one array of predictions per model) to create comprehensive 
   cross-model comparisons for any diagnostic.

**Shared Axis Limits**
   - `xlim=(-175, 175)`: Standardizes the x-axis (residual values) across both models
   - `ylim=(0, 10)`: Standardizes the y-axis (frequency counts) across both models
   
   This ensures that visual differences in residual distributions reflect actual model 
   performance rather than different axis scales.

**Group-Based Analysis**
   The `group_category="sex_category"` parameter colors points by sex, with custom 
   styling via `group_kwgs`. When combined with `show_centroids=True` and 
   `centroid_type="groups"`, group-specific centroids are displayed to reveal 
   whether residual patterns differ across demographic groups.

**Legend Positioning**
   Setting `legend_loc="bottom"` places legends below the x-axis with proper spacing. 
   The function automatically adds vertical space to accommodate bottom legends when 
   using default figure sizes.

**Comprehensive Heteroscedasticity Testing**
   `heteroskedasticity_test="all"` runs all four available tests (Breusch-Pagan, 
   White, Goldfeld-Quandt, Spearman) and displays results in the legend. This helps 
   identify whether residual variance is constant across fitted values.

**Title and Layout Customization**
   - `suptitle=""`: Suppresses the overall figure title for a cleaner look
   - `text_wrap=35`: Wraps subplot titles at 35 characters
   - `n_cols=2`: Arranges subplots in 2 columns for side-by-side comparison

**When to Use Multi-Model Comparison**
   Multi-model comparison is particularly valuable when:
   
   - Comparing different algorithms (e.g., linear vs tree-based models)
   - Evaluating hyperparameter tuning results (e.g., different regularization strengths)
   - Assessing feature engineering impact (e.g., with vs without transformations)
   - Creating model selection documentation for reports or publications

**When to Use Shared Axes**
   Shared axis limits (`xlim`/ `ylim`) are recommended when models have similar scales. 
   If models produce residuals on very different scales, omit these parameters to let 
   each subplot use optimal ranges.

In [None]:
from model_metrics import show_residual_diagnostics

# Side-by-side residual histograms for two models. `y_pred` and `model_title`
# are parallel lists (one entry per model), and the fixed `xlim`/`ylim` put
# both subplots on the same axes so the distributions compare directly.
show_residual_diagnostics(
    y_pred=[ridge_pred, rf_pred],
    model_title=["Ridge Regression", "Random Forest"],
    X=X_test_diab_copy[["age", "bmi", "sex_category"]],
    y=y_test_diabetes,
    plot_type="histogram",
    # histogram_type="density",
    show_centroids=True,
    centroid_type="groups",
    group_category="sex_category",
    centroid_kwgs={"c": ["red", "blue"], "marker": "X", "s": 50},
    group_kwgs={
        "color": ["#1f77b4", "#ff7f0e"],  # Custom hex colors
        "alpha": 0.8,
        "s": 60,
        "edgecolors": "black",
    },
    # legend_loc="bottom",
    heteroskedasticity_test="all",
    figsize=(12, 6),
    tick_fontsize=14,
    label_fontsize=16,
    # show_lowess=True,
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    suptitle="",  # empty string: no overall figure title
    n_cols=2,  # one column per model
    kmeans_rstate=222,
    # Optional legend configuration, left disabled:
    # legend_kwgs={
    #     "show_groups": True,
    #     "show_centroids": False,
    #     "show_het_tests": True,
    #     "show_cooks": True,
    # },
    # legend_kwgs=False,
    text_wrap=35,  # wrap subplot titles at 35 characters
    # n_rows=1,
    legend_loc="bottom",
    xlim=(-175, 175),  # residual range shared by both subplots
    ylim=(0, 10),  # count range shared by both subplots
)

## Lift Charts

This section illustrates how to assess and compare the ranking effectiveness of
classification models using Lift Charts, a valuable tool for evaluating how well 
a model prioritizes positive instances relative to random chance. Leveraging the 
Logistic Regression, Decision Tree, and Random Forest Classifier models trained 
in the [Binary Classification Models section](https://lshpaner.github.io/model_metrics_docs/performance_assessment.html#binary-classification), 
we plot Lift curves to visualize their relative ability to surface high-value (positive) cases at the top of the prediction list.

A Lift Chart plots the ratio of actual positives identified by the model compared 
to what would be expected by random selection, across increasingly larger proportions 
of the sample sorted by predicted probability. The baseline (Lift = 1) represents 
random chance; curves that rise above this line demonstrate the model's ability to
"lift" positive outcomes toward the top ranks. This makes Lift Charts especially
useful in applications like marketing, fraud detection, and risk stratification where
targeting the top segment of predictions can yield outsized value. 
See the [mathematical definition of Lift here](https://lshpaner.github.io/model_metrics_docs/conceptual_notes.html#lift-mathematical-definition).

The `show_lift_chart` function enables flexible creation of Lift Charts for one or more 
models. It supports single-plot overlays, subplot layouts, and detailed customization of 
labels, titles, and styling. Designed for both exploratory analysis and stakeholder 
presentation, this utility helps users better understand model ranking performance 
across the population.

### Lift Chart Example 1: Subplot Layout

In this first Lift Chart example, we evaluate and compare the ranking performance of two classification models: Logistic Regression and Random Forest Classifier trained on the synthetic dataset from the [Binary Classification Models section](https://lshpaner.github.io/model_metrics_docs/performance_assessment.html#binary-classification). The chart displays Lift curves for both models in a two-column subplot layout (`n_cols=2, n_rows=1`), enabling side-by-side comparison of how effectively each model prioritizes positive cases.

Each plot shows the model’s Lift across increasing portions of the test set, with a red dashed line at Lift = 1 (styled via `linestyle_kwgs`) indicating the baseline (random performance). Curves above this line reflect the model’s ability to identify more positives than would be expected by chance. The Random Forest produces a steeper initial lift, demonstrating greater concentration of positive cases near the top-ranked predictions.

The `show_lift_chart` function allows for rich customization, including plot dimensions, axis font sizes, and curve styling. In this example, we set the line widths for both models and saved the plots in both PNG and SVG formats for further reporting or documentation.

In [None]:
from model_metrics import show_lift_chart

# Lift curves for both synthetic-data models, one subplot per model.
show_lift_chart(
    # Models and held-out evaluation data
    model=[model1, model2],
    X=X_test,
    y=y_test,
    model_title=model_titles,
    # Layout: separate subplots instead of a shared overlay
    subplots=True,
    overlay=False,
    figsize=(12, 6),
    label_fontsize=16,
    tick_fontsize=14,
    # Styling: per-model curves (keyed by title) and the dashed baseline
    curve_kwgs={
        "Logistic Regression": {"color": "blue", "linewidth": 2},
        "Random Forest": {"color": "black", "linewidth": 2},
    },
    linestyle_kwgs={"color": "red", "linestyle": "--", "linewidth": 2},
    # Output: save the figure to the PNG and SVG image directories
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)

### Lift Chart Example 2: Overlay

This example overlays Lift curves from two classification models: Logistic Regression and Random Forest Classifier on a single plot for direct visual comparison. Both models were trained on the same synthetic dataset from the Binary Classification Models section, and their lift performance is evaluated on the shared test set.

The Lift curve shows how many more positive outcomes are captured by the model at each quantile compared to a random baseline. A horizontal dashed red line at Lift = 1 represents random selection; curves above this line indicate effective ranking of positive cases. Overlaying curves makes it easier to assess which model better concentrates true positives near the top of the prediction list.

Using the `overlay=True` option, the `show_lift_chart` function generates a clean, unified plot. Each curve is styled with `linewidth=2` for clarity, and all axis elements and tick marks are sized for presentation-quality output. This layout is particularly helpful for slide decks, performance reports, or model selection discussions.

In [None]:
from model_metrics import show_lift_chart

# Lift curves for both models drawn on one shared set of axes.
show_lift_chart(
    # Models and held-out evaluation data
    model=[model1, model2],
    X=X_test,
    y=y_test,
    model_title=["Logistic Regression", "Random Forest"],
    # Layout: single overlaid plot
    overlay=True,
    figsize=(14, 10),
    label_fontsize=16,
    tick_fontsize=14,
    # Styling: per-model curves (keyed by title) and the dashed baseline
    curve_kwgs={
        "Logistic Regression": {"color": "blue", "linewidth": 2},
        "Random Forest": {"color": "black", "linewidth": 2},
    },
    linestyle_kwgs={"color": "red", "linestyle": "--", "linewidth": 2},
    # Output: save the figure to the PNG and SVG image directories
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)

## Gain Charts

This section explores how to evaluate the **cumulative performance** of classification models in identifying positive outcomes using **gain charts**. These charts are especially effective at showing the model’s ability to concentrate the correct (positive) predictions in the top-ranked portion of the dataset. Using the same Logistic Regression and Random Forest Classifier models trained on the synthetic dataset introduced in the [Binary Classification Models section](https://lshpaner.github.io/model_metrics_docs/performance_assessment.html#binary-classification), we demonstrate how to plot and compare Gain Curves across models.


A **gain chart** shows the cumulative percentage of actual positive cases captured
as we move through the population sorted by predicted probability. Unlike the Lift Chart,
which displays the ratio of model performance over baseline, the Gain Chart directly shows
the percentage of positives captured, providing a more intuitive sense of how effective a model is
at identifying positives early in the ranked list.

### Gain Chart Example 1: Subplot Layout

In this first Gain Chart example, we compare the cumulative gain performance of two classification models:
Logistic Regression and Random Forest Classifier. This visualization showcases their ability to identify positive instances across different percentiles
of the ranked test data.

Each subplot presents the **cumulative gain** achieved as a function of the percentage of the sample, sorted 
by descending predicted probability. The red dashed line represents the **baseline (random gain)**. A model 
that identifies a high proportion of positive cases in the early part of the ranking will have a steeper and 
higher curve. In this example, the Random Forest model outpaces Logistic Regression, indicating 
better early identification of positives.

The `show_gain_chart` function allows flexible styling and layout control. This example uses a subplot 
configuration (`n_cols=2, n_rows=1`), customized line widths and colors, and includes saving the figure 
for documentation or stakeholder presentations.

In [None]:
from model_metrics import show_gain_chart

# Cumulative gain curves, one subplot per model, without Gini in the legend.
show_gain_chart(
    # Models and held-out evaluation data
    model=[model1, model2],
    X=X_test,
    y=y_test,
    model_title=["Logistic Regression", "Random Forest"],
    # Layout
    subplots=True,
    figsize=(12, 6),
    label_fontsize=16,
    tick_fontsize=14,
    # Styling: per-model curves (keyed by title) and the dashed baseline
    curve_kwgs={
        "Logistic Regression": {"color": "blue", "linewidth": 2},
        "Random Forest": {"color": "black", "linewidth": 2},
    },
    linestyle_kwgs={"color": "red", "linestyle": "--"},
    # Legend content
    show_gini=False,
    # Output: save the figure to the PNG and SVG image directories
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)

### Gain Chart Example 2: Displaying Gini Coefficients

This example demonstrates how to include Gini coefficients directly in the gain chart legends using 
the `show_gini=True` parameter. The Gini coefficient is a summary statistic derived from the area 
under the gain curve (AUGC), calculated as 2 × AUGC - 1, and ranges from 0 to 1 where higher values 
indicate better model discrimination.

Both models: Logistic Regression and Random Forest Classifier were trained on the synthetic data from the 
[Binary Classification Models section](https://lshpaner.github.io/model_metrics_docs/performance_assessment.html#binary-classification). 
By enabling `show_gini=True` (and optionally setting `decimal_places=3`), each model's legend 
entry automatically displays its Gini coefficient, providing both visual and quantitative performance 
comparison in a single view.

The Gini coefficient complements the visual gain curve by offering a single number that summarizes 
discriminative power. In this example, both the curve shape and the Gini value help identify which 
model better concentrates positive cases at the top of the predicted ranking. This is particularly 
useful in presentations, model selection discussions, and performance reporting where stakeholders 
need both graphical intuition and numeric metrics.

In [None]:
from model_metrics import show_gain_chart

# Same subplot gain chart, now with Gini coefficients shown in the legends.
show_gain_chart(
    # Models and held-out evaluation data
    model=[model1, model2],
    X=X_test,
    y=y_test,
    model_title=["Logistic Regression", "Random Forest"],
    # Legend content: Gini coefficient reported to three decimals
    show_gini=True,
    decimal_places=3,
    # Layout
    subplots=True,
    figsize=(12, 6),
    label_fontsize=16,
    tick_fontsize=14,
    # Styling: per-model curves (keyed by title) and the dashed baseline
    curve_kwgs={
        "Logistic Regression": {"color": "blue", "linewidth": 2},
        "Random Forest": {"color": "black", "linewidth": 2},
    },
    linestyle_kwgs={"color": "red", "linestyle": "--"},
    # Output: save the figure to the PNG and SVG image directories
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)

### Gain Chart Example 3: Overlay

This example overlays Gain curves from two classification models: Logistic
Regression and Random Forest Classifier on a single plot to enable direct
visual comparison of their cumulative gain performance. 

The Gain curve shows the cumulative proportion of true positives captured as 
you move through the population, ranked by predicted probability. A diagonal 
baseline line from (0, 0) to (1, 1) indicates the expected performance of a 
random model. Curves that rise above this line demonstrate superior model 
ability to concentrate positive cases near the top of the ranked list.

By setting `overlay=True`, the `show_gain_chart` function produces a single, 
easy-to-read plot containing both models' gain curves. Each curve is styled 
with `linewidth=2` for clear visibility. Overlay layouts are ideal for model 
selection discussions, presentations, and performance dashboards.


In [None]:
from model_metrics import show_gain_chart

# Gain curves for both models drawn on one shared set of axes.
show_gain_chart(
    # Models and held-out evaluation data
    model=[model1, model2],
    X=X_test,
    y=y_test,
    model_title=["Logistic Regression", "Random Forest"],
    # Layout: single overlaid plot
    overlay=True,
    figsize=(14, 10),
    label_fontsize=16,
    tick_fontsize=14,
    # Styling: per-model curves (keyed by title) and the dashed baseline
    curve_kwgs={
        "Logistic Regression": {"color": "blue", "linewidth": 2},
        "Random Forest": {"color": "black", "linewidth": 2},
    },
    linestyle_kwgs={"color": "red", "linestyle": "--", "linewidth": 2},
    # Output: save the figure to the PNG and SVG image directories
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)

## ROC AUC Curves

The `show_roc_curve` function provides flexible and highly customizable plotting of ROC curves for binary classification models. It supports overlays, subplot layouts, and subgroup visualizations, while also allowing export options and styling hooks for publication-ready output.

Using the Logistic Regression and 
Random Forest Classifier models trained on the synthetic dataset introduced in the [Binary Classification Models section](https://lshpaner.github.io/model_metrics_docs/performance_assessment.html#binary-classification), we generate ROC curves to visualize their discriminatory power.

ROC AUC (Receiver Operating Characteristic Area Under the Curve) provides a 
single scalar value representing a model's ability to distinguish between 
positive and negative classes, with a value of 1 indicating perfect classification 
and 0.5 representing random guessing. The curves are plotted by varying the 
classification threshold and calculating the true positive rate (sensitivity) 
against the false positive rate (1-specificity). This makes ROC AUC particularly 
useful for comparing models like Logistic Regression, which relies on linear 
decision boundaries, and Random Forest Classifier, which leverages ensemble 
decision trees, especially when class imbalances or threshold sensitivity are 
concerns. The `show_roc_curve` function simplifies this process, enabling 
users to visualize and compare these curves effectively, setting the stage for 
detailed performance analysis in subsequent examples.

The `show_roc_curve` function provides a flexible and powerful way to visualize 
the performance of binary classification models using Receiver Operating Characteristic 
(ROC) curves. Whether you're comparing multiple models, evaluating subgroup fairness, 
or preparing publication-ready plots, this function allows full control over layout,
styling, and annotations. It supports single and multiple model inputs, optional overlay 
or subplot layouts, and group-wise comparisons via a categorical feature. Additional options 
allow custom axis labels, AUC precision, curve styling, and export to PNG/SVG. 
Designed to be both user-friendly and highly configurable, `show_roc_curve` 
is a practical tool for model evaluation and stakeholder communication.

### ROC AUC Example 1: Subplot Layout

In this first ROC AUC evaluation example, we plot the ROC curves for two 
models: Logistic Regression and Random Forest Classifier. The curves are displayed side by side 
using a subplot layout (`n_cols=2, n_rows=1`), with the Logistic Regression curve 
in blue and the Random Forest curve in black for clear differentiation. 
A red dashed line represents the random guessing baseline. This example 
demonstrates how the `show_roc_curve` function enables straightforward 
visualization of model performance, with options to customize colors, 
add a grid, and save the plot for reporting purposes.

In [None]:
from model_metrics import show_roc_curve

# ROC curves for both models, side by side in a 1 x 2 subplot grid.
show_roc_curve(
    # Models and held-out evaluation data
    model=[model1, model2],
    X=X_test,
    y=y_test,
    model_title=model_titles,
    decimal_places=2,
    # Layout: one row, two columns
    subplots=True,
    n_rows=1,
    n_cols=2,
    figsize=(12, 6),
    label_fontsize=16,
    tick_fontsize=14,
    # Styling: per-model curves (keyed by title) and the dashed baseline
    curve_kwgs={
        "Logistic Regression": {"color": "blue", "linewidth": 2},
        "Random Forest": {"color": "black", "linewidth": 2},
    },
    linestyle_kwgs={"color": "red", "linestyle": "--"},
    # Output: save the figure to the PNG and SVG image directories
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)

### ROC AUC Example 2: Overlay

In this second ROC AUC evaluation example, we focus on overlaying the results of 
two models: Logistic Regression and Random Forest Classifier onto a single plot. Using the `show_roc_curve` function with the `overlay=True` parameter, the ROC curves for both models are 
displayed together, with Logistic Regression in blue and Random Forest in black, 
both with a `linewidth=2`. A red dashed line serves as the random guessing 
baseline, and the plot includes a custom title for clarity.

In [None]:
from model_metrics import show_roc_curve

# ROC curves for both models overlaid on one plot with a custom title.
show_roc_curve(
    # Models and held-out evaluation data
    model=[model1, model2],
    X=X_test,
    y=y_test,
    model_title=model_titles,
    decimal_places=2,
    # Layout: single overlaid plot
    overlay=True,
    title="ROC Curves: Logistic Regression and Random Forest",
    figsize=(14, 10),
    label_fontsize=16,
    tick_fontsize=14,
    # Styling: per-model curves (keyed by title) and the dashed baseline
    curve_kwgs={
        "Logistic Regression": {"color": "blue", "linewidth": 2},
        "Random Forest": {"color": "black", "linewidth": 2},
    },
    linestyle_kwgs={"color": "red", "linestyle": "--"},
    # Output: save the figure to the PNG and SVG image directories
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)

### ROC AUC Example 3: DeLong's Test

In this third ROC AUC evaluation example, we demonstrate how to statistically
compare the performance of two correlated models using Hanley & McNeil's
parametric AUC comparison (an approximation of DeLong's test). We utilize the
Logistic Regression and Random Forest Classifier models. By passing their 
predicted probabilities to the `delong` parameter of the `show_roc_curve` function, we can assess whether the difference in AUC between the two models is statistically significant. This is particularly useful when models are evaluated on the same dataset, as it accounts for the inherent correlation in their predictions.   

In [None]:
from model_metrics import show_roc_curve

# ROC curves plus a statistical comparison of the two models' AUCs.
show_roc_curve(
    model=[model1, model2],
    X=X_test,
    y=y_test,
    model_title=model_titles,
    decimal_places=2,
    # Positive-class probabilities from each model on the shared test set
    delong=[
        model1.predict_proba(X_test)[:, 1],
        model2.predict_proba(X_test)[:, 1],
    ],
)

### ROC AUC Example 4: Hanley-McNeil AUC Test

In [None]:
from model_metrics import hanley_mcneil_auc_test

# Standalone AUC comparison of the two models on the same test labels.
hanley_mcneil_auc_test(
    y_test,
    # Positive-class probabilities for Logistic Regression, then Random Forest
    model1.predict_proba(X_test)[:, 1],
    model2.predict_proba(X_test)[:, 1],
    model_names=["Logistic Regression", "Random Forest"],
    decimal_places=6,
    verbose=True,
)

### ROC AUC Example 5: Operating Point Using Youden's J

In this fifth ROC AUC evaluation example, we again use the well-known
*Adult Income* dataset, a widely adopted benchmark for binary classification.
Its combination of categorical and numerical predictors makes it well suited for
both performance evaluation and interpretability analyses.

To train and evaluate the model, we rely on the `model_tuner` library.
[Click here to view the corresponding codebase for this workflow](https://lshpaner.github.io/model_metrics_docs/model_training.html#adult-income-training).

The objective of this example is to **identify and visualize an optimal operating point**
on the ROC curve using **Youden's J statistic**, defined as:

$$
   J = \text{TPR} - \text{FPR}
$$

This criterion selects the threshold that maximizes the vertical distance between
the ROC curve and the random-guess diagonal, providing a balanced tradeoff between
sensitivity and specificity.

The `show_roc_curve` function supports this directly via the
`show_operating_point` and `operating_point_method` parameters.

In the example below, we compute the ROC curves for the Logistic Regression and
Random Forest models and annotate the optimal operating point determined by Youden's J statistic.

In [None]:
from model_metrics import show_roc_curve

# ROC subplots with the Youden's J operating point marked on each curve.
show_roc_curve(
    # Models and held-out evaluation data
    model=[model1, model2],
    X=X_test,
    y=y_test,
    model_title=model_titles,
    decimal_places=2,
    # Operating point: threshold chosen by Youden's J, drawn as a red dot
    show_operating_point=True,
    operating_point_method="youden",
    operating_point_kwgs={
        "marker": "o",
        "color": "red",
        "s": 100,
    },
    # Layout
    subplots=True,
    figsize=(12, 6),
    label_fontsize=16,
    tick_fontsize=14,
    # Styling: per-model curves (keyed by title) and the dashed baseline
    curve_kwgs={
        "Logistic Regression": {"color": "blue", "linewidth": 2},
        "Random Forest": {"color": "black", "linewidth": 2},
    },
    linestyle_kwgs={"color": "red", "linestyle": "--"},
    # Output: save the figure to the PNG and SVG image directories
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)

### ROC AUC Example 6: Closest to Top Left

In this example, we demonstrate an alternative method for identifying an optimal operating point
on the ROC curve using the **closest-to-top-left** criterion. Like Youden's J statistic, this
approach seeks a balanced threshold, but instead of maximizing the vertical distance from the
diagonal, it minimizes the Euclidean distance to the ideal point (0, 1) in ROC space.

The closest-to-top-left method finds the threshold that minimizes:

$$
   d = \sqrt{(1 - \text{TPR})^2 + \text{FPR}^2}
$$

This geometric criterion is particularly useful when you want to prioritize proximity to perfect
classification (top-left corner) rather than maximizing the difference between true positive
and false positive rates.

In this ROC AUC evaluation example, we focus on the results of 
two models: Logistic Regression and Random Forest Classifier, trained on the synthetic dataset from the [Binary Classification Models section](https://lshpaner.github.io/model_metrics_docs/performance_assessment.html#binary-classification).

The ``show_roc_curve`` function supports this method through the ``operating_point_method`` parameter
by setting it to ``"closest_topleft"``. In the example below, we compute the ROC curves for the
Logistic Regression and Random Forest models and annotate the optimal operating point using the
closest-to-top-left criterion.

In [None]:
from model_metrics import show_roc_curve

# ROC subplots with the closest-to-top-left operating point marked.
show_roc_curve(
    # Models and held-out evaluation data
    model=[model1, model2],
    X=X_test,
    y=y_test,
    model_title=model_titles,
    decimal_places=2,
    # Operating point: closest-to-top-left criterion, drawn as a red dot
    show_operating_point=True,
    operating_point_method="closest_topleft",
    operating_point_kwgs={
        "marker": "o",
        "color": "red",
        "s": 100,
    },
    # Layout
    subplots=True,
    figsize=(12, 6),
    label_fontsize=16,
    tick_fontsize=14,
    # Styling: per-model curves (keyed by title) and the dashed baseline
    curve_kwgs={
        "Logistic Regression": {"color": "blue", "linewidth": 2},
        "Random Forest": {"color": "black", "linewidth": 2},
    },
    linestyle_kwgs={"color": "red", "linestyle": "--"},
    # Output: save the figure to the PNG and SVG image directories
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)

### ROC AUC Example 7: by Category

In this seventh ROC AUC evaluation example, we utilize the well-known 
*Adult Income* dataset, a widely used benchmark for binary classification 
tasks. Its rich mix of categorical and numerical features makes it particularly 
suitable for analyzing model performance across different subgroups.

To build and evaluate our models, we use the `model_tuner` library. 

[Click here to view the corresponding codebase for this workflow](https://lshpaner.github.io/model_metrics_docs/model_training.html).

The objective here is to assess ROC AUC scores not just overall, but 
**across each category of a selected feature**, such as *occupation*, 
*education*, *marital-status*, or *race*. This approach enables deeper insight into how 
performance varies by subgroup, which is particularly important for fairness, 
bias detection, and subgroup-level interpretability.

The `show_roc_curve` function supports this analysis through the 
`group_category` parameter. 

For example, by passing `group_category=X_test_2_ai["race"]`, 
you can generate a separate ROC curve for each unique racial group in the dataset:

In [None]:
from model_metrics import show_roc_curve

# One ROC curve per category of the "race" column (Adult Income data).
show_roc_curve(
    model=model_rf["model"].estimator,
    X=X_test_ai,
    y=y_test_ai,
    # Grouping column comes from X_test_2_ai, not from the feature matrix X
    group_category=X_test_2_ai["race"],
    model_title="Random Forest Classifier",
    decimal_places=2,
)

## Precision-Recall Curves

This section demonstrates how to evaluate the performance of binary classification 
models using Precision-Recall (PR) curves, a critical visualization for understanding 
model behavior in the presence of class imbalance. Using the Logistic Regression 
and Random Forest Classifier models trained on the 
synthetic dataset from the previous [(Binary Classification Models section)](https://lshpaner.github.io/model_metrics_docs/performance_assessment.html#binary-classification), 
we generate PR curves to examine how well each model identifies true positives while limiting false positives.

Precision-Recall curves focus on the trade-off between **precision** 
(positive predictive value) and **recall** (sensitivity) across different 
classification thresholds. This is particularly important when the positive 
class is rare, as is common in fraud detection, disease diagnosis, or adverse 
event prediction, because ROC AUC can overstate performance under imbalance. 
Unlike the ROC curve, the PR curve is sensitive to the proportion of positive 
examples and gives a clearer picture of how well a model performs where it 
matters most: in identifying the positive class.

The **area under the Precision-Recall curve**, also known as Average Precision 
(AP), summarizes model performance across thresholds. A model that maintains high 
precision as recall increases is generally more desirable, especially in settings 
where false positives have a high cost. This makes the PR curve a complementary 
and sometimes more informative tool than ROC AUC in skewed classification scenarios.

### Precision-Recall Example 1: Subplot Layout

In this first Precision-Recall evaluation example, we plot the PR curves for two 
models: Logistic Regression and Random Forest Classifier.

The curves are arranged side by side using a subplot layout (``n_cols=2, n_rows=1``), 
with the Logistic Regression curve rendered in blue and the Random Forest curve 
in black to distinguish between models. A gray dashed line indicates the baseline 
precision, equal to the prevalence of the positive class in the dataset.

This example illustrates how the ``show_pr_curve`` function makes it easy to 
visualize and compare model performance when dealing with class imbalance. It 
also demonstrates layout flexibility and customization options, including gridlines, 
label styling, and export functionality, making it suitable for both exploratory 
analysis and final reporting.

In [None]:
from model_metrics import show_pr_curve

# Precision-Recall curves for both models in a 1 x 2 subplot grid.
show_pr_curve(
    # Models and held-out evaluation data
    model=[model1, model2],
    X=X_test,
    y=y_test,
    model_title=model_titles,
    decimal_places=2,
    # Layout: one row, two columns
    subplots=True,
    n_rows=1,
    n_cols=2,
    figsize=(12, 6),
    label_fontsize=16,
    tick_fontsize=14,
    # Styling: per-model curves keyed by title
    curve_kwgs={
        "Logistic Regression": {"color": "blue", "linewidth": 2},
        "Random Forest": {"color": "black", "linewidth": 2},
    },
    # Output: save the figure to the PNG and SVG image directories
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)

### Precision-Recall Example 2: Overlay

In this second Precision-Recall evaluation example, we focus on overlaying the 
results of two models: Logistic Regression and Random Forest Classifier onto a single plot. 
Using the `show_pr_curve` function with the `overlay=True` parameter, the Precision-Recall curves for 
both models are displayed together, with Logistic Regression in blue and Random 
Forest in black, both with a `linewidth=2`. The plot includes a custom title 
for clarity.

In [None]:
from model_metrics import show_pr_curve

# Precision-Recall curves for both models overlaid on one plot.
show_pr_curve(
    # Models and held-out evaluation data
    model=[model1, model2],
    X=X_test,
    y=y_test,
    model_title=model_titles,
    decimal_places=2,
    # Layout: single overlaid plot (grid arguments kept from the original call)
    overlay=True,
    n_cols=2,
    n_rows=1,
    figsize=(14, 10),
    label_fontsize=16,
    tick_fontsize=14,
    # The title previously read "ROC Curves: ..." (copied from the ROC
    # example); this figure shows Precision-Recall curves.
    title="Precision-Recall Curves: Logistic Regression and Random Forest",
    # Styling: per-model curves keyed by title
    curve_kwgs={
        "Logistic Regression": {"color": "blue", "linewidth": 2},
        "Random Forest": {"color": "black", "linewidth": 2},
    },
    # Output: save the figure to the PNG and SVG image directories
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)

### Precision-Recall Example 3: Categorical

In this third Precision-Recall evaluation example, we utilize the well-known 
*Adult Income* dataset, a widely used benchmark for binary classification 
tasks. Its rich mix of categorical and numerical features makes it particularly 
suitable for analyzing model performance across different subgroups.

To build and evaluate our models, we use the `model_tuner` library. 

[Click here to view the corresponding codebase for this workflow](https://lshpaner.github.io/model_metrics_docs/model_training.html).


The objective here is to assess Precision-Recall performance not just overall, but 
**across each category of a selected feature**, such as *occupation*, 
*education*, *marital-status*, or *race*. This approach enables deeper insight into how 
performance varies by subgroup, which is particularly important for fairness, 
bias detection, and subgroup-level interpretability.

The `show_pr_curve` function supports this analysis through the 
`group_category` parameter. 

For example, by passing `group_category=X_test_2_ai["race"]`, 
you can generate a separate Precision-Recall curve for each unique racial group in the dataset:

In [None]:
from model_metrics import show_pr_curve

# One Precision-Recall curve per category of the "race" column.
show_pr_curve(
    model=model_rf["model"].estimator,
    X=X_test_ai,
    y=y_test_ai,
    # Grouping column comes from X_test_2_ai, not from the feature matrix X
    group_category=X_test_2_ai["race"],
    model_title="Random Forest Classifier",
    decimal_places=2,
)

## Confusion Matrix Evaluation

This section introduces the `show_confusion_matrix` function, which provides a 
flexible, styled interface for generating and visualizing confusion matrices 
across one or more classification models. It supports advanced features like 
threshold overrides, subgroup labeling, classification report display, and fully 
customizable plot aesthetics including subplot layouts.

The confusion matrix is a fundamental diagnostic tool for classification models, 
displaying the counts of true positives, true negatives, false positives, and 
false negatives. This function goes beyond standard implementations by allowing 
for custom thresholds (globally or per model), label annotation (e.g., TP, FP, etc.), 
plot exporting, colorbar toggling, and subplot visualization.

This is especially useful when comparing multiple models side-by-side or needing 
publication-ready confusion matrices for stakeholders.

### Confusion Matrix Example 1: Threshold=0.5

In this first confusion matrix evaluation example, we show the
results of two models: Logistic Regression and Random Forest Classifier, trained
on the synthetic dataset from the [Binary Classification Models section](https://lshpaner.github.io/model_metrics_docs/performance_assessment.html#binary-classification), side by side in a single figure.

In [None]:
from model_metrics import show_confusion_matrix

# Confusion matrices for both models, side by side, without a colorbar.
show_confusion_matrix(
    # Models and held-out evaluation data
    model=[model1, model2],
    X=X_test,
    y=y_test,
    model_title=model_titles,
    # Layout: one row, two columns
    subplots=True,
    n_rows=1,
    n_cols=2,
    figsize=(6, 6),
    text_wrap=40,
    show_colorbar=False,
    # Font sizes for axis labels, ticks, and in-cell text
    label_fontsize=16,
    tick_fontsize=12,
    inner_fontsize=14,
    # Output: save the figure to the PNG and SVG image directories
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)

### Confusion Matrix Example 2: Classification Report

In [None]:
from model_metrics import show_confusion_matrix

# Confusion matrices with a colorbar, plus the classification report.
show_confusion_matrix(
    # Models and held-out evaluation data
    model=[model1, model2],
    X=X_test,
    y=y_test,
    model_title=model_titles,
    # Request the classification report alongside the matrices
    class_report=True,
    # Layout: one row, two columns (default figure size)
    subplots=True,
    n_rows=1,
    n_cols=2,
    text_wrap=40,
    # Appearance
    cmap="viridis",
    show_colorbar=True,
    label_fontsize=16,
    tick_fontsize=14,
    inner_fontsize=14,
    # Output: save the figure to the PNG and SVG image directories
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)

### Confusion Matrix Example 3: Threshold = 0.37

In this third confusion matrix evaluation example using the synthetic dataset 
from the [Binary Classification Models section](https://lshpaner.github.io/model_metrics_docs/performance_assessment.html#binary-classification), we apply 
a custom classification threshold of 0.37 using the `custom_threshold` parameter. 
This overrides the default threshold of 0.5 and enables us to inspect how the 
confusion matrices shift when a more lenient decision boundary is applied. Refer 
to the section on [threshold selection logic](https://lshpaner.github.io/model_metrics_docs/conceptual_notes.html#threshold-selection-logic)
for caveats on choosing the right threshold.

This is especially useful in imbalanced classification problems or cost-sensitive 
environments where the trade-off between precision and recall must be adjusted. 
By lowering the threshold, we increase the number of positive predictions, 
which can improve recall but may come at the cost of more false positives.

The output matrices for both models: Logistic Regression and Random Forest are shown 
side by side in a subplot layout for easy visual comparison.

In [None]:
from model_metrics import show_confusion_matrix

# Confusion matrices at a custom decision threshold of 0.37.
show_confusion_matrix(
    # Models and held-out evaluation data
    model=[model1, model2],
    X=X_test,
    y=y_test,
    model_title=model_titles,
    # Override the default 0.5 classification threshold
    custom_threshold=0.37,
    # Layout: one row, two columns
    subplots=True,
    n_rows=1,
    n_cols=2,
    figsize=(6, 6),
    text_wrap=40,
    # Font sizes for axis labels, ticks, and in-cell text
    label_fontsize=16,
    tick_fontsize=14,
    inner_fontsize=14,
    # Output: save the figure to the PNG and SVG image directories
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)

## Calibration Curves

This section focuses on calibration curves, a diagnostic tool that compares 
predicted probabilities to actual outcomes, helping evaluate how well a model's 
predicted confidence aligns with observed frequencies. Using models like Logistic 
Regression or Random Forest on the synthetic dataset from the previous 
[(Binary Classification Models)](https://lshpaner.github.io/model_metrics_docs/performance_assessment.html#binary-classification) section, we generate 
calibration curves to assess the reliability of model probabilities.

Calibration is especially important in domains where probability outputs inform 
downstream decisions, such as healthcare, finance, and risk management. A 
well-calibrated model not only predicts the correct class but also outputs 
meaningful probabilities, for example, when a model predicts a 0.7 probability, 
we expect roughly 70% of such predictions to be correct.

The `show_calibration_curve` function simplifies this process by allowing users to 
visualize calibration performance across models or subgroups. The plots show the 
mean predicted probabilities against the actual observed fractions of positive 
cases, with an optional reference line representing perfect calibration. 
Additional features include support for overlay or subplot layouts, subgroup 
analysis by categorical features, and optional Brier score display, a scalar 
measure of calibration quality.

The function offers full control over styling, figure layout, axis labels, and 
output format, making it easy to generate both exploratory and publication-ready 
plots.

### Calibration Curve Example 1: Subplots

In [None]:
from model_metrics import show_calibration_curve

# Calibration curves for the first two Adult Income models, one subplot each.
show_calibration_curve(
    model=pipelines_or_models[:2],
    X=X_test_ai,
    y=y_test_ai,
    # Titles come from the Adult Income title list that is paired with
    # `pipelines_or_models` elsewhere in this notebook. The synthetic-data
    # `model_titles` list was used here before, which could label these
    # models with the wrong names.
    model_title=model_titles_ai[:2],
    text_wrap=50,
    bins=10,
    show_brier_score=True,
    figsize=(12, 6),
    subplots=True,
    linestyle_kwgs={"color": "black"},
)

### Calibration Curve Example 2: Overlay

This example also uses the well-known *Adult Income* dataset, a widely used 
benchmark for binary classification tasks. Its rich mix of categorical and 
numerical features makes it particularly suitable for analyzing model performance 
across different subgroups.

To train and evaluate the model, we rely on the `model_tuner` library.

[Click here to view the corresponding codebase for this workflow](https://lshpaner.github.io/model_metrics_docs/model_training.html#adult-income-training).

This example demonstrates how to overlay calibration curves from multiple classification 
models in a single plot. Overlaying allows for direct visual comparison of how predicted 
probabilities from each model align with actual outcomes on the same axes.

The diagonal dashed line represents perfect calibration, and Brier scores are included 
in the legend for each model, providing a quantitative measure of calibration accuracy.

By setting `overlay=True`, the function combines all model curves into one figure, 
making it easier to evaluate relative performance without splitting across subplots.

In [None]:
from model_metrics import show_calibration_curve

# Overlay the calibration curves of every Adult Income model on one figure
# (`overlay=True`) so they can be compared on the same axes.
black_line_style = {"color": "black"}

show_calibration_curve(
    # Models, their display titles, and the evaluation data
    model=pipelines_or_models,
    model_title=model_titles_ai,
    X=X_test_ai,
    y=y_test_ai,
    # Layout
    overlay=True,
    figsize=(14, 10),
    bins=10,
    linestyle_kwgs=black_line_style,
    # Brier score reporting
    show_brier_score=True,
    brier_decimals=4,
)

### Calibration Curve Example 3: by Category 

In [None]:
from model_metrics import show_calibration_curve

# Calibration of the Random Forest model with `group_category` set to the
# "race" column of `X_test_2_ai`. Saving is enabled (`save_plot=True`) using
# the PNG and SVG image directories.
black_line_style = {"color": "black"}
# One {"linewidth": 2} style entry keyed by each Adult Income model title.
curve_style_by_title = {name: {"linewidth": 2} for name in model_titles_ai}

show_calibration_curve(
    # Model and evaluation data
    model=model_rf["model"].estimator,
    model_title="Random Forest Classifier",
    X=X_test_ai,
    y=y_test_ai,
    group_category=X_test_2_ai["race"],
    # Calibration settings
    bins=10,
    show_brier_score=True,
    brier_decimals=4,
    # Figure appearance
    figsize=(14, 10),
    tick_fontsize=14,
    label_fontsize=16,
    linestyle_kwgs=black_line_style,
    curve_kwgs=curve_style_by_title,
    # Output
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)

## Threshold Metric Curves

This section introduces a powerful utility for exploring how classification 
thresholds affect key performance metrics, including **Precision**, **Recall**, 
**F1 Score**, and **Specificity**. Rather than fixing a threshold (commonly at 0.5),
this function allows users to visualize **trade-offs across the full range of 
possible thresholds**, making it especially useful when optimizing for use-case-specific 
goals such as maximizing recall or achieving a minimum precision.

Using the Random Forest Classifier model trained on the 
[adult income dataset](https://lshpaner.github.io/model_metrics_docs/model_training.html), 
this tool helps users answer practical questions like:

- *What threshold achieves at least 85% precision?*
- *Where does F1 score peak for this model?*
- *How does specificity behave as the threshold increases?*

The `plot_threshold_metrics` function supports optional threshold lookups via 
`lookup_metric` and `lookup_value`; when both are set, it prints the closest 
threshold that meets your constraint. Plots can be customized with colors, 
gridlines, line styles, wrapped titles, and export options.

### Threshold Curves Example 1: Threshold=0.5

This example demonstrates how to plot threshold-dependent classification metrics.

The `plot_threshold_metrics` function visualizes how Precision, Recall, F1 Score, 
and Specificity change as the decision threshold varies. In this configuration, 
the baseline threshold line at 0.5 is enabled (`baseline_thresh=True`), 
and the line styling is customized via `curve_kwgs`. Font sizes and wrapping options 
are adjusted for improved clarity in presentation-ready plots.


In [None]:
from model_metrics import plot_threshold_metrics

# Threshold-metric curves for the Random Forest model with the 0.5 baseline
# threshold line enabled (`baseline_thresh=True`).
baseline_style = dict(color="purple", linestyle="--", linewidth=2)
curve_style = dict(linestyle="-", linewidth=2)

plot_threshold_metrics(
    # Model and evaluation data
    model=model_rf["model"].estimator,
    X_test=X_test_ai,
    y_test=y_test_ai,
    # Baseline threshold line and metric-curve styling
    baseline_thresh=True,
    baseline_kwgs=baseline_style,
    curve_kwgs=curve_style,
    # Figure appearance
    figsize=(14, 10),
    tick_fontsize=14,
    label_fontsize=16,
    text_wrap=40,
    # Output
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)

### Threshold Curves Example 2: Targeted Metric Lookup

This example expands on threshold-based classification metric visualization using 
a targeted lookup scenario. Suppose a clinical stakeholder or domain expert has 
determined (based on prior research, cost-benefit considerations, or operational
constraints) that a precision of approximately `0.879` is ideal for downstream 
decision-making (e.g., minimizing false positives in a healthcare setting).

The `plot_threshold_metrics` function accepts the optional arguments `lookup_metric` 
and `lookup_value` to help identify the threshold that best aligns with this target. 
When these are set, the function automatically locates and highlights the threshold 
that most closely achieves the desired metric value, offering transparency and 
guidance for threshold tuning.

In [None]:
from model_metrics import plot_threshold_metrics

# Threshold-metric curves for the Random Forest model with a targeted lookup:
# find the threshold whose precision is closest to 0.879. The 0.5 baseline
# line is turned off so only the lookup is requested.
lookup_style = dict(color="red", linestyle="--", linewidth=2)
curve_style = dict(linestyle="-", linewidth=2)

plot_threshold_metrics(
    # Model and evaluation data
    model=model_rf["model"].estimator,
    X_test=X_test_ai,
    y_test=y_test_ai,
    # Metric lookup target and its styling
    lookup_metric="precision",
    lookup_value=0.879,
    lookup_kwgs=lookup_style,
    baseline_thresh=False,
    curve_kwgs=curve_style,
    # Figure appearance
    figsize=(14, 10),
    tick_fontsize=14,
    label_fontsize=16,
    text_wrap=40,
    # Output
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)

### Threshold Curves Example 3: Model-Specific Threshold

In many production settings, a classifier is deployed with a tuned decision threshold different from the default 0.5 (e.g., to balance costs of false positives vs. false negatives).
This example shows how to **explicitly pass a model's chosen threshold** to be drawn as a vertical guide on the plot using `model_threshold=...`.
You can do this whether you're providing a model/X pair or pre-computed probabilities via `y_prob`. Below we show the latter.

In [None]:
# Positive-class probabilities from the Random Forest estimator: take column
# index 1 of the `predict_proba` output for the Adult Income test features.
rf_estimator = model_rf["model"].estimator
y_prob_rf = rf_estimator.predict_proba(X_test_ai)[:, 1]

In [None]:
# Map each model title to its stored threshold. `.threshold` is read as a
# mapping and its first value is taken.
# NOTE(review): presumably keyed by scoring metric - confirm.
model_thresholds = {
    title: next(iter(loaded["model"].threshold.values()))
    for title, loaded in (
        ("Logistic Regression", model_lr),
        ("Decision Tree Classifier", model_dt),
        ("Random Forest Classifier", model_rf),
    )
}

In [None]:
from model_metrics import plot_threshold_metrics

# Plot from precomputed probabilities (no model/X pair) while still marking
# the Random Forest model's tuned threshold on the figure.
tuned_threshold = model_thresholds["Random Forest Classifier"]
threshold_line_style = dict(color="blue", linestyle="--", linewidth=2)
metric_curve_style = dict(linestyle="-", linewidth=2)

plot_threshold_metrics(
    # Inputs: positive-class probabilities and ground-truth labels
    y_prob=y_prob_rf,
    y_test=y_test_ai,
    # Threshold guides: hide the default 0.5 line, draw the model's own
    baseline_thresh=False,
    model_threshold=tuned_threshold,
    threshold_kwgs=threshold_line_style,  # model-threshold vertical line
    curve_kwgs=metric_curve_style,  # metric curves
    # Figure appearance
    figsize=(14, 10),
    tick_fontsize=14,
    label_fontsize=16,
    text_wrap=40,
    # Output
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)