In [None]:
import os
from sklearn import set_config
set_config(display='diagram')
os.chdir("/Users/Matheus_Pinto/Desktop/quantumblack/base-ml-project")

import numpy as np
import pandas as pd

pd.set_option('display.max_columns', 50)


%load_ext kedro.ipython
%reload_kedro .



In [None]:
import shap
import typing as tp
import matplotlib.pyplot as plt
import plotly.express as px

def generate_shap_beeswarm_plot(
    shap_values: "shap.Explanation",
    max_display: int = 20,
    cmap: str = "turbo",
    title: tp.Optional[str] = None,
) -> "plt.Figure":
    """
    Generate a SHAP beeswarm plot with custom size adjustment.

    Parameters:
      shap_values: The SHAP values to be visualized, as returned by calling a
        ``shap.Explainer`` on a dataset.
      max_display (int): Passed through to ``shap.plots.beeswarm`` as ``max_display``.
      cmap (str): Name of the matplotlib colormap used to color the points.
      title (str | None): Title of the plot. When omitted, falls back to
        ``f"{namespace} shap values"``, which requires a notebook-level
        ``namespace`` variable to exist (the original behavior).

    Returns:
    - matplotlib.figure.Figure: The generated figure.

    Raises:
    - NameError: If ``title`` is omitted and no global ``namespace`` is defined.
    """
    if title is None:
        # Backward-compatible default; depends on the notebook's global `namespace`.
        title = f"{namespace} shap values"

    fig, _ = plt.subplots()
    # show=False so the figure can still be resized and titled before it is rendered.
    shap.plots.beeswarm(
        shap_values,
        max_display=max_display,
        color=plt.get_cmap(cmap),
        show=False,
    )

    # Double the current width and force a 4:3 aspect ratio derived from that width.
    width = fig.get_size_inches()[0]
    fig.set_size_inches(2 * width, 2 * width * 3 / 4)

    # Set the title BEFORE tight_layout so the layout pass reserves room for it.
    plt.title(title, fontdict={"fontsize": 20})
    plt.tight_layout()
    return fig


In [None]:
# Load the model artifact and the train dataset registered under `namespace` in the catalog.
# NOTE(review): neither `namespace` nor `catalog` is assigned in any visible cell — presumably
# they are injected (Kedro extension / notebook parameters); confirm for a fresh-kernel run.
model = catalog.load(f"{namespace}.model_artifact")
df = catalog.load(f"{namespace}.train_dataset")

## Model Class

In [None]:
# Display the loaded model artifact object.
model

## Model Pipeline

In [None]:
# Display the object held in the artifact's `.model` attribute (it is sliced step-wise below).
model.model

In [None]:
# Split `model.model` into all steps except the last (preprocessing) and the last step (estimator).
preprocessor = model.model[:-1]
estimator = model.model[-1]
# Run the preprocessing steps on the loaded train dataset.
# NOTE(review): `df` is passed as loaded; whether it still contains the target column
# cannot be seen from here — confirm.
X_train = preprocessor.transform(df)
# The transformed training data is used twice: as the explainer's second argument
# (its masker / background data) and as the rows that get explained.
explainer = shap.Explainer(estimator, X_train)
shap_values = explainer(X_train)


## SHAP values for the training dataset

In [None]:

# Beeswarm of the SHAP values computed above, with `max_display=20` and the "coolwarm" colormap.
fig = generate_shap_beeswarm_plot(shap_values, max_display=20, cmap="coolwarm")


## SHAP-based feature importance



In [None]:
# SHAP feature importance: each feature's share (%) of the total absolute SHAP value.
# NOTE(review): feature names are read from `preprocessor[1].columns`, while the
# model-importance cell further down reads `preprocessor[0].columns` — confirm which
# preprocessing step actually holds the estimator's input columns.
shap_abs_total = (
    pd.DataFrame(shap_values.values, columns=preprocessor[1].columns)
    .abs()
    .sum(axis=0)  # reduce over samples directly; no need to transpose the full matrix
    .rename("shap_feature_importance")
    .rename_axis("feature")
)

# Normalize to percentages and sort ascending (the horizontal bar chart is drawn in row order).
df_importance = (
    (shap_abs_total / shap_abs_total.sum() * 100)
    .sort_values(ascending=True)
    .reset_index()
)

# Only features with a strictly positive share are plotted; the rest are shown in the next cell.
df_plot = df_importance[df_importance["shap_feature_importance"] > 0]
df_less_importance = df_importance[df_importance["shap_feature_importance"] <= 0]

fig = px.bar(
    df_plot,
    orientation="h",
    x="shap_feature_importance",
    y="feature",
    title=f"{namespace} - Shap Feature Importance",
    color="shap_feature_importance",
    color_continuous_scale="ylorrd",
)
fig.show()


## Features with no SHAP feature importance

In [None]:
# Features whose normalized SHAP importance is <= 0, as filtered in the previous code cell.
df_less_importance

## Feature importance from model (if available) 

In [None]:
# Native importance reported by the estimator itself, when it exposes one.
# Only the importance lookup and frame construction are guarded; plotting errors now
# surface instead of being swallowed by a blanket `except Exception`.
try:
    # Prefer `feature_importances_`; fall back to `coef_`.
    raw_importance = getattr(estimator, "feature_importances_", None)
    if raw_importance is None:
        raw_importance = estimator.coef_  # AttributeError here means "not available"

    raw_importance = np.asarray(raw_importance, dtype=float)
    # `coef_` may be 2-D with a single row; flatten only that unambiguous case.
    if raw_importance.ndim == 2 and raw_importance.shape[0] == 1:
        raw_importance = raw_importance[0]

    # NOTE(review): names come from `preprocessor[0].columns`, whereas the SHAP importance
    # cell uses `preprocessor[1].columns` — confirm which step holds the final columns.
    # Building from a dict keeps `importance` numeric (the list-of-rows + `.T` form
    # produced object-dtype columns).
    df_imp = pd.DataFrame(
        {"feature": list(preprocessor[0].columns), "importance": raw_importance}
    ).sort_values("importance", ascending=True)
except (AttributeError, IndexError, KeyError, TypeError, ValueError) as e:
    print("No feature importance available for these model")
    print(f"Reason: {e!r}")
else:
    # Plot every feature with a non-zero value: negative coefficients are informative,
    # so they are no longer dropped from the chart and listed as "less important".
    df_plot = df_imp[df_imp["importance"].abs() > 0]
    df_less_importance = df_imp[df_imp["importance"] == 0]

    fig = px.bar(
        df_plot,
        orientation="h",
        x="importance",
        y="feature",
        title=f"{namespace} - Feature Importance",
    )
    fig.show()