## Set up a regression experiment

In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Column order of the 13 Boston housing features (the target is stored separately).
BOSTON_FEATURE_NAMES = [
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
    "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT",
]
BOSTON_DATA_URL = "http://lib.stat.cmu.edu/datasets/boston"

# NOTE: the scikit-learn maintainers discourage this dataset because of an ethical
# problem with one of its features; it is kept here only so that the feature names
# used by the later cells ("RM", "NOX", "CHAS", "ZN") stay valid.
try:
    # Deprecated in scikit-learn 1.0 and removed in 1.2, where this import fails.
    from sklearn.datasets import load_boston
except ImportError:
    # Fallback recommended by the scikit-learn deprecation notice: read the raw file
    # from the original source. Each record spans two physical lines, hence the
    # even/odd row slicing below.
    raw_df = pd.read_csv(BOSTON_DATA_URL, sep=r"\s+", skiprows=22, header=None)
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]
    feature_names = list(BOSTON_FEATURE_NAMES)
else:
    boston = load_boston()
    data, target = boston.data, boston.target
    feature_names = list(boston.feature_names)

df = pd.DataFrame(data, columns=feature_names)
df["target"] = target

# Every column except the last one is a feature; the last one is the label.
train_cols = df.columns[0:-1]
label = df.columns[-1]
X = df[train_cols]
y = df[label]

seed = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)


Function load_boston is deprecated; `load_boston` is deprecated in 1.0 and will be removed in 1.2.

    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    data

## Explore the dataset

In [16]:
from interpret import show
from interpret.data import Marginal

# Build the marginal data explanation for the training split, then display it.
marginal = Marginal().explain_data(
    X_train,
    y_train,
    name="Train Data",
)
show(marginal)

## Train the Explainable Boosting Machine (EBM)

In [18]:
from interpret.glassbox import ExplainableBoostingRegressor

# Reuse the split seed so the fitted model is reproducible across runs.
ebm = ExplainableBoostingRegressor(
    n_jobs=-1,
    random_state=seed,
)
ebm.fit(X_train, y_train)  # Works on dataframes and numpy arrays

ExplainableBoostingRegressor(n_jobs=-1, random_state=1)

## Global Explanations: What the model learned overall (shows the top 15 most important features)

In [21]:
# Build the model-wide explanation once and hand it to the dashboard renderer.
ebm_global = ebm.explain_global(
    name="EBM",
)
show(ebm_global)

In [None]:
from interpret.visual.plot import sort_take

# `data_dict` must exist before it can be sorted; without a definition this cell
# raises NameError on a fresh kernel. Take the summary data of the global
# explanation as the input.
# NOTE(review): assumes `ebm_global.data()` (no key) returns the overall
# names/scores dict that `sort_take` operates on - confirm against the installed
# interpret version.
data_dict = sort_take(ebm_global.data(), sort_fn=lambda x: -abs(x), top_n=100, reverse_results=True)

## Additional Code to Compute Feature Group Importances and Append Them to a Global Explanation

In [22]:
import numpy as np

def get_feature_group_importance(feature_group, ebm, X, contributions=None):
    """Computes the feature importance for a group of features.

    For every row, the contributions of the features in the group are summed;
    the importance is the average of the absolute value of those per-row sums.

    Args:
        feature_group: A list of feature names. A single name given as a plain
            string is treated as a one-element group.
        ebm: A fitted EBM
        X: Samples passed to ``ebm.predict_and_contrib``; only used when
            ``contributions`` is None
        contributions (optional): Contributions of all features per row, with
            columns in the order of ``ebm.get_feature_names_out()``

    Returns:
        A double (NaN when there are no rows)
    """
    # A bare string would turn the membership test below into a substring test
    # (e.g. "age" in "age_group"), silently pulling in unrelated features.
    if isinstance(feature_group, str):
        feature_group = [feature_group]
    group_names = set(feature_group)

    if contributions is None:
        _, contributions = ebm.predict_and_contrib(X)
    contributions = np.asarray(contributions, dtype=np.float64)

    # No rows means there is nothing to average.
    if contributions.shape[0] == 0:
        return np.float64(np.nan)

    # Column positions of the features that belong to the group.
    group_cols = [
        col
        for col, feat_name in enumerate(ebm.get_feature_names_out())
        if feat_name in group_names
    ]

    # Sum the group's contributions per row, then average the magnitudes.
    abs_sum_per_row = np.abs(contributions[:, group_cols].sum(axis=1))
    return np.average(abs_sum_per_row)
 
 
def get_group_and_individual_importances(feature_group, ebm, X):
    """Utility function to compute the feature importance for a group
       of features as well as each feature in the group

    Args:
        feature_group: A list of feature names
        ebm: A fitted EBM
        X: Samples passed to ``ebm.predict_and_contrib``

    Returns:
        A list of tuples in the form (feature_name, importance), one per feature
        in the group, followed by a final tuple (feature_group, importance) for
        the group as a whole
    """
    # Compute the contributions once and reuse them for every importance below.
    _, contributions = ebm.predict_and_contrib(X)

    # Each feature is wrapped in a one-element list: passing the bare string
    # would make the helper's membership test a substring test and could count
    # other features whose names are contained in this one.
    importances = [
        (feature, get_feature_group_importance([feature], ebm, X, contributions))
        for feature in feature_group
    ]

    importances.append(
        (feature_group, get_feature_group_importance(feature_group, ebm, X, contributions))
    )

    return importances
 
def append_feature_importance(feature_name, feature_importance, global_exp):
    """ Appends a feature name and importance to the global explanation, which
        will only be displayed in the "Summary" Graph

    The explanation is modified in place. If it has no overall (summary) data,
    nothing is appended and a message is printed instead.

    Args:
        feature_name (string)
        feature_importance (double)
        global_exp: An EBM Global Explanation
    """
    internal_obj = global_exp._internal_obj
    # Use .get so that a missing "overall" entry reaches the message below
    # instead of raising KeyError.
    overall = internal_obj.get("overall") if internal_obj is not None else None

    if overall is not None:
        overall["names"].append(feature_name)
        overall["scores"].append(feature_importance)
    else:
        print("It was not possible to append feature {} to the global explanation.".format(feature_name))



## Add Feature Groups to Global Explanation 

In [23]:
# Start from a fresh global explanation so that group entries can be added to it.
ebm_global = ebm.explain_global(name="EBM")

# Feature groups whose joint importance is computed over the full dataset X.
feature_group1 = ["RM", "NOX"]
feature_group2 = ["CHAS", "ZN"]

fg1_importance = get_feature_group_importance(feature_group1, ebm, X)
fg2_importance = get_feature_group_importance(feature_group2, ebm, X)

for group, importance in (
    (feature_group1, fg1_importance),
    (feature_group2, fg2_importance),
):
    print("Importance for feature group {}: {}".format(group, importance))

Importance for feature group ['RM', 'NOX']: 2.0728997603879087
Importance for feature group ['CHAS', 'ZN']: 0.41325989100551636


In [24]:
from interpret.visual.plot import sort_take





In [None]:
# Register each group's importance under a readable label, then display the
# explanation that now includes the group entries.
for group_label, group_importance in (
    ("RM & NOX", fg1_importance),
    ("CHAS & ZN", fg2_importance),
):
    append_feature_importance(group_label, group_importance, ebm_global)
show(ebm_global)