In [None]:
# Load the "nb_black" extension, which auto-formats code cells with black
%load_ext nb_black
# Load the "autoreload" extension so that imported modules are re-imported when their source changes
%load_ext autoreload
# Mode 2: reload all modules before executing each cell, so edits to code in src are picked up
%autoreload 2

In [None]:
import pandas as pd
import shap  # package used to calculate Shap values
from rfpimp import X_train2, predict, rf_random, test


# from category_encoders import OrdinalEncoder


## Model Interpretation

We can use Shapley values to find the local interpretation for a specific sample. This means we can check the relative contribution of each feature to the prediction for each store.
They can also provide a global interpretation when the values for all stores are viewed together.

In [None]:
# Create object that can calculate shap values for the tuned tree model
explainer = shap.TreeExplainer(rf_random.best_estimator_)

# Calculate Shap values: one row per row of X_train2, in the same order
shap_values = explainer.shap_values(X_train2)

shap.initjs()

# define sample using the location id (an index LABEL of X_train2)
i = 116

# shap_values is indexed by row POSITION, whereas X_train2.loc[i] selects by
# index LABEL. Translate the label into its position so that both arguments
# of the plot describe the same sample; with a default 0..n-1 index the two
# coincide and nothing changes.
# NOTE(review): assumes the index labels of X_train2 are unique - confirm.
sample_pos = X_train2.index.get_loc(i)

# produce a visualisation of the local interpretation
shap.force_plot(
    explainer.expected_value,
    shap_values[sample_pos],
    features=X_train2.loc[i],
    feature_names=X_train2.columns,
)

# NOTE(review): the observations below were recorded from an earlier run - re-check them against the current plot
# this example shows that household_size, county, household_affluency and school_proximity all had a positive
# contribution to the output value, size of effect in that order
# crime_rate, property_value and public_transport_dist all had a negative effect, with size in that order

In [None]:
# global interpretation: SHAP values of every training sample, plotted per feature
shap.summary_plot(
    shap_values,
    features=X_train2,
    feature_names=X_train2.columns,
)

# the chart below shows that having a small household_size has a negative effect for many stores, while having a
# large household_size has a positive effect for fewer stores
# having a high household_affluency has a negative effect for many, many stores, having a low household_affluency
# has a negative effect for slightly fewer stores

## Model Predictions

In [None]:
# Feature columns passed to the model for the prediction set.
# (An earlier version built this frame from df_combined by keeping the rows
# where data == "predict" and dropping the normalised_sales and data columns.)
# NOTE(review): the column order presumably has to match the order used when
# rf_random was fitted - confirm.
predict_feature_cols = [
    "household_size",
    "household_affluency",
    "county",
    "crime_rate",
    "public_transport_dist",
    "property_value",
    "school_proximity",
]

# keep only the model features
predict2 = predict[predict_feature_cols]

predict2.head(5)

In [None]:
# make predictions for the potential new store locations
preds = rf_random.predict(predict2)

# wrap the predictions in a single-column frame labelled with the index of test
# NOTE(review): the predictions come from predict2 (built from `predict`) but
# are labelled with test.index - presumably both frames hold the same rows in
# the same order; confirm.
preds_df = pd.DataFrame(preds, columns=["preds"], index=test.index.copy())

# return dataframe containing all features with predictions (left join on the index)
preds_out = test.merge(preds_df, how="left", left_index=True, right_index=True)
preds_out.head(5)