In [15]:
import dalex as dx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import warnings
# Install a blanket 'ignore' filter so no warnings are printed in cell output.
# NOTE(review): this also hides deprecation and library warnings that may
# matter for the model/explainer below — consider narrowing by category.
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer

In [2]:
# Read the dataset, then separate the target column from the feature columns.
df = pd.read_csv('data.csv')
y = df['is_canceled']
X = df.drop(columns=['is_canceled'])

# 80% of the rows go to the training split; the fixed seed makes the
# partition reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

In [3]:
# Load the pickled model object from disk.
# The context manager guarantees the file handle is closed as soon as
# unpickling finishes, instead of relying on garbage collection.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load 'rf_pipe_enh.sav' if it comes from a trusted source.
with open('rf_pipe_enh.sav', 'rb') as model_file:
    forest = pickle.load(model_file)

In [4]:
# Wrap the loaded model in a dalex Explainer, using the training split as the
# reference data and target for all subsequent explanations.
exp_forest = dx.Explainer(
    forest,
    data=X_train,
    y=y_train,
    label='random_forest',
)

Preparation of a new explainer is initiated

  -> data              : 95368 rows 25 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 95368 values
  -> model_class       : sklearn.ensemble._forest.RandomForestClassifier (default)
  -> label             : random_forest
  -> predict function  : <function yhat_proba_default at 0x0000021DF871C5E0> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.00676, mean = 0.371, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.8, mean = -0.00108, max = 0.965
  -> model_info        : package sklearn

A new explainer has been created!


In [10]:
# Model-level "partial" profiles for two features, computed over the
# explainer's data without any grouping.
forest_mprofile = exp_forest.model_profile(
    type="partial",
    variables=["lead_time", "total_of_special_requests"],
)
forest_mprofile.plot()

Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.31it/s]


In [12]:
# Same two "partial" profiles as before, but with one curve per value of the
# `children` column.
forest_mprofile = exp_forest.model_profile(
    type="partial",
    variables=["lead_time", "total_of_special_requests"],
    groups="children",
)
forest_mprofile.plot()

Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.29it/s]


In [13]:
# Same two "partial" profiles, this time with one curve per value of the
# `babies` column.
forest_mprofile = exp_forest.model_profile(
    type="partial",
    variables=["lead_time", "total_of_special_requests"],
    groups="babies",
)
forest_mprofile.plot()

Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.24it/s]
