# Code snippets for Python
## Explanatory Model Analysis
### 2020-07-19


![https://pbiecek.github.io/ema/figure/UMEPpiramide.png](https://pbiecek.github.io/ema/figure/UMEPpiramide.png)


# Chapter 5: Datasets and models

In [1]:
import dalex as dx
titanic = dx.datasets.load_titanic()
X = titanic.drop(columns='survived')
y = titanic.survived


from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

preprocess = make_column_transformer(
    (StandardScaler(), ['age', 'fare', 'parch', 'sibsp']),
    (OneHotEncoder(), ['gender', 'class', 'embarked']))

## Logistic-regression model

In [2]:
from sklearn.linear_model import LogisticRegression

model_lr = make_pipeline(
    preprocess,
    LogisticRegression(penalty='l2'))
    
model_lr.fit(X, y)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('standardscaler',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True),
                                                  ['age', 'fare', 'parch',
                                                   'sibsp']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
                                                                dtype=<class 'numpy.floa

## Random-forest model

In [3]:
from sklearn.ensemble import RandomForestClassifier

model_rf = make_pipeline(
    preprocess,
    RandomForestClassifier(max_depth=3, random_state=0, n_estimators=500))
    
model_rf.fit(X, y)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('standardscaler',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True),
                                                  ['age', 'fare', 'parch',
                                                   'sibsp']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
                                                                dtype=<class 'numpy.floa

## Gradient-boosting model

In [4]:
from sklearn.ensemble import GradientBoostingClassifier

model_gbc = make_pipeline(
    preprocess,
    GradientBoostingClassifier(n_estimators=100))

model_gbc.fit(X, y)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('standardscaler',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True),
                                                  ['age', 'fare', 'parch',
                                                   'sibsp']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
                                                                dtype=<class 'numpy.floa

## Support Vector Machine model for Classification

In [5]:
from sklearn.svm import SVC

model_svm = make_pipeline(
    preprocess,
    SVC(probability=True))
    
model_svm.fit(X, y)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('standardscaler',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True),
                                                  ['age', 'fare', 'parch',
                                                   'sibsp']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
                                                                dtype=<class 'numpy.floa

## Models’ predictions

In [6]:
import pandas as pd

johnny_d = pd.DataFrame({'gender': ['male'],
                       'age': [8],
                       'class': ['1st'],
                       'embarked': ['Southampton'],
                       'fare': [72],
                       'sibsp': [0],
                       'parch': [0]},
                      index = ['JohnnyD'])
model_lr.predict_proba(johnny_d)
model_rf.predict_proba(johnny_d)
model_gbc.predict_proba(johnny_d)
model_svm.predict_proba(johnny_d)

array([[0.7839913, 0.2160087]])

In [7]:
henry = pd.DataFrame({'gender': ['male'],
                       'age': [47],
                       'class': ['1st'],
                       'embarked': ['Southampton'],
                       'fare': [25],
                       'sibsp': [0],
                       'parch': [0]},
                      index = ['Henry'])

print(model_lr.predict_proba(henry))

print(model_rf.predict_proba(henry))

print(model_gbc.predict_proba(henry))

print(model_svm.predict(henry))

[[0.69547744 0.30452256]]
[[0.73060059 0.26939941]]
[[0.68760708 0.31239292]]
[0]


## Models’ explainers

In [8]:
exp_rf = dx.Explainer(model_rf, X, y, label = "Titanic RF Pipeline")

exp_lr = dx.Explainer(model_lr, X, y, label = "Titanic LR Pipeline")

exp_gbc = dx.Explainer(model_gbc, X, y, label = "Titanic XGB Pipeline")

exp_svm = dx.Explainer(model_svm, X, y, label = "Titanic SVM Pipeline")

Preparation of a new explainer is initiated

  -> data              : 2207 rows 7 cols
  -> target variable   : Argument 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 2207 values
  -> model_class       : sklearn.pipeline.Pipeline (default)
  -> label             : Titanic RF Pipeline
  -> predict function  : <function yhat_proba_default at 0x116814d08> will be used (default)
  -> predicted values  : min = 0.171, mean = 0.322, max = 0.893
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.826, mean = 4.89e-05, max = 0.826
  -> model_info        : package sklearn

A new explainer has been created!
Preparation of a new explainer is initiated

  -> data              : 2207 rows 7 cols
  -> target variable   : Argument 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 2207 values
  -> model_class       : sklearn.pipeline.Pipeline (default)
  -> label             : Titanic LR Pipel

# Chapter 7: Break-down Plots for Additive Attributions

## Examples

In [22]:
bd_john = exp_rf.predict_parts(johnny_d, type='break_down')
bd_john

<dalex.instance_level._break_down.object.BreakDown at 0x11ae12898>

In [None]:
bd_john.plot()

## Basic use of the perdict_parts() function

In [11]:
bd_henry = exp_rf.predict_parts(henry, type='break_down')
bd_henry
bd_henry.plot()

## Advanced use of the predict_parts() function

In [18]:
import numpy as np

bd_henry = exp_rf.predict_parts(henry, type='break_down', order=np.array([0, 2, 1, 3, 4, 5, 6]))
bd_henry.plot(max_vars=5)