# Imports

In [17]:
!pip install -U --ignore-installed -r requirements.txt

Collecting dash_core_components (from -r requirements.txt (line 1))
Collecting sklearn_pandas (from -r requirements.txt (line 2))
  Using cached https://files.pythonhosted.org/packages/1f/48/4e1461d828baf41d609efaa720d20090ac6ec346b5daad3c88e243e2207e/sklearn_pandas-1.8.0-py2.py3-none-any.whl
Collecting numpy (from -r requirements.txt (line 3))
  Using cached https://files.pythonhosted.org/packages/be/e8/45079ae05c4dda4a67bc51578ae5e75feda0a79c2836d477d676e7a58efb/numpy-1.17.0-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl
Collecting pandas (from -r requirements.txt (line 4))
  Using cached https://files.pythonhosted.org/packages/94/f0/3099fdb1ae94663561cd695b820f05b6f6d240c919ba179c076015de5e37/pandas-0.25.0-cp36-cp36m-macosx_10_9_x86_64.macosx_10_10_x86_64.whl
Collecting dash (from -r requirements.txt (line 5))
Collecting imbalanced_learn (from -r requirements.txt (line 6))
  Using cached https://files.pythonhosted.org/pac

In [95]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [89]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [90]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [92]:
import numpy as np
import pandas as pd

import gc
import pickle
from joblib import dump, load

from pathlib import Path

### load some project specific modules:

In [93]:
from sklearn.metrics import roc_auc_score

In [96]:
from data_transform_methods import *
from explainer_methods import *
from optimizer_methods import *
from explainer import *

# load data

In [97]:
d = pd.read_csv("train.csv")

### feature engineering:

In [98]:
d['Familysize'] = d.SibSp + d.Parch
d['Cabin'] = d.Cabin.str[0]

In [99]:
d.shape
d.head().T

(891, 13)

Unnamed: 0,0,1,2,3,4
PassengerId,1,2,3,4,5
Survived,0,1,1,1,0
Pclass,3,1,3,1,3
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)","Allen, Mr. William Henry"
Sex,male,female,female,female,male
Age,22,38,26,35,35
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803,373450
Fare,7.25,71.2833,7.925,53.1,8.05


In [100]:
show_cardinality_of_cats(d)

Unnamed: 0,Column,Cardinality
0,Name,891
1,Ticket,681
2,Cabin,8
3,Embarked,3
4,Sex,2


# Transform data:

In [101]:
TARGET='Survived'

### drop name, ticket and id columns:

In [102]:
drop_columns=['Name', 'Ticket', 'PassengerId']

substring_drop_list = []

d = clean_data(d, drop_columns, substring_drop_list, drop_dates=True)

dropping Name
dropping Ticket
dropping PassengerId


## Generate training and test set:
- In this case we generate test set of 200 so that we'll have enoughd data points to calculate shap values, etc

In [103]:
test_idxs = d.sample(200).index
d_train = d[~d.index.isin(test_idxs)]
d_test = d[d.index.isin(test_idxs)]
d_train.shape, d_test.shape

((691, 10), (200, 10))

## Generate tree and linear datasets:

#### Cols to generate isolation forest outlier score for:

In [104]:
tree_transformer =  fit_transformer(d_train, target=TARGET, numfill='ExtremeValue')

Columns being transformed: 
numeric columns:  ['Parch', 'Pclass', 'Familysize', 'Fare', 'SibSp', 'Age']
categorical columns:  ['Sex', 'Cabin', 'Embarked']
fitting transformer...
Fit: DummyTransform for: Survived...
Fit: Filling numerical NaN Parch with                     ExtremeValue: -999...
Fit: Filling numerical NaN Pclass with                     ExtremeValue: -999...
Fit: Filling numerical NaN Familysize with                     ExtremeValue: -999...
Fit: Filling numerical NaN Fare with                     ExtremeValue: -999...
Fit: Filling numerical NaN SibSp with                     ExtremeValue: -999...
Fit: Filling numerical NaN Age with                     ExtremeValue: -999...
Fit: One-hot coding categorical variable Sex...
Fit: One-hot coding categorical variable Cabin...
Fit: One-hot coding categorical variable Embarked...


In [105]:
tree_data = (*get_transformed_X_y(d_train, tree_transformer, TARGET, add_random=False),
             *get_transformed_X_y(d_test, tree_transformer, TARGET, add_random=False))

Transform: DummyTransform for: Survived...
Transform: Filling numerical NaN Parch with                     ExtremeValue: -999...
Transform: Filling numerical NaN Pclass with                     ExtremeValue: -999...
Transform: Filling numerical NaN Familysize with                     ExtremeValue: -999...
Transform: Filling numerical NaN Fare with                     ExtremeValue: -999...
Transform: Filling numerical NaN SibSp with                     ExtremeValue: -999...
Transform: Filling numerical NaN Age with                     ExtremeValue: -999...
Transform: One-hot coding categorical variable Sex...
Transform: One-hot coding categorical variable Cabin...
Transform: One-hot coding categorical variable Embarked...
Transform: DummyTransform for: Survived...
Transform: Filling numerical NaN Parch with                     ExtremeValue: -999...
Transform: Filling numerical NaN Pclass with                     ExtremeValue: -999...
Transform: Filling numerical NaN Familysize with     

In [106]:
(X_train, y_train, X_test, y_test) = tree_data

In [107]:
X_train.head()

Unnamed: 0,Parch,Pclass,Familysize,Fare,SibSp,Age,Sex_female,Sex_male,Sex_nan,Cabin_A,...,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_nan,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan
0,0,3,1,7.25,1,22.0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
1,0,1,1,71.2833,1,38.0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
2,0,3,0,7.925,0,26.0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,0,1,1,53.1,1,35.0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,0,3,0,8.05,0,35.0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0


In [108]:
len(X_train), y_train.sum(), y_train.mean()
len(X_test), y_test.sum(), y_test.mean()

(691, 264, 0.38205499276411)

(200, 78, 0.39)

## Run the garbage collector:

In [109]:
gc.collect()

44125

# Optimize hyperparameters

### optimize model:

In [110]:
#models = ['RandomForestClassifier', 'BalancedRandomForestClassifier', 
#            'XGBClassifier', 'LogisticRegression']
models = ['RandomForestClassifier']
best_model, trials = classifier_optimize(tree_data, None, models, 
                                         roc_auc_score, needs_proba=True, n_evals=200, cv=5)

100%|██████████| 200/200 [04:54<00:00,  1.70s/it, best loss: -0.8601603168016455]


In [111]:
best_model

{'class_weight': 'balanced',
 'max_depth': None,
 'max_features': 0.5556469454895402,
 'min_impurity_decrease': 6.755770490218343e-09,
 'min_samples_leaf': 5,
 'min_samples_split': 10,
 'model_type': 'RandomForestClassifier',
 'n_estimators': 90}

In [112]:
model, (X_train, y_train, X_test, y_test), transformer = get_best_model_and_data(best_model, tree_data, 
                                            None, None, tree_transformer, None)
model.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None,
                       max_features=0.5556469454895402, max_leaf_nodes=None,
                       min_impurity_decrease=6.755770490218343e-09,
                       min_impurity_split=None, min_samples_leaf=5,
                       min_samples_split=10, min_weight_fraction_leaf=0.0,
                       n_estimators=90, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [113]:
pred_probas = model.predict_proba(X_test)
print('roc auc score:', roc_auc_score(y_test, pred_probas[:,1]))
print(classification_report(y_test, np.where(pred_probas[:,1]>0.5, 1, 0)))

roc auc score: 0.8878730559058428
              precision    recall  f1-score   support

           0       0.84      0.89      0.87       122
           1       0.81      0.73      0.77        78

    accuracy                           0.83       200
   macro avg       0.83      0.81      0.82       200
weighted avg       0.83      0.83      0.83       200



In [114]:
model_bunch = ModelBunch(model, transformer, TARGET, use_columns=X_train.columns)
pickle.dump(model_bunch, open(Path.cwd() / 'titanic_model_bunch.pkl','wb'))

In [115]:
model_bunch.predict_proba(d_test)

Transform: DummyTransform for: Survived...
Transform: Filling numerical NaN Parch with                     ExtremeValue: -999...
Transform: Filling numerical NaN Pclass with                     ExtremeValue: -999...
Transform: Filling numerical NaN Familysize with                     ExtremeValue: -999...
Transform: Filling numerical NaN Fare with                     ExtremeValue: -999...
Transform: Filling numerical NaN SibSp with                     ExtremeValue: -999...
Transform: Filling numerical NaN Age with                     ExtremeValue: -999...
Transform: One-hot coding categorical variable Sex...
Transform: One-hot coding categorical variable Cabin...
Transform: One-hot coding categorical variable Embarked...


array([[0.8800952 , 0.1199048 ],
       [0.90030955, 0.09969045],
       [0.44246396, 0.55753604],
       [0.64331488, 0.35668512],
       [0.9566519 , 0.0433481 ],
       [0.92935791, 0.07064209],
       [0.81030006, 0.18969994],
       [0.23465957, 0.76534043],
       [0.93537248, 0.06462752],
       [0.01075447, 0.98924553],
       [0.65503503, 0.34496497],
       [0.10857424, 0.89142576],
       [0.68582486, 0.31417514],
       [0.        , 1.        ],
       [0.61800031, 0.38199969],
       [0.84291075, 0.15708925],
       [0.8125738 , 0.1874262 ],
       [0.9566519 , 0.0433481 ],
       [0.69738046, 0.30261954],
       [0.90382614, 0.09617386],
       [0.83168636, 0.16831364],
       [0.89528107, 0.10471893],
       [0.95028317, 0.04971683],
       [0.26078432, 0.73921568],
       [0.69177043, 0.30822957],
       [0.92957173, 0.07042827],
       [0.12249991, 0.87750009],
       [0.90601945, 0.09398055],
       [0.63232603, 0.36767397],
       [0.80274704, 0.19725296],
       [0.

In [116]:
explainer = TreeClassifierExplainer(model_bunch, d_test, 
                                    metric=roc_auc_score, labels=['Not Survived', 'Survived'])

Transform: DummyTransform for: Survived...
Transform: Filling numerical NaN Parch with                     ExtremeValue: -999...
Transform: Filling numerical NaN Pclass with                     ExtremeValue: -999...
Transform: Filling numerical NaN Familysize with                     ExtremeValue: -999...
Transform: Filling numerical NaN Fare with                     ExtremeValue: -999...
Transform: Filling numerical NaN SibSp with                     ExtremeValue: -999...
Transform: Filling numerical NaN Age with                     ExtremeValue: -999...
Transform: One-hot coding categorical variable Sex...
Transform: One-hot coding categorical variable Cabin...
Transform: One-hot coding categorical variable Embarked...


In [117]:
explainer.calculate_properties()

Calculating predictions...
Calculating prediction probabilities...
Calculating importances...
Generating shap TreeExplainer...
Calculating shap values...
Calculating shap interaction values...
Generating shadow trees...
Calculating categorical shap interaction values...


In [118]:
explainer.contrib_df(index=41)

Unnamed: 0,col,contribution,cumulative,base,raw_value
0,base_value,0.499315,0.499315,0.0,
1,Sex,-0.196643,0.302672,0.499315,male
2,Cabin,0.088149,0.390821,0.302672,F
3,Familysize,0.057917,0.448737,0.390821,2
4,Age,0.052732,0.50147,0.448737,3
5,Fare,-0.050491,0.450979,0.50147,26
6,Parch,0.046548,0.497526,0.450979,1
7,Pclass,0.036108,0.533635,0.497526,2
8,Embarked,-0.014118,0.519517,0.533635,S
9,SibSp,0.003079,0.522596,0.519517,1


In [119]:
explainer.plot_confusion_matrix()

In [120]:
explainer.plot_precision()

In [122]:
dump(explainer, Path.cwd() / 'titanic_explainer.joblib')

In [None]:
! python app.py

Preparing to start dash app ...
Loading libraries...
loading DataExplainer object...
Loading Dash...
Defining layout...
Starting server...
 * Serving Flask app "app" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off
 * Running on http://127.0.0.1:8073/ (Press CTRL+C to quit)
127.0.0.1 - - [30/Aug/2019 14:37:27] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [30/Aug/2019 14:37:27] "GET /assets/stylesheet.css?m=1566133040.0 HTTP/1.1" 200 -
127.0.0.1 - - [30/Aug/2019 14:37:27] "GET /_dash-component-suites/dash_renderer/react@16.8.6.min.js?v=1.0.0&m=1566138683 HTTP/1.1" 200 -
127.0.0.1 - - [30/Aug/2019 14:37:27] "GET /_dash-component-suites/dash_renderer/prop-types@15.7.2.min.js?v=1.0.0&m=1566138683 HTTP/1.1" 200 -
127.0.0.1 - - [30/Aug/2019 14:37:27] "GET /_dash-component-suites/dash_renderer/react-dom@16.8.6.min.js?v=1.0.0&m=1566138683 HTTP/1.1" 200 -
127.0.0.1 - - [30/Aug/2019 14:37:27] "GET /assets/bootstrap.css?m=1566133040.0 HTTP/1.1"

In [87]:
!git add .
!git commit -m "added categorical shap values"
!git push -u origin master

[master fc79038] workaround for _shap_base_value bug
 4 files changed, 471 insertions(+), 220 deletions(-)
 create mode 100644 build-titanic-model.py
Enumerating objects: 10, done.
Counting objects: 100% (10/10), done.
Delta compression using up to 4 threads
Compressing objects: 100% (6/6), done.
Writing objects: 100% (6/6), 532.09 KiB | 888.00 KiB/s, done.
Total 6 (delta 4), reused 0 (delta 0)
remote: Resolving deltas: 100% (4/4), completed with 4 local objects.[K
To https://github.com/oegedijk/explainingtitanic.git
   efa9682..fc79038  master -> master
Branch 'master' set up to track remote branch 'master' from 'origin'.
