# Imports

In [17]:
!pip install -U --ignore-installed -r requirements.txt

Collecting dash_core_components (from -r requirements.txt (line 1))
Collecting sklearn_pandas (from -r requirements.txt (line 2))
  Using cached https://files.pythonhosted.org/packages/1f/48/4e1461d828baf41d609efaa720d20090ac6ec346b5daad3c88e243e2207e/sklearn_pandas-1.8.0-py2.py3-none-any.whl
Collecting numpy (from -r requirements.txt (line 3))
  Using cached https://files.pythonhosted.org/packages/be/e8/45079ae05c4dda4a67bc51578ae5e75feda0a79c2836d477d676e7a58efb/numpy-1.17.0-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl
Collecting pandas (from -r requirements.txt (line 4))
  Using cached https://files.pythonhosted.org/packages/94/f0/3099fdb1ae94663561cd695b820f05b6f6d240c919ba179c076015de5e37/pandas-0.25.0-cp36-cp36m-macosx_10_9_x86_64.macosx_10_10_x86_64.whl
Collecting dash (from -r requirements.txt (line 5))
Collecting imbalanced_learn (from -r requirements.txt (line 6))
  Using cached https://files.pythonhosted.org/pac

In [1]:
# Reload imported modules automatically before code is executed, so edits to
# the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [4]:
import numpy as np
import pandas as pd

import gc
import pickle

from pathlib import Path

### Load some project-specific modules:

In [27]:
from sklearn.metrics import roc_auc_score

In [33]:
# Project-specific helper modules.
# NOTE(review): the star imports hide which module provides the helpers used
# below (e.g. clean_data, fit_transformer, get_transformed_X_y,
# classifier_optimize, classification_report, ModelBunch, TreeExplainer);
# consider importing those names explicitly.
from data_transform_methods import *
from explainer_methods import *
from optimizer_methods import *
from explainer import *

# Load data

In [7]:
# Read the raw data from train.csv (path is relative to the working directory).
d = pd.read_csv("train.csv")

In [8]:
# Familysize is the sum of the SibSp and Parch columns.
d['Familysize'] = d['SibSp'] + d['Parch']
# Keep only the first character of each Cabin value.
d['Cabin'] = d['Cabin'].str.get(0)

In [9]:
# Show the frame's dimensions, then its first rows transposed so that each
# original column is displayed as a row.
d.shape
d.head().T

(891, 13)

Unnamed: 0,0,1,2,3,4
PassengerId,1,2,3,4,5
Survived,0,1,1,1,0
Pclass,3,1,3,1,3
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)","Allen, Mr. William Henry"
Sex,male,female,female,female,male
Age,22,38,26,35,35
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803,373450
Fare,7.25,71.2833,7.925,53.1,8.05


In [10]:
# Project helper; judging by its output it lists each categorical column with
# its number of distinct values, highest first.
show_cardinality_of_cats(d)

Unnamed: 0,Column,Cardinality
0,Name,891
1,Ticket,681
2,Cabin,8
3,Embarked,3
4,Sex,2


# Transform data:

In [11]:
# Name of the label column; passed as the target to the transformer helpers below.
TARGET='Survived'

### Drop date and identifier columns:

In [13]:
# Columns removed by name before modelling.
drop_columns=['Name', 'Ticket', 'PassengerId']

# No substring-based drops are requested here.
# NOTE(review): presumably clean_data drops columns whose name contains one of
# these substrings — confirm in clean_data.
substring_drop_list = []

# NOTE(review): `d` is rebound to the cleaned frame, so re-running this cell
# operates on already-cleaned data.
d = clean_data(d, drop_columns, substring_drop_list, drop_dates=True)

Name no longer in d
Ticket no longer in d
PassengerId no longer in d


In [15]:
# Hold out 200 randomly chosen rows as the test set; all remaining rows form
# the training set.
# A fixed seed makes the split, and therefore every downstream metric,
# reproducible across kernel restarts.
SPLIT_SEED = 42
test_idxs = d.sample(200, random_state=SPLIT_SEED).index
is_test_row = d.index.isin(test_idxs)
d_train = d[~is_test_row]
d_test = d[is_test_row]
d_train.shape, d_test.shape

((691, 10), (200, 10))

## Generate tree and linear datasets:

#### Fit the transformer for the tree dataset:

In [16]:
# Fit the column transformer on the training rows only.
# Per the transform log, numfill='ExtremeValue' fills numeric NaNs with -999.
tree_transformer =  fit_transformer(d_train, target=TARGET, numfill='ExtremeValue')

Columns being transformed: 
numeric columns:  ['Parch', 'SibSp', 'Pclass', 'Age', 'Familysize', 'Fare']
categorical columns:  Index(['Sex', 'Cabin', 'Embarked'], dtype='object')
[('Survived', [<data_transform_methods.DummyTransform object at 0x127127048>], {'input_df': True}), ('Parch', [<data_transform_methods.NumericFill object at 0x127127588>], {'input_df': True}), ('SibSp', [<data_transform_methods.NumericFill object at 0x127127be0>], {'input_df': True}), ('Pclass', [<data_transform_methods.NumericFill object at 0x127127550>], {'input_df': True}), ('Age', [<data_transform_methods.NumericFill object at 0x1271270b8>], {'input_df': True}), ('Familysize', [<data_transform_methods.NumericFill object at 0x1271270f0>], {'input_df': True}), ('Fare', [<data_transform_methods.NumericFill object at 0x127127198>], {'input_df': True}), ('Sex', [<data_transform_methods.OneHot object at 0x127157748>], {'input_df': True}), ('Cabin', [<data_transform_methods.OneHot object at 0x1271716a0>], {'input_

In [17]:
# Apply the fitted transformer to each split separately, then pack the results
# into one flat tuple: training parts first, test parts second.
train_parts = get_transformed_X_y(d_train, tree_transformer, TARGET, add_random=False)
test_parts = get_transformed_X_y(d_test, tree_transformer, TARGET, add_random=False)
tree_data = (*train_parts, *test_parts)

Transform: DummyTransform for: Survived...
Transform: Filling numerical NaN Parch with                     ExtremeValue: -999...
Transform: Filling numerical NaN SibSp with                     ExtremeValue: -999...
Transform: Filling numerical NaN Pclass with                     ExtremeValue: -999...
Transform: Filling numerical NaN Age with                     ExtremeValue: -999...
Transform: Filling numerical NaN Familysize with                     ExtremeValue: -999...
Transform: Filling numerical NaN Fare with                     ExtremeValue: -999...
Transform: One-hot coding categorical variable Sex...
Transform: One-hot coding categorical variable Cabin...
Transform: One-hot coding categorical variable Embarked...
Transform: DummyTransform for: Survived...
Transform: Filling numerical NaN Parch with                     ExtremeValue: -999...
Transform: Filling numerical NaN SibSp with                     ExtremeValue: -999...
Transform: Filling numerical NaN Pclass with          

In [18]:
# Give the four parts of tree_data individual names for the cells below.
X_train, y_train, X_test, y_test = tree_data

In [19]:
# Preview the first rows of the transformed training features.
X_train.head()

Unnamed: 0,Parch,SibSp,Pclass,Age,Familysize,Fare,Sex_female,Sex_male,Sex_nan,Cabin_A,...,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_nan,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan
0,0,1,3,22.0,1,7.25,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
1,0,1,1,38.0,1,71.2833,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,3,26.0,0,7.925,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,0,1,1,35.0,1,53.1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,0,0,3,-999.0,0,8.4583,0,1,0,0,...,0,0,0,0,0,1,0,1,0,0


In [20]:
# For each split: number of rows, sum of the labels, and mean of the labels.
len(X_train), y_train.sum(), y_train.mean()
len(X_test), y_test.sum(), y_test.mean()

(691, 261, 0.37771345875542695)

(200, 81, 0.405)

## Run the garbage collector:

In [21]:
# Force a full garbage-collection pass; the displayed value is the number of
# unreachable objects the collector found.
gc.collect()

230

# Optimize hyperparameters

### optimize model:

In [23]:
# Hyperparameter search over the listed model types.
# NOTE(review): presumably runs n_evals=200 search iterations, scoring each
# candidate with cv=5 cross-validated ROC AUC on predicted probabilities —
# confirm in classifier_optimize.
# Alternative, wider model list (currently disabled):
#models = ['RandomForestClassifier', 'BalancedRandomForestClassifier', 'LogisticRegression']
models = ['RandomForestClassifier']
best_model, trials = classifier_optimize(tree_data, None, models, 
                                         roc_auc_score, needs_proba=True, n_evals=200, cv=5)

100%|██████████| 200/200 [03:48<00:00,  1.13s/it, best loss: -0.860665946602761] 


In [24]:
# Show the best hyperparameter set returned by the search.
best_model

{'class_weight': 'balanced',
 'max_depth': None,
 'max_features': 0.28195456631768984,
 'min_impurity_decrease': 4.859996740345185e-07,
 'min_samples_leaf': 7,
 'min_samples_split': 11,
 'model_type': 'RandomForestClassifier',
 'n_estimators': 100}

In [25]:
# Build the estimator described by best_model and fit it on the training split.
# NOTE(review): this rebinds X_train/y_train/X_test/y_test; presumably they are
# the same data as in tree_data — confirm in get_best_model_and_data.
model, (X_train, y_train, X_test, y_test), transformer = get_best_model_and_data(best_model, tree_data, 
                                            None, None, tree_transformer, None)
model.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None,
                       max_features=0.28195456631768984, max_leaf_nodes=None,
                       min_impurity_decrease=4.859996740345185e-07,
                       min_impurity_split=None, min_samples_leaf=7,
                       min_samples_split=11, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [28]:
# Score the held-out split with the fitted model.
pred_probas = model.predict_proba(X_test)
# Column 1 of the probability matrix is used as the score for every metric.
positive_probas = pred_probas[:, 1]
roc_auc_score(y_test, positive_probas)
topx_perc_precision_score(y_test, positive_probas)
# Turn scores into hard 0/1 labels with a 0.5 cut-off for the text report.
predicted_labels = np.where(positive_probas > 0.5, 1, 0)
report = classification_report(y_test, predicted_labels)
print(report)

0.8828197945845004

1.0

              precision    recall  f1-score   support

           0       0.84      0.85      0.85       119
           1       0.78      0.77      0.77        81

    accuracy                           0.81       200
   macro avg       0.81      0.81      0.81       200
weighted avg       0.81      0.81      0.81       200



In [29]:
# Bundle the fitted model with its transformer, the target name and the
# feature columns, then persist the bundle next to the notebook.
model_bunch = ModelBunch(model, transformer, TARGET, X_train.columns)
# The context manager guarantees the file is flushed and closed, even if
# pickling raises.
with open(Path.cwd() / 'titanic_model_bunch.pkl', 'wb') as bunch_file:
    pickle.dump(model_bunch, bunch_file)

In [34]:
# Build the explainer from the model bundle, the held-out rows (untransformed)
# and the ROC AUC metric.
explainer = TreeExplainer(model_bunch, d_test, roc_auc_score)

Transform: DummyTransform for: Survived...
Transform: Filling numerical NaN Parch with                     ExtremeValue: -999...
Transform: Filling numerical NaN SibSp with                     ExtremeValue: -999...
Transform: Filling numerical NaN Pclass with                     ExtremeValue: -999...
Transform: Filling numerical NaN Age with                     ExtremeValue: -999...
Transform: Filling numerical NaN Familysize with                     ExtremeValue: -999...
Transform: Filling numerical NaN Fare with                     ExtremeValue: -999...
Transform: One-hot coding categorical variable Sex...
Transform: One-hot coding categorical variable Cabin...
Transform: One-hot coding categorical variable Embarked...


In [35]:
# Expected (base) value of the underlying SHAP explainer.
explainer.shap_explainer.expected_value

0.5022135376930237

In [36]:
# Per-feature contribution table for row 0.
# NOTE(review): presumably 0 is a positional index into the explainer's data — confirm.
explainer.contrib_df(0)

Unnamed: 0,col,contribution,cumulative,base,raw_value
0,base_value,0.502214,0.502214,0.0,
1,Sex,-0.169627,0.332587,0.502214,male
2,Pclass,-0.040045,0.292542,0.332587,3
3,Fare,-0.03488,0.257662,0.292542,8.05
4,Cabin,-0.033602,0.22406,0.257662,
5,Age,-0.014883,0.209177,0.22406,35
6,Embarked,-0.012191,0.196986,0.209177,S
7,SibSp,0.005473,0.202459,0.196986,0
8,Familysize,0.003996,0.206455,0.202459,0
9,Parch,-0.002468,0.203987,0.206455,0


In [40]:
# NOTE(review): this echoes full arrays into the cell output. Presumably the
# attribute accesses exist to compute these values before the explainer is
# pickled — confirm in the explainer module; if so, consider binding the tuple
# to a name instead of displaying it.
explainer.importances, explainer.shap_values, explainer.shap_interaction_values, explainer.shadow_trees

(              Importance
 Feature                 
 Sex_female      0.067123
 Sex_male        0.055192
 Pclass          0.036778
 Age             0.017429
 Familysize      0.011723
 Fare            0.005084
 Cabin_nan       0.005032
 Cabin_E         0.000726
 Parch           0.000156
 Cabin_D         0.000052
 Cabin_T         0.000000
 Cabin_G         0.000000
 Cabin_F         0.000000
 Embarked_nan    0.000000
 SibSp           0.000000
 Sex_nan         0.000000
 Cabin_A        -0.000104
 Embarked_Q     -0.000415
 Embarked_C     -0.001141
 Cabin_C        -0.001141
 Embarked_S     -0.002127
 Cabin_B        -0.002179,
 array([[-0.00246776,  0.00547334, -0.04004498, ..., -0.00040493,
         -0.00582164,  0.        ],
        [-0.00170247, -0.00282462,  0.05776043, ...,  0.00016502,
         -0.00949886,  0.        ],
        [ 0.01318384,  0.00417146, -0.08017494, ..., -0.00359574,
         -0.01637067,  0.        ],
        ...,
        [ 0.01225074,  0.01331451,  0.1047656 , ..., -0.

In [41]:
# Persist the explainer next to the notebook.
# The context manager guarantees the file is flushed and closed, even if
# pickling raises.
with open(Path.cwd() / 'titanic_explainer.pkl', 'wb') as explainer_file:
    pickle.dump(explainer, explainer_file)

In [42]:
# Start the Dash app in a subprocess. The server runs in the foreground, so
# this cell keeps running until it is stopped (CTRL+C / kernel interrupt).
! python app.py

Preparing to start dash app ...
Loading libraries...
loading DataExplainer object...
Loading Dash...
Defining layout...
Starting server...
 * Serving Flask app "app" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off
 * Running on http://127.0.0.1:8060/ (Press CTRL+C to quit)
127.0.0.1 - - [18/Aug/2019 16:45:02] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [18/Aug/2019 16:45:02] "GET /_dash-component-suites/dash_renderer/react-dom@16.8.6.min.js?v=1.0.0&m=1566138683 HTTP/1.1" 200 -
127.0.0.1 - - [18/Aug/2019 16:45:02] "GET /_dash-component-suites/dash_renderer/react@16.8.6.min.js?v=1.0.0&m=1566138683 HTTP/1.1" 200 -
127.0.0.1 - - [18/Aug/2019 16:45:02] "GET /_dash-component-suites/dash_renderer/prop-types@15.7.2.min.js?v=1.0.0&m=1566138683 HTTP/1.1" 200 -
127.0.0.1 - - [18/Aug/2019 16:45:02] "GET /_dash-component-suites/dash_core_components/highlight.pack.js?v=1.1.1&m=1566138668 HTTP/1.1" 200 -
127.0.0.1 - - [18/Aug/2019 16:45:02] "GET 