# Libraries

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.externals import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as ipy
from random import sample
import shap
shap.initjs()
%matplotlib inline
plt.style.use('seaborn-whitegrid')

# Data

In [None]:
# Name of the experiment directory whose artifacts are loaded below.
solution_path = 'solution_4_tuning_800'

# Training applications; the cells below read its SK_ID_CURR and TARGET columns.
application = pd.read_csv('/mnt/ml-team/minerva/open-solutions/home-credit/files/unzipped_data/application_train.csv')
# Out-of-fold train predictions; the cells below read fold_id, SK_ID_CURR and lightGBM_prediction.
oof_train = pd.read_csv('/mnt/ml-team/minerva/open-solutions/home-credit/kuba/experiments/{}/lightGBM_out_of_fold_train_predictions.csv'.format(solution_path))
# Persisted model object (presumably the LightGBM model trained on fold 0, judging by the file name).
model = joblib.load('/mnt/ml-team/minerva/open-solutions/home-credit/kuba/experiments/{}/transformers//light_gbm_fold_0'.format(solution_path))
# Persisted feature-joiner output; indexed with the 'features' key further down.
# NOTE(review): presumably the validation split of fold 0 - confirm against the pipeline.
features = joblib.load('/mnt/ml-team/minerva/open-solutions/home-credit/kuba/experiments/{}/outputs/feature_joiner_valid_fold_0'.format(solution_path))
# Column descriptions (Row / Description columns are used below); the file is read as latin1.
description = pd.read_csv('/mnt/ml-team/minerva/open-solutions/home-credit/data/HomeCredit_columns_description.csv', encoding='latin1')

In [None]:
# Customer ids (SK_ID_CURR) of every out-of-fold row that belongs to fold 0.
index_list = oof_train.loc[oof_train['fold_id'] == 0, 'SK_ID_CURR']

In [None]:
# Keep only the rows whose customer id appears in the fold-0 id list,
# both in the raw applications and in the out-of-fold predictions.
application_0 = application.loc[application['SK_ID_CURR'].isin(index_list)]
oof_train_0 = oof_train.loc[oof_train['SK_ID_CURR'].isin(index_list)]

In [None]:
# oof_train_0 and application_0 are already built by the preceding cell;
# the verbatim recomputation that used to live here was redundant.
# Feature matrix stored under the 'features' key of the loaded joiner output.
features_df = features['features']

In [None]:
# Work on a copy so the fold-0 prediction table itself stays untouched.
exploration_data = oof_train_0.copy()

# Look the target up by customer id rather than assigning it positionally:
# a positional `.values` assignment silently mislabels rows whenever the
# application table and the prediction table are not in the same row order.
# Assumes SK_ID_CURR is unique in application_0 (Series.map needs a unique index).
target_by_id = application_0.set_index('SK_ID_CURR')['TARGET']
exploration_data['target'] = exploration_data['SK_ID_CURR'].map(target_by_id)

# Signed and absolute prediction error (prediction minus true target).
signed_error = exploration_data['lightGBM_prediction'] - exploration_data['target']
exploration_data['diff_abs'] = signed_error.abs()
exploration_data['diff'] = signed_error

# Features description

##### Description of features (only those from Kaggle)

In [None]:
@ipy.interact(
    search=ipy.Text(
        description='Search',
    )
)
def gunc(search):
    """Show a feature picker filtered by ``search`` and print the chosen descriptions.

    ``search`` is matched case-insensitively as a substring of the ``Row``
    column of ``description``. When it is empty or matches nothing, every
    row name is offered. The first offered name is pre-selected.
    """
    all_rows = description['Row']
    rows = all_rows.tolist()
    if search:
        # na=False: rows with a missing name simply do not match instead of
        # raising, which the previous per-element ``.find`` call did.
        hits = all_rows.str.lower().str.contains(search.lower(), regex=False, na=False)
        matched = all_rows[hits].tolist()
        if matched:
            rows = matched
    selected = rows[0]

    @ipy.interact(
        cols=ipy.SelectMultiple(
            options=rows,
            rows=10,
            value=(selected,),
            description='Features',
            layout=ipy.Layout(width='90%')
        )
    )
    def func(cols):
        """Display '<name> --- <description>' for every selected row name."""
        for col in cols:
            # Only the first description is shown when a name occurs more than once.
            text = description.loc[description['Row'] == col, 'Description'].values[0]
            display('{} --- {}'.format(col, text))

# Shap - Feature impact on a model

https://github.com/slundberg/shap 

https://arxiv.org/pdf/1802.03888.pdf 

http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf 

Our first step is to compute SHAP values for each example in our dataset. $base\_value$ is the mean of our predictions; in our dataset it equals $0.04942$ and it is marked on the plots below. The SHAP value tells us how much a certain feature moved the prediction for a certain example away from the expected value of all predictions.
$$ output\_value (x) = base\_value + \sum\limits_{i=1}^{M} \phi_{i}z_{i}(x) $$,

where $ z_i(x) \in \{0, 1\}$ describes if $i$-th feature-value(e.g. SEX='Male') occurs at example $x$ and $\phi_i$ is SHAP value of given feature.

In [None]:
# SHAP attributions of the loaded model for every row of the feature matrix.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(features_df)

# Mean absolute attribution per column. The last column is dropped
# (presumably the explainer's bias / expected-value term - TODO confirm).
global_shap_vals = np.mean(np.abs(shap_values), axis=0)[:-1]

# Column positions ordered from the smallest to the largest mean |SHAP|.
inds = global_shap_vals.argsort()

In [None]:
# Force plots explaining the predictions for the first two rows of the feature matrix.
for row in (0, 1):
    display(shap.force_plot(shap_values[row, :], features_df.iloc[row, :], link="logit"))

# Feature importance

##### Our metric will be mean of SHAP's absolute values, which tell how much each feature is moving up or down predictions of our model from $base\_value$. Features are sorted from most to least important.

In [None]:
@ipy.interact(cols=ipy.IntRangeSlider(
                value=(1, 20),
                min=1,
                max=features_df.shape[1],
                description='Features:',
                continuous_update=False,
                layout=ipy.Layout(width='90%', height='30px')
            ))
def func(cols):
    """Bar-plot mean(|SHAP|) for the features ranked ``cols[0]`` to ``cols[1]``.

    Ranks are 1-based and inclusive on both ends; rank 1 is the feature with
    the largest mean absolute SHAP value.
    """
    first_rank, last_rank = cols
    n_features = len(global_shap_vals)

    # `inds` is sorted ascending, so rank r lives at position n_features - r.
    # The previous slice [-last_rank:-first_rank] stopped one short and could
    # therefore never show the most important feature.
    start = max(n_features - last_rank, 0)
    stop = n_features - first_rank + 1
    shown = inds[start:stop]
    y_pos = np.arange(start, start + len(shown))

    plt.title("Feature importance: mean(|SHAP|)")
    plt.barh(y_pos, global_shap_vals[shown], color="#1E88E5")
    plt.yticks(y_pos, features_df.columns[shown])
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['top'].set_visible(False)
    plt.xlabel("mean SHAP value magnitude (change in log odds)")
    # Keep at least one inch of height; a zero-height figure cannot be drawn.
    plt.gcf().set_size_inches(11, max(len(shown) // 2, 1))
    plt.show()

##### The plot below shows how the top-20 features change the model predictions depending on their values.
    * The x-axis shows SHAP values
    * Each row corresponds to one feature
    * Each point corresponds to one example in the dataset
    * Each point is colored by its feature value (HIGH ~ Red, LOW ~ Blue)

In [None]:
# summary_plot draws the figure itself and returns nothing, so wrapping it in
# display() only added a stray "None" to the cell output.
shap.summary_plot(shap_values, features_df, max_display=20)

##### Relation between SHAP and feature values and coloring with values of another correlated feature.

In [None]:
@ipy.interact(
    search=ipy.Text(
        description='Search',
    )
)
def gunc(search):
    """Show a feature picker filtered by ``search`` and draw SHAP dependence plots.

    Features are listed from most to least important (by mean |SHAP|).
    ``search`` is matched case-insensitively as a substring of the feature
    name; when it is empty or matches nothing, all features are offered.
    """
    ranked = list(reversed(features_df.columns[inds]))
    rows = ranked
    if search:
        needle = search.lower()
        matched = [name for name in ranked if needle in name.lower()]
        # Fall back to the full feature list. The old fallback offered the
        # Kaggle description rows, which are not necessarily model features.
        if matched:
            rows = matched
    selected = rows[0]

    @ipy.interact(
        cols=ipy.SelectMultiple(
            options=rows,
            rows=10,
            value=(selected, ),
            description='Corr Columns',
            layout=ipy.Layout(width='90%')
        ),
        num_samples=ipy.IntSlider(
            value=250,
            min=100,
            max=1000,
            step=50,
            continuous_update=False,
            description='Samples:',
            layout=ipy.Layout(width='90%', height='30px')
        )
    )
    def func(cols, num_samples):
        """Draw one dependence plot per selected feature on a random row sample."""
        n_rows = len(shap_values)
        # Never ask for more rows than exist; random.sample would raise.
        smp = sample(range(n_rows), min(num_samples, n_rows))
        for col in cols:
            # `smp` holds row positions, so the features must be taken with
            # iloc to stay aligned with the positional shap_values rows.
            # dependence_plot renders the figure itself; nothing to display().
            shap.dependence_plot(col, shap_values[smp], features_df.iloc[smp, :])

# Predictions

In [None]:
plt.figure(figsize=(17, 5))
plt.title("Distribution of |Difference between predictions and target|")

# One overlaid distribution of the absolute error per target class.
for target_value, legend_label, colour in ((0, 'Target_0', '#1587E8'),
                                           (1, 'Target_1', '#F02958')):
    class_errors = exploration_data.loc[exploration_data['target'] == target_value, 'diff_abs']
    sns.distplot(class_errors,
                 label=legend_label,
                 color=colour,
                 hist_kws={'alpha': 0.8},
                 bins=100)

plt.legend()
plt.show()

In [None]:
# Copy of the feature matrix; the next cell adds analysis columns to it
# without modifying features_df itself.
feature_analysis = features_df.copy()

In [None]:
# Attach ids, targets and errors positionally (`.values`), the same way the
# target was attached to exploration_data. Assigning the Series directly
# aligns on index labels, and feature_analysis does not necessarily share
# the index of the prediction table - mismatched labels silently become NaN.
# NOTE(review): assumes features_df rows are in the same order as the fold-0
# rows of oof_train - confirm. A length mismatch now raises.
feature_analysis['INDEX'] = index_list.values
feature_analysis['TARGET'] = exploration_data['target'].values
feature_analysis['DIFF'] = exploration_data['diff'].values
feature_analysis['DIFF_ABS'] = exploration_data['diff_abs'].values

##### Distributions of chosen features depending on the target value

In [None]:
@ipy.interact(
    search=ipy.Text(
        description='Search',
    )
)
def gunc(search):
    """Show a feature picker filtered by ``search`` and draw per-target swarm plots.

    Features are listed from most to least important (by mean |SHAP|).
    ``search`` is matched case-insensitively as a substring of the feature
    name; when it is empty or matches nothing, all features are offered.
    """
    ranked = list(reversed(features_df.columns[inds]))
    rows = ranked
    if search:
        needle = search.lower()
        matched = [name for name in ranked if needle in name.lower()]
        # Fall back to the full feature list. The old fallback offered the
        # Kaggle description rows, which need not be columns of feature_analysis.
        if matched:
            rows = matched
    selected = rows[0]

    @ipy.interact(
        cols=ipy.SelectMultiple(
            options=rows,
            rows=10,
            value=(selected, ),
            description='Corr Columns',
            layout=ipy.Layout(width='90%')
        ),
        num_samples=ipy.IntSlider(
            value=250,
            min=100,
            max=1000,
            step=50,
            continuous_update=False,
            description='Samples:',
            layout=ipy.Layout(width='90%', height='30px')
        )
    )
    def func(cols, num_samples):
        """Swarm-plot each selected feature for up to ``num_samples`` rows per target class."""
        # The class split does not depend on the plotted column; do it once.
        target_1 = feature_analysis[feature_analysis['TARGET'] == 1]
        target_0 = feature_analysis[feature_analysis['TARGET'] == 0]
        for col in cols:
            plt.figure(figsize=(16, 8))
            # A fresh random sample per column, capped at the class size.
            smp_1 = sample(range(len(target_1)), min(num_samples, len(target_1)))
            smp_0 = sample(range(len(target_0)), min(num_samples, len(target_0)))
            df = pd.concat([target_1.iloc[smp_1, :], target_0.iloc[smp_0, :]], axis=0)
            # plt.show() renders the figure; display() of the Axes only
            # printed its repr.
            sns.swarmplot(x='TARGET', y=col, data=df, palette=['#1587E8', '#F02958'])
            plt.show()

##### Distributions of chosen features depending on the absolute difference between predictions and target

In [None]:
def get_between(df, col, interval):
    """Return the rows of ``df`` whose ``col`` value lies within ``interval``.

    ``interval`` is indexable as ``(low, high)``; both ends are inclusive.
    Rows where ``col`` is NaN are never selected.
    """
    lower, upper = interval[0], interval[1]
    in_range = df[col].between(lower, upper)
    return df.loc[in_range]

def col_without_nan(df, col):
    """Return column ``col`` of ``df`` with missing values removed.

    The original index labels of the surviving rows are kept. ``dropna`` is
    used instead of ``np.isnan`` so that non-numeric (object) columns work
    too; ``np.isnan`` raises ``TypeError`` on them.
    """
    return df[col].dropna()

@ipy.interact(
    search=ipy.Text(
        description='Search',
        continuous_update=False
    )
)
def gunc(search):
    """Show a feature picker and compare feature distributions across two error bands.

    Features are listed from most to least important (by mean |SHAP|).
    ``search`` is matched case-insensitively as a substring of the feature
    name; when it is empty or matches nothing, all features are offered.
    """
    ranked = list(reversed(features_df.columns[inds]))
    rows = ranked
    if search:
        needle = search.lower()
        matched = [name for name in ranked if needle in name.lower()]
        # Fall back to the full feature list. The old fallback offered the
        # Kaggle description rows, which need not be columns of feature_analysis.
        if matched:
            rows = matched
    selected = rows[0]

    @ipy.interact(
        diff_1=ipy.FloatRangeSlider(
            value=(0.5, 1.0),
            min=0.0,
            max=1.0,
            step=0.01,
            description='Difference_1:',
            continuous_update=False,
            layout=ipy.Layout(width='90%', height='30px')
        ),
        diff_2=ipy.FloatRangeSlider(
            value=(0.0, 0.5),
            min=0.0,
            max=1.0,
            step=0.01,
            description='Difference_2:',
            continuous_update=False,
            layout=ipy.Layout(width='90%', height='30px')
        ),
        cols=ipy.SelectMultiple(
            options=rows,
            rows=10,
            value=(selected, ),
            description='Columns',
            layout=ipy.Layout(width='90%')
        ))
    def func(diff_1, diff_2, cols):
        """Overlay, per selected feature, its distribution within each |error| band.

        ``diff_1`` and ``diff_2`` are inclusive (low, high) ranges of DIFF_ABS.
        """
        for col in cols:
            vals_1 = col_without_nan(get_between(feature_analysis, 'DIFF_ABS', diff_1), col)
            vals_2 = col_without_nan(get_between(feature_analysis, 'DIFF_ABS', diff_2), col)
            bands = (
                (vals_1, 'Difference_1', '#F02958'),
                (vals_2, 'Difference_2', '#1587E8'),
            )
            plotted = False
            for vals, label, color in bands:
                # An empty band would request bins=0 and make the plot call fail.
                if len(vals) == 0:
                    print('{}: no rows for {}'.format(col, label))
                    continue
                sns.distplot(vals, label=label, color=color, hist_kws={'alpha': 0.7},
                             bins=min(100, len(vals)))
                plotted = True
            if plotted:
                plt.legend()
                plt.show()