In [None]:
# Load IPython's autoreload extension. Mode 2 reloads all modules before each
# cell executes, so edits to local modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
# Print the path of the `python` executable that the shell resolves.
!which python

# 設定

In [None]:
import os
import sys
import datetime
import pickle
import json

import japanize_matplotlib #日本語化matplotlib
import numpy as np
import pandas as pd
import geopandas as gpd
import torch
import pytorch_lightning as pl

import main

# Make the sibling `data_processing` directory (one level up) importable,
# then import its `transform_data` module under the short alias `t`.
sys.path.append(os.path.join(os.pardir, 'data_processing'))
import transform_data as t

In [None]:
# Root of the project tree: the parent of the current working directory.
HOME_PATH = os.pardir

# --- Preprocessed inputs (under input/preprocessed) --------------------------
INPUT_PATH = os.path.join(HOME_PATH, 'input', 'preprocessed')
INPUT_MODELING_FILE = os.path.join(INPUT_PATH, 'modeling.pkl')      # machine-learning input file
INPUT_SUBMISSION_FILE = os.path.join(INPUT_PATH, 'submission.pkl')  # machine-learning input file
DATA_PROFILE_FILE = os.path.join(INPUT_PATH, 'data_profile.json')

# --- Artefacts under output/: model weights and result tables ----------------
RESULT_PATH = os.path.join(HOME_PATH, 'output')
MODEL_RESULT_PATH = os.path.join(RESULT_PATH, 'model')
DATA_RESULT_PATH = os.path.join(RESULT_PATH, 'data')
MODEL_FILE = os.path.join(MODEL_RESULT_PATH, 'model.pth')
FULL_RESULT_FILE = os.path.join(DATA_RESULT_PATH, 'full_result.csv')
PREDICTION_RESULT_FILE = os.path.join(DATA_RESULT_PATH, 'prediction_result.csv')
# Parse the data profile (JSON). A context manager is used so the file handle
# is closed as soon as parsing finishes instead of lingering until garbage
# collection.
with open(DATA_PROFILE_FILE) as profile_fp:
    DATA_PROFILE = json.load(profile_fp)
SCALER_FILE = os.path.join(INPUT_PATH, 'scaler.pkl')

In [None]:
# Re-create the Lightning module and restore the trained weights.
model = main.MyLitModule()
# setup() runs before the weights are loaded -- presumably it builds the layers
# the state dict is loaded into; confirm in main.MyLitModule.
model.setup()
# map_location='cpu' lets a checkpoint that was saved from a GPU load on a
# CPU-only host; the rest of this notebook keeps all tensors on the CPU.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
model.load_state_dict(torch.load(MODEL_FILE, map_location='cpu'))

# The target column name comes from the data profile; the prediction column is
# that name with a '_pred' suffix.
TARGET_COL = DATA_PROFILE['target']['name']
PRED_COL = TARGET_COL + '_pred'

In [None]:
# Result tables stored as CSV (full result first, then the predictions).
df_full, df_pred = (
    pd.read_csv(csv_path)
    for csv_path in (FULL_RESULT_FILE, PREDICTION_RESULT_FILE)
)

# Frames returned by the data_processing helper: combined, modeling, submission.
df_both, df_orig_modeling, df_orig_submission = t.load_data()

# Preprocessed machine-learning inputs stored as pickles.
df_input_modeling, df_input_submission = (
    pd.read_pickle(pkl_path)
    for pkl_path in (INPUT_MODELING_FILE, INPUT_SUBMISSION_FILE)
)

In [None]:
# Print the number of rows of every loaded frame, one count per line,
# in the order: results, originals, preprocessed inputs.
loaded_frames = (
    df_full,
    df_pred,
    df_orig_modeling,
    df_orig_submission,
    df_input_modeling,
    df_input_submission,
)
for df in loaded_frames:
    print(len(df))

In [None]:
# Preview the first five rows of the prediction results.
df_pred.head(n=5)

In [None]:
# Preview the first five rows of the original submission frame.
df_orig_submission.head(n=5)

In [None]:
# Join the original modeling frame (minus its target column) with the full
# result table, matching rows on the index of both frames.
modeling_without_target = df_orig_modeling.drop(columns=TARGET_COL)
df_orig_master = modeling_without_target.merge(
    df_full,
    how='inner',
    left_index=True,
    right_index=True,
)
df_orig_master

In [None]:
# Join the original submission frame with the prediction table on the index of
# both frames. Unlike the modeling join, TARGET_COL is not dropped here (an
# earlier variant of this cell dropped it from df_orig_submission first).
df_pred_master = df_orig_submission.merge(
    df_pred,
    how='inner',
    left_index=True,
    right_index=True,
)
df_pred_master

# Shap

参考：[SHapley Additive exPlanationsで機械学習モデルを解釈する](https://speakerdeck.com/dropout009/shapley-additive-explanationsdeji-jie-xue-xi-moderuwojie-shi-suru)

In [None]:
import shap

In [None]:
# Strip the target, its prediction and the 'data_usage' column; the remaining
# columns are what gets handed to the explainer, as a float32 array.
df_shap_full = df_full.drop(columns=[TARGET_COL, PRED_COL, 'data_usage'])
arr_shap_full = df_shap_full.values.astype(np.float32)
df_shap_full

In [None]:
%%time
# CPU times: user 3.35 ms, sys: 1.94 ms, total: 5.3 ms
# Wall time: 3.93 ms
explainer = shap.DeepExplainer(model, torch.from_numpy(arr_shap_full).to('cpu'))

In [None]:
%%time
# CPU times: user 26.7 s, sys: 3.93 s, total: 30.6 s
# Wall time: 22 s
# n=300
df_shap_sample = df_shap_full.sample(n=300)
arr_shap_sample = df_shap_sample.values.astype(np.float32)
features = df_shap_sample.columns.tolist()

In [None]:
# Compute SHAP values for the sampled rows, keep the first expected value as
# the baseline, then draw the summary plot over the sample.
sample_tensor = torch.from_numpy(arr_shap_sample).cpu()
shap_values = explainer.shap_values(sample_tensor)
base_value = explainer.expected_value[0]
shap.summary_plot(shap_values, features=df_shap_sample)

In [None]:
# Tabular view of the SHAP values, one column per feature name.
df_shap_values = pd.DataFrame(data=shap_values, columns=features)

In [None]:
# Force plot for the first sampled row, rendered with matplotlib on a logit link.
# NOTE(review): `features` is the list of column names, not the row's feature
# values -- confirm that showing names only is intended.
shap.force_plot(
    base_value,
    shap_values[0],
    features,
    link='logit',
    matplotlib=True,
)

In [None]:
# Feature whose SHAP dependence is plotted.
target_indicator = 'Age'

# interaction_index=None is passed explicitly instead of relying on the default.
shap.dependence_plot(
    target_indicator,
    shap_values,
    df_shap_sample,
    interaction_index=None,
)

In [None]:
# Decision plot over the first 100 sampled rows, on a logit link.
# `features` is the list of column names (see where it is built from the sample).
shap.decision_plot(
    base_value,
    shap_values[:100, :],
    features,
    link="logit",
    show=True,
)

In [None]:
# Per-feature mean and standard deviation of |SHAP| across the sampled rows,
# followed by the 20 features with the largest mean.
abs_shap = np.abs(shap_values)
df_shap = pd.DataFrame({
    "name": df_shap_values.columns,
    "mean_abs_shap": abs_shap.mean(axis=0),
    "stdev_abs_shap": abs_shap.std(axis=0),
})
df_shap.sort_values(by="mean_abs_shap", ascending=False).head(20)

# 個別データ

In [None]:
import altair as alt

# Optional in JupyterLab: requires an up-to-date vega labextension.
alt.renderers.enable('mimetype')

# Disable Altair's row-limit error: max_rows=None removes the cap on data size.
alt.data_transformers.enable('default', max_rows=None)