# setting

In [1]:
%matplotlib inline 

import gc
import json
import requests
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import category_encoders as ce

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
import lightgbm as lgb
import optuna

from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.model_selection import learning_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix

from sklearn.inspection import partial_dependence
import shap

# dataset

In [2]:
# Read the training transaction table and the identity table; the identity
# rows are flagged so that matched transactions can be told apart after the join.
df_transaction_train = pd.read_csv("../input/ieee-fraud-detection/train_transaction.csv")
df_identity_train = pd.read_csv("../input/ieee-fraud-detection/train_identity.csv").assign(specifyIdentity=1)

# Left join keeps every transaction; transactions without an identity row get
# NaN in `specifyIdentity`, which is turned into 0 below.
df_train = (
    df_transaction_train
    .merge(df_identity_train, how='left', on="TransactionID")
    .assign(isTest=0)
)
df_train["specifyIdentity"] = df_train["specifyIdentity"].fillna(0)
df_train.describe()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,...,id_19,id_20,id_21,id_22,id_24,id_25,id_26,id_32,specifyIdentity,isTest
count,590540.0,590540.0,590540.0,590540.0,590540.0,581607.0,588975.0,586281.0,524834.0,524834.0,...,139318.0,139261.0,5159.0,5169.0,4747.0,5132.0,5163.0,77586.0,590540.0,590540.0
mean,3282270.0,0.03499,7372311.0,135.027176,9898.734658,362.555488,153.194925,199.278897,290.733794,86.80063,...,353.128174,403.882666,368.26982,16.002708,12.800927,329.608924,149.070308,26.508597,0.244239,0.0
std,170474.4,0.183755,4617224.0,239.162522,4901.170153,157.793246,11.336444,41.244453,101.741072,2.690623,...,141.095343,152.160327,198.847038,6.897665,2.372447,97.461089,32.101995,3.737502,0.429636,0.0
min,2987000.0,0.0,86400.0,0.251,1000.0,100.0,100.0,100.0,100.0,10.0,...,100.0,100.0,100.0,10.0,11.0,100.0,100.0,0.0,0.0,0.0
25%,3134635.0,0.0,3027058.0,43.321,6019.0,214.0,150.0,166.0,204.0,87.0,...,266.0,256.0,252.0,14.0,11.0,321.0,119.0,24.0,0.0,0.0
50%,3282270.0,0.0,7306528.0,68.769,9678.0,361.0,150.0,226.0,299.0,87.0,...,341.0,472.0,252.0,14.0,11.0,321.0,149.0,24.0,0.0,0.0
75%,3429904.0,0.0,11246620.0,125.0,14184.0,512.0,150.0,226.0,330.0,87.0,...,427.0,533.0,486.5,14.0,15.0,371.0,169.0,32.0,0.0,0.0
max,3577539.0,1.0,15811130.0,31937.391,18396.0,600.0,231.0,237.0,540.0,102.0,...,671.0,661.0,854.0,44.0,26.0,548.0,216.0,32.0,1.0,0.0


In [3]:
# Same loading recipe as for the training files, applied to the test files.
df_transaction_test = pd.read_csv("../input/ieee-fraud-detection/test_transaction.csv")
df_identity_test = pd.read_csv("../input/ieee-fraud-detection/test_identity.csv").assign(specifyIdentity=1)

# Left join keeps every transaction; unmatched rows end up with specifyIdentity = 0.
df_test = (
    df_transaction_test
    .merge(df_identity_test, how='left', on="TransactionID")
    .assign(isTest=1)
)
df_test["specifyIdentity"] = df_test["specifyIdentity"].fillna(0)
df_test.describe()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,...,id_19,id_20,id_21,id_22,id_24,id_25,id_26,id_32,specifyIdentity,isTest
count,506691.0,506691.0,506691.0,506691.0,498037.0,503689.0,502144.0,441082.0,441082.0,215474.0,...,135906.0,135633.0,5059.0,5062.0,4740.0,5039.0,5047.0,70671.0,506691.0,506691.0
mean,3916894.0,26929940.0,134.725568,9957.222175,363.735379,153.543409,200.162975,291.846514,86.723412,87.06527,...,350.122982,408.88623,507.727021,15.336823,13.166667,332.043064,152.752923,26.217939,0.280066,1.0
std,146269.2,4756507.0,245.779822,4884.960969,158.688653,12.443013,40.562461,102.06273,2.987328,314.131694,...,139.140824,158.971756,227.371061,5.618032,3.22244,86.356683,31.916995,3.601046,0.449032,0.0
min,3663549.0,18403220.0,0.018,1001.0,100.0,100.0,100.0,100.0,10.0,0.0,...,100.0,100.0,100.0,11.0,10.0,100.0,100.0,8.0,0.0,1.0
25%,3790222.0,22771540.0,40.0,6019.0,207.0,150.0,166.0,204.0,87.0,3.0,...,266.0,256.0,252.0,14.0,11.0,321.0,137.0,24.0,0.0,1.0
50%,3916894.0,27204660.0,67.95,9803.0,369.0,150.0,226.0,299.0,87.0,8.0,...,321.0,484.0,576.0,14.0,11.0,321.0,147.0,24.0,0.0,1.0
75%,4043566.0,31348560.0,125.0,14276.0,512.0,150.0,226.0,330.0,87.0,20.0,...,427.0,549.0,711.0,14.0,15.0,355.0,182.0,32.0,1.0,1.0
max,4170239.0,34214340.0,10270.0,18397.0,600.0,232.0,237.0,540.0,102.0,8081.0,...,670.0,660.0,854.0,44.0,26.0,549.0,216.0,48.0,1.0,1.0


In [4]:
# The per-file frames are no longer needed once merged; release them and
# ask the garbage collector to reclaim the memory right away.
del df_transaction_train, df_identity_train
del df_transaction_test, df_identity_test

gc.collect()

53

In [5]:
# Stack train and test rows into one frame so that preprocessing is applied
# identically to both. The two frames do not have the same column set (only the
# training frame carries `isFraud`), so pandas must align the columns. `sort` is
# passed explicitly to keep the alphabetical column order this notebook was run
# with and to avoid the pandas FutureWarning about the changing default.
df = pd.concat([df_train, df_test], sort=True).reset_index(drop=True)
df.describe()


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





Unnamed: 0,C1,C10,C11,C12,C13,C14,C2,C3,C4,C5,...,id_20,id_21,id_22,id_24,id_25,id_26,id_32,isFraud,isTest,specifyIdentity
count,1097228.0,1097228.0,1097228.0,1097228.0,1092483.0,1097228.0,1097228.0,1097228.0,1097228.0,1097228.0,...,274894.0,10218.0,10231.0,9487.0,10171.0,10210.0,148257.0,590540.0,1097231.0,1097231.0
mean,12.24565,3.656317,8.968402,3.417374,30.36952,7.274049,13.16624,0.01569227,3.304229,5.290377,...,406.35143,437.316011,15.673248,12.983662,330.814866,150.890695,26.370047,0.03499,0.4617906,0.2607837
std,111.8594,71.96617,80.79689,73.40473,117.257,41.14092,128.5684,0.1904406,58.00246,25.63929,...,155.578203,224.537299,6.305614,2.834988,92.130618,32.062027,3.675944,0.183755,0.4985381,0.4390624
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,100.0,100.0,10.0,10.0,100.0,100.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,256.0,252.0,14.0,11.0,321.0,121.0,24.0,0.0,0.0,0.0
50%,1.0,0.0,1.0,0.0,3.0,1.0,1.0,0.0,0.0,0.0,...,472.0,277.0,14.0,11.0,321.0,147.0,24.0,0.0,0.0,0.0
75%,3.0,0.0,2.0,0.0,13.0,2.0,3.0,0.0,0.0,1.0,...,539.0,711.0,14.0,15.0,360.0,173.75,32.0,0.0,1.0,1.0
max,4685.0,3257.0,3188.0,3188.0,2918.0,1429.0,5691.0,31.0,2253.0,376.0,...,661.0,854.0,44.0,26.0,549.0,216.0,48.0,1.0,1.0,1.0


In [6]:
# Only the combined frame is used from here on; free the two source frames.
del df_train, df_test

gc.collect()

21

# preprocessing

In [7]:
# Columns with exactly one distinct value (NaN counted as a value) cannot help a model.
list_col_constant_val = [
    name for name, series in df.items()
    if series.nunique(dropna=False) == 1
]
list_col_constant_val

In [8]:
# Remove the single-valued columns found in the previous cell.
df = df.drop(list_col_constant_val, axis=1)

In [9]:
# Per-row share of populated fields: one minus the fraction of columns that are NaN.
df["input_rate"] = 1 - df.isna().mean(axis=1)
df["input_rate"].describe()

count    1.097231e+06
mean     5.689385e-01
std      1.314793e-01
min      2.201835e-01
25%      4.793578e-01
50%      5.160550e-01
75%      6.376147e-01
max      9.449541e-01
Name: input_rate, dtype: float64

In [10]:
# https://www.kaggle.com/danofer/ieee-fraud-features-xgboost-0-934-lb

# `TransactionDT` is treated as a number of seconds counted from START_DATE.
# NOTE(review): the reference date comes from the kernel linked above — confirm
# it is appropriate before reading too much into the calendar features.
START_DATE = "2017-12-01"
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
# Vectorised conversion: one timedelta64 operation instead of a Python-level
# lambda call per row.
df["TransactionDT"] = pd.to_timedelta(df["TransactionDT"], unit="s") + startdate

# df["month"] = df["TransactionDT"].dt.month
df["TransactionDT_dow"] = df["TransactionDT"].dt.dayofweek
df["TransactionDT_hour"] = df["TransactionDT"].dt.hour
df["TransactionDT_day"] = df["TransactionDT"].dt.day
# Bucket the day of month: 1-9 -> "head", 10-19 -> "middle", 20+ -> "tail".
df["TransactionDT_part_of_month"] = np.select(
    [df["TransactionDT_day"] < 10, df["TransactionDT_day"] < 20],
    ["head", "middle"],
    default="tail",
)

#  df.drop(["TransactionDT"],axis=1,inplace=True)

In [11]:
# #https://www.kaggle.com/c/ieee-fraud-detection/discussion/100499#latest-579654
dict_emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 'scranton.edu': 'other', 'optonline.net': 'other', 'hotmail.co.uk': 'microsoft', 'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo', 'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft', 'aim.com': 'aol', 'hotmail.de': 'microsoft', 'centurylink.net': 'centurylink', 'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other', 'gmx.de': 'other', 'web.de': 'other', 'cfl.rr.com': 'other', 'hotmail.com': 'microsoft', 'protonmail.com': 'other', 'hotmail.fr': 'microsoft', 'windstream.net': 'other', 'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo', 'yahoo.de': 'yahoo', 'servicios-ta.com': 'other', 'netzero.net': 'other', 'suddenlink.net': 'other', 'roadrunner.com': 'other', 'sc.rr.com': 'other', 'live.fr': 'microsoft', 'verizon.net': 'yahoo', 'msn.com': 'microsoft', 'q.com': 'centurylink', 'prodigy.net.mx': 'att', 'frontier.com': 'yahoo', 'anonymous.com': 'other', 'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 'frontiernet.net': 'yahoo', 'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'other', 'bellsouth.net': 'other', 'embarqmail.com': 'centurylink', 'cableone.net': 'other', 'hotmail.es': 'microsoft', 'mac.com': 'apple', 'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 'ptd.net': 'other', 'cox.net': 'other', 'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple'}
list_us_emails = ['gmail', 'net', 'edu']

# Derive two coarse features from each e-mail domain column:
#   <col>_bin    - provider group looked up in dict_emails (unmapped/missing -> NaN)
#   <col>_suffix - text after the last '.', with the entries of list_us_emails
#                  collapsed into 'us'
# Missing domains are kept as NaN in the suffix column (rather than the literal
# string 'nan') so that they are treated as missing just like in the `_bin`
# column and in the original domain column.
for col in ['P_emaildomain', 'R_emaildomain']:
    df[col + '_bin'] = df[col].map(dict_emails)
    suffix = df[col].str.rsplit('.', n=1).str[-1]
    df[col + '_suffix'] = suffix.where(~suffix.isin(list_us_emails), 'us')

In [12]:
# Raw columns that are handled as categorical features. The numbered families
# (card1-6, M1-9, id_12-38) are generated instead of being spelled out.
list_categorical_feature = (
    ["ProductCD"]
    + ["card{}".format(i) for i in range(1, 7)]
    + ["addr1", "addr2", "P_emaildomain", "R_emaildomain"]
    + ["M{}".format(i) for i in range(1, 10)]
    + ["DeviceType", "DeviceInfo"]
    + ["id_{}".format(i) for i in range(12, 39)]
)

In [13]:
# Add the engineered date and e-mail columns to the categorical feature list.
list_categorical_feature += [
    "TransactionDT_dow",
    "TransactionDT_hour",
    "TransactionDT_day",
    "TransactionDT_part_of_month",
    "P_emaildomain_bin",
    "P_emaildomain_suffix",
    "R_emaildomain_bin",
    "R_emaildomain_suffix",
]

In [14]:
# Replace every categorical column by its integer category code
# (pandas assigns -1 to missing values).
for col in list_categorical_feature:
    df[col] = df[col].astype('category').cat.codes

# modeling

In [21]:
# The label used here is `isTest` (train row vs. test row), so the feature
# matrix is built from ALL rows and must not contain the label itself,
# `isFraud` (only known for training rows) or the raw timestamp.
# X_train = df.query('isTest == 0').drop(columns=["TransactionDT", "isFraud", "isTest"])
X_train = df.drop(columns=["TransactionDT", "isFraud", "isTest"])
# y_train = df.query('isTest == 0')["isTest"]
y_train = df["isTest"]

# Stratified 80/20 split; TransactionID is kept aside as a key, not used as a feature.
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, train_size=0.8, random_state=175, shuffle=True, stratify=y_train)
key_train = X_train["TransactionID"]
X_train = X_train.drop(columns=["TransactionID"])
key_valid = X_valid["TransactionID"]
X_valid = X_valid.drop(columns=["TransactionID"])

# `isTest` has to be dropped here as well: otherwise X_test carries one column
# more than the matrix the model is trained on and prediction cannot work.
X_test = df.query('isTest == 1').drop(columns=["TransactionDT", "isFraud", "isTest"])
key_test = X_test["TransactionID"]
X_test = X_test.drop(columns=["TransactionID"])

assert list(X_test.columns) == list(X_train.columns), "X_test and X_train must share the same features"

In [22]:
# LightGBM datasets for fitting and for evaluation; the evaluation set
# references the training set, and both keep their raw data
# (free_raw_data=False).
train_data = lgb.Dataset(
    X_train,
    label=y_train,
    categorical_feature=list_categorical_feature,
    free_raw_data=False,
)
eval_data = lgb.Dataset(
    X_valid,
    label=y_valid,
    reference=train_data,
    categorical_feature=list_categorical_feature,
    free_raw_data=False,
)


In [23]:
y_train.value_counts()

0    472432
1    405352
Name: isTest, dtype: int64

In [24]:
y_train.value_counts()[0]

472432

In [25]:
# Number of label-0 rows per label-1 row in the training split.
neg_pos_ratio = (y_train == 0).sum() / (y_train == 1).sum()
neg_pos_ratio

1.1654857999960528

In [26]:
# def objective(trial):
#     params = {
#         'boosting': 'gbdt', 
#         'objective': 'binary', 
#         'metric':'auc', 
#         'seed': 175, 
#         'max_depth': trial.suggest_int('max_depth', 3, 10), 
#         'learning_rate': trial.suggest_loguniform('learning_rate', 1e-8, 1.0), 
#         'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.5, 1.0), 
#         'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 1.0)
#     }
    
#     gbm = lgb.train(
#         params,
#         train_data,
#         valid_sets=eval_data,
#         num_boost_round=1000,
#         early_stopping_rounds=100
#     )
#     y_pred_prob = gbm.predict(X_valid)
#     fpr, tpr, thresholds = roc_curve(y_valid, y_pred_prob, pos_label=1)
#     score = auc(fpr, tpr)
#     return score  # the study below uses direction='maximize', so return the AUC itself (not its negative)

# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=100)

# print('Number of finished trials: {}'.format(len(study.trials)))

# print('Best trial:')
# trial = study.best_trial

# print('  Value: {}'.format(trial.value))

# print('  Params: ')
# for key, value in trial.params.items():
#     print('    {}: {}'.format(key, value))

In [27]:
# best_max_depth = trial.params["max_depth"]
# best_learning_rate = trial.params["learning_rate"]
# best_bagging_fraction = trial.params["bagging_fraction"]
# best_feature_fraction = trial.params["feature_fraction"]

In [28]:
# params = {
#     'boosting': 'gbdt', 
#     'objective': 'binary', 
#     'metric':'auc', 
#     'seed': 175, 
#     'max_depth': best_max_depth, 
#     'learning_rate':best_learning_rate, 
#     'bagging_fraction': best_bagging_fraction, 
#     'feature_fraction': best_feature_fraction, 
#     'verbosity': -1
# }

In [29]:
# Hand-picked LightGBM settings for the binary classifier (AUC is the tracked metric).
# The commented-out entries are alternative ways of handling class imbalance.
params = dict(
    boosting='gbdt',
    objective='binary',
    metric='auc',
    seed=175,
    num_leaves=127,
    max_depth=-1,
    learning_rate=0.02,
    bagging_fraction=0.8,
    # pos_bagging_fraction=1,
    # neg_bagging_fraction=1 / neg_pos_ratio,
    bagging_freq=1,
    feature_fraction=0.8,
    # scale_pos_weight=neg_pos_ratio,
    is_unbalance=True,
    verbosity=-1,
)

In [30]:
# Fit the booster on `train_data`, tracking the metric on both the training and
# the evaluation set. `evaluation_results` is filled in by `lgb.train` (via
# `evals_result`) and is plotted later in the notebook.
# NOTE(review): the recorded run of this cell ended in a MemoryError, so every
# cell below that uses `gbm` has no recorded output.
# NOTE(review): `evals_result`, `early_stopping_rounds` and `verbose_eval` are
# keyword arguments of the LightGBM version this notebook was written for —
# confirm they are still accepted by the installed version.
evaluation_results = {}
gbm = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, eval_data],
    valid_names=['Train', 'Eval'],
    evals_result=evaluation_results,
    num_boost_round=1000,
    early_stopping_rounds=100, 
    verbose_eval=20
)


Using categorical_feature in Dataset.



MemoryError: 

In [None]:
lgb.plot_importance(gbm, importance_type='gain', max_num_features=30, figsize=(12, 6))

In [None]:
lgb.plot_importance(gbm, importance_type='split', max_num_features=30, figsize=(12, 6))

In [None]:
# lgb.plot_split_value_histogram(gbm, "TransactionAmt")

In [None]:
# lgb.plot_metric(gbm)

In [None]:
y_pred_prob = gbm.predict(X_valid)
y_pred_prob

In [None]:
# Validation features together with the true label and the predicted score.
df_valid = X_valid.assign(isTest=y_valid, predict_prob=y_pred_prob)

In [None]:
# sns.catplot(x="predict_prob", y="isTest", data=df_valid, kind='violin')

In [None]:
# Distribution of the predicted score for the two label groups of the
# validation set: top panel = normalised histograms, bottom panel = KDE curves.
# The groups are defined by `isTest`, so they are labelled train / test.
fig = plt.figure()
ax1 = fig.add_subplot(211)
ax2 = fig.add_subplot(212, sharex=ax1)

for is_test, group_label, group_color in [(0, "train", "steelblue"), (1, "test", "darkred")]:
    group_scores = df_valid.loc[df_valid["isTest"] == is_test, "predict_prob"]
    sns.distplot(group_scores, bins=50, hist=True, kde=False, norm_hist=True, color=group_color, label=group_label, ax=ax1)
    sns.distplot(group_scores, bins=50, hist=False, kde=True, norm_hist=False, color=group_color, label=group_label, ax=ax2)

# Without an explicit legend the labels passed above are never shown on the histogram.
ax1.legend()
ax2.legend()

plt.xlim([0, 1])

In [None]:
# Raw-count histogram of the predicted score for the isTest == 1 rows only.
sns.distplot(df_valid.loc[df_valid["isTest"]==1, "predict_prob"], bins=50, hist=True, kde=False, norm_hist=False, color="darkred", label="test")

In [None]:
df_valid.loc[df_valid["isTest"]==1, ].sort_values("predict_prob").head()

In [None]:
df_valid.loc[df_valid["isTest"]==1, ].sort_values("predict_prob").tail()

In [None]:
# df_valid["input_rate"] = 1 - df_valid.isna().sum(axis=1) / len(df_valid.columns)
# df_valid["input_rate"].describe()

In [None]:
# sns.lmplot(x="predict_prob", y="input_rate", data=df_valid.query('isFraud == 1'))

In [None]:
# sns.relplot(x="predict_prob", y="input_rate", col="isTest", data=df_valid)

In [None]:
# grid = sns.FacetGrid(df_valid, col="specifyIdentity", row="isTest")
# grid.map(sns.distplot, "predict_prob", hist=False, kde=True)

In [None]:
# def plot_learning_curve(estimator, X, y, scoring_metrix, cv, 
#                         train_sizes=np.linspace(0.1, 1.0, 10), 
#                         title="learning curve", ylim=(0.0, 1.01)):
#     plt.figure()
#     train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, 
#         groups=None, train_sizes=train_sizes, cv=cv, scoring=scoring_metrix, 
#         exploit_incremental_learning=False, n_jobs=-1, pre_dispatch='all', 
#         verbose=0, shuffle=False, random_state=0, error_score='raise-deprecation')
#     train_scores_mean = np.mean(train_scores, axis=1)
#     train_scores_std = np.std(train_scores, axis=1)
#     test_scores_mean = np.mean(test_scores, axis=1)
#     test_scores_std = np.std(test_scores, axis=1)

#     plt.fill_between(train_sizes, train_scores_mean - train_scores_std, 
#         train_scores_mean + train_scores_std, alpha=0.1, color='royalblue')
#     plt.fill_between(train_sizes, test_scores_mean - test_scores_std, 
#         test_scores_mean + test_scores_std, alpha=0.1, color='orange')
#     plt.plot(train_sizes, train_scores_mean, 'o-', color='royalblue', label="training score")
#     plt.plot(test_sizes, test_scores_mean, 'o-', color='royalblue', label="cross-validation score")

#     plt.set_title(title)
#     plt.grid()
#     plt.xlabel("training sample")
#     plt.ylabel("score")
#     plt.ylim(ylim)
#     plt.legend(loc='lower right')

#     return plt

In [None]:
# plot_learning_curve(gbm, X_valid, y_valid, scoring_metrix='roc_auc', cv=StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0))

In [None]:
# AUC per boosting round for the training and the evaluation set.
for split_name in ('Train', 'Eval'):
    plt.plot(evaluation_results[split_name]['auc'], label=split_name)
plt.title('Training performance')
plt.xlabel('Boosting round')
plt.ylabel('Auc')
plt.legend()

In [None]:
def plot_roc_curve(y, y_pred_prob, title="ROC curve"):
    """Draw the ROC curve of a binary classifier on the current matplotlib axes.

    Args:
        y: true binary labels (1 is the positive class).
        y_pred_prob: predicted scores for the positive class.
        title: title of the plot.

    Returns:
        The `matplotlib.pyplot` module, so the caller can keep customising or show the figure.
    """
    fpr, tpr, _ = roc_curve(y, y_pred_prob, pos_label=1)
    # The area under the curve is shown in the legend entry.
    curve_label = "ROC curve (area = %.4f)" % auc(fpr, tpr)

    plt.plot(fpr, tpr, label=curve_label)
    plt.xlabel("false positive rate")
    plt.ylabel("true positive rate")
    plt.title(title)
    plt.grid()
    plt.legend()

    return plt

In [None]:
plot_roc_curve(y_valid, y_pred_prob)

In [None]:
def plot_score_changed_thresholds(y, y_pred_prob, title="model-score changed thresholds"):
    """Plot precision, recall and F-measure as a function of the decision threshold.

    Args:
        y: true binary labels (1 is the positive class).
        y_pred_prob: predicted scores for the positive class.
        title: title of the plot.

    Returns:
        The `matplotlib.pyplot` module, so the caller can keep customising or show the figure.
    """
    precision, recall, thresholds = precision_recall_curve(y, y_pred_prob, pos_label=1)
    # Pad the thresholds with a final 1 so they line up with precision/recall,
    # which are returned with one extra trailing element.
    thresholds = np.append(thresholds, 1)

    # F-measure; defined as 0 where precision + recall == 0 instead of
    # producing NaN (and a divide warning) from 0 / 0.
    denominator = precision + recall
    fscore = np.divide(
        2 * precision * recall,
        denominator,
        out=np.zeros_like(denominator, dtype=float),
        where=denominator > 0,
    )

    palette = sns.color_palette('tab10', 4)
    plt.plot(thresholds, precision, linestyle='-', color=palette[0])
    plt.plot(thresholds, recall, linestyle='--', color=palette[1])
    plt.plot(thresholds, fscore, linestyle='-.', color=palette[2])

    leg = plt.legend(("precision", "recall", "f-measure"), loc='upper right', frameon=True)
    leg.get_frame().set_edgecolor('k')
    plt.grid()
    # Honour the caller-supplied title (its default matches the former fixed title).
    plt.title(title)
    plt.xlim([0, 1])
    plt.xlabel("thresholds")
    plt.ylim([0, 1])
    plt.ylabel("score")

    return plt


In [None]:
plot_score_changed_thresholds(y_valid, y_pred_prob)

In [None]:
# y_pred = np.where(y_pred_prob > 0.1, 1, 0)
# confusion_matrix(y_valid, y_pred)

In [None]:
# Build the submission file from the IDs already held in `key_test` (the test
# rows of `df`, in their original order) instead of re-reading the whole test
# transaction CSV just to get one column.
# NOTE(review): `gbm` is fitted with `isTest` as its label in this notebook, so
# the values written to the `isFraud` column are train-vs-test scores, not
# fraud probabilities — confirm this is intended before submitting.
sub = pd.DataFrame({'TransactionID': key_test.values, 'isFraud': gbm.predict(X_test)})
sub.to_csv("submission.csv", index=False)

In [None]:
def write_spreadsheet(*args):
    """POST the given values, JSON-encoded as one array, to the Google Apps Script endpoint.

    Args:
        *args: JSON-serialisable values to send.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
    """
    endpoint = 'https://script.google.com/macros/s/AKfycbxZhZc3lPJ6eLATt_r9dPVQZUjuzpIvQ6vjSYFZdLOlt1TqWvtC/exec'
    # Without a timeout a stalled endpoint would block the notebook indefinitely.
    requests.post(endpoint, json.dumps(args), timeout=10)

In [None]:
# write_spreadsheet('baseline', *scores)
# write_spreadsheet('baseline', auc)

# explain

In [None]:
# features = ["czrd1", "TransactionAmt", "card2", "C13", "D2"]
# partial_dependence(gbm, X_train, features)

In [None]:
explainer = shap.TreeExplainer(gbm)
shap_values = explainer.shap_values(X_valid)

In [None]:
# df_shap = pd.DataFrame(data=shap_values, index=key_valid, columns=X_valid.columns)
# df_shap.to_csv("shap_value.csv", index = False)

In [None]:
shap.initjs()

In [None]:
shap.summary_plot(shap_values, X_valid)

In [None]:
# sns.relplot(x="predict_prob", y="card1", col="isTest", data=df_valid)

In [None]:
# sns.relplot(x="predict_prob", y="TransactionAmt", col="isTest", data=df_valid)

In [None]:
# shap.dependence_plot("card1", shap_values, X_valid)

In [None]:
# shap.force_plot(explainer.expected_value, shap_values[0,:], X_valid.iloc[0,:])

In [None]:
# index_key = df_valid["predict_prob"].reset_index(drop=True).idxmax()
# shap.force_plot(explainer.expected_value, shap_values[index_key,:], X_valid.iloc[index_key,:])

In [None]:
# index_key = df_valid["predict_prob"].reset_index(drop=True).idxmin()
# shap.force_plot(explainer.expected_value, shap_values[index_key,:], X_valid.iloc[index_key,:])

In [None]:
# shap.force_plot(explainer.expected_value, shap_values[1:1000], X_valid[1:1000])

In [None]:
# shap_interaction_values = explainer.shap_interaction_values(train_X)

In [None]:
# shap.summary_plot(shap_interaction_values, train_X)

In [None]:
def send_line_notification(message):
    """Send `message` through the LINE Notify API.

    The access token is read from the `LINE_NOTIFY_TOKEN` environment variable
    rather than being stored in the notebook. If the variable is not set, a
    warning is issued and no request is made.

    NOTE: the token that used to be embedded here has been exposed in the
    notebook and should be revoked.

    Args:
        message: text of the notification.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
    """
    import os
    import warnings

    line_token = os.environ.get('LINE_NOTIFY_TOKEN')
    if not line_token:
        # Notification is a convenience; do not fail the notebook over it.
        warnings.warn("LINE_NOTIFY_TOKEN is not set; skipping LINE notification")
        return

    endpoint = 'https://notify-api.line.me/api/notify'
    # The leading newline makes the text start on its own line in the notification.
    payload = {'message': "\n{}".format(message)}
    headers = {'Authorization': 'Bearer {}'.format(line_token)}
    # Without a timeout a stalled endpoint would block the notebook indefinitely.
    requests.post(endpoint, data=payload, headers=headers, timeout=10)

In [None]:
# Tell the author that the notebook has finished running.
send_line_notification("execution complete!")