# 群馬銀行業況変化モデル作成と精度評価

In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'IPAexGothic'
import seaborn as sns
import re
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import xgboost as xgb
import japanize_matplotlib
import pickle
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = 2000

In [46]:
# seaborn is already imported in the first cell; this re-import is redundant but harmless.
import seaborn as sns
# Apply seaborn's defaults first, then override them with the project settings.
sns.set()
# NOTE(review): the three positional arguments are presumably context / style / palette
# - confirm against the installed seaborn version. The font matches the matplotlib
# rcParams setting so Japanese labels render.
sns.set('talk', 'whitegrid', 'dark', font_scale=1.5, font='IPAexGothic')

In [47]:
# Every path must end with a trailing "/": file names are appended to these
# constants by plain string concatenation (e.g. PATH_INPUT+'nyukin_freq.csv').
PATH_INPUT_RATING = '../data/processed_raw/'
PATH_INPUT = '../data/model_input/'
PATH_OUTPUT = '../results/'

## 集計メソッド

In [48]:
def godcustcd_preprocessing(df):
    '''Pre-process the GODCUSTCD column of an input frame.

    Rows whose GODCUSTCD is missing are removed and the remaining values are
    cast to ``np.int64``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``GODCUSTCD`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame (original index labels preserved) without the rows that
        lacked a GODCUSTCD, with GODCUSTCD stored as int64.
    '''
    # Take an explicit copy: assigning a column on a filtered slice is chained
    # assignment and triggers SettingWithCopyWarning (silenced globally in this
    # notebook, so the problem would otherwise go unnoticed).
    df = df[df['GODCUSTCD'].notnull()].copy()
    df['GODCUSTCD'] = df['GODCUSTCD'].astype(np.int64)
    return df

In [49]:
def convert_trans(df_list, col_before: str, col_after: str):
    '''Rename one column in every frame of a list.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Frames to process. Frames that lack ``col_before`` are returned with
        their columns unchanged.
    col_before : str
        Current column name.
    col_after : str
        New column name.

    Returns
    -------
    list of pandas.DataFrame
        Renamed frames, in the same order as ``df_list``.
    '''
    # Return renamed copies instead of renaming in place, so the module-level
    # input frames keep their original column names and the calling cell gives
    # the same result however often it is re-run.
    return [df.rename(columns={col_before: col_after}) for df in df_list]

In [50]:
def merge_input(df_base, df_list, keys):
    '''Left-join every frame of ``df_list`` onto ``df_base`` and zero-fill gaps.

    Parameters
    ----------
    df_base : pandas.DataFrame
        Left-hand frame; all of its rows are kept.
    df_list : list of pandas.DataFrame
        Frames joined one after another, in list order.
    keys : list of str
        Join key columns, present in every frame.

    Returns
    -------
    pandas.DataFrame
        The joined frame with every missing value replaced by 0.
    '''
    merged = df_base
    for right in df_list:
        merged = merged.merge(right, on=keys, how='left')
    # Rows without a match in some right-hand frame get 0 rather than NaN.
    return merged.fillna(0)

## ファイル入力

In [51]:
# Load every pre-computed feature table from PATH_INPUT. Unless noted otherwise the
# first CSV column is used as the index (index_col=0).

# Deposits / withdrawals (ordinary and current accounts)
df_nyukin_amount_monthly_periods_forplot = pd.read_csv(PATH_INPUT+'nyukin_amount_monthly_periods_forplot.csv', index_col=0)
df_nyukin_count_monthly_periods_forplot = pd.read_csv(PATH_INPUT+'nyukin_count_monthly_periods_forplot.csv', index_col=0)
df_shukkin_amount_monthly_periods_forplot = pd.read_csv(PATH_INPUT+'shukkin_amount_monthly_periods_forplot.csv', index_col=0)
df_shukkin_count_monthly_periods_forplot = pd.read_csv(PATH_INPUT+'shukkin_count_monthly_periods_forplot.csv', index_col=0)
df_nyukin_freq = pd.read_csv(PATH_INPUT+'nyukin_freq.csv', index_col=0)
df_shukkin_freq = pd.read_csv(PATH_INPUT+'shukkin_freq.csv', index_col=0)
# Deposits / withdrawals (ordinary and current accounts), normalized
df_nyukin_amount_monthly_periods_forplot_norm = pd.read_csv(PATH_INPUT+'nyukin_amount_monthly_periods_forplot_norm.csv', index_col=0)
df_nyukin_count_monthly_periods_forplot_norm = pd.read_csv(PATH_INPUT+'nyukin_count_monthly_periods_forplot_norm.csv', index_col=0)
df_shukkin_amount_monthly_periods_forplot_norm = pd.read_csv(PATH_INPUT+'shukkin_amount_monthly_periods_forplot_norm.csv', index_col=0)
df_shukkin_count_monthly_periods_forplot_norm = pd.read_csv(PATH_INPUT+'shukkin_count_monthly_periods_forplot_norm.csv', index_col=0)
# df_nyukin_freq_norm = pd.read_csv(PATH_INPUT+'nyukin_freq_norm.csv', index_col=0)
# df_shukkin_freq_norm = pd.read_csv(PATH_INPUT+'shukkin_freq_norm.csv', index_col=0)
# Deposits / withdrawals (ordinary and current accounts), standardized
df_nyukin_amount_monthly_periods_forplot_stand = pd.read_csv(PATH_INPUT+'nyukin_amount_monthly_periods_forplot_stand.csv', index_col=0)
df_nyukin_count_monthly_periods_forplot_stand = pd.read_csv(PATH_INPUT+'nyukin_count_monthly_periods_forplot_stand.csv', index_col=0)
df_shukkin_amount_monthly_periods_forplot_stand = pd.read_csv(PATH_INPUT+'shukkin_amount_monthly_periods_forplot_stand.csv', index_col=0)
df_shukkin_count_monthly_periods_forplot_stand = pd.read_csv(PATH_INPUT+'shukkin_count_monthly_periods_forplot_stand.csv', index_col=0)
# df_nyukin_freq_stand = pd.read_csv(PATH_INPUT+'nyukin_freq_stand.csv', index_col=0)
# df_shukkin_freq_stand = pd.read_csv(PATH_INPUT+'shukkin_freq_stand.csv', index_col=0)
# Deposits / withdrawals (ordinary and current accounts) broken down by transaction summary (tekiyo)
df_nyukin_tekiyo_amount_monthly_periods_forplot = pd.read_csv(PATH_INPUT+'nyukin_tekiyo_amount_monthly_periods_forplot.csv', index_col=0)
df_nyukin_tekiyo_count_monthly_periods_forplot = pd.read_csv(PATH_INPUT+'nyukin_tekiyo_count_monthly_periods_forplot.csv', index_col=0)
df_shukkin_tekiyo_amount_monthly_periods_forplot = pd.read_csv(PATH_INPUT+'shukkin_tekiyo_amount_monthly_periods_forplot.csv', index_col=0)
df_shukkin_tekiyo_count_monthly_periods_forplot = pd.read_csv(PATH_INPUT+'shukkin_tekiyo_count_monthly_periods_forplot.csv', index_col=0)
# Month-end balances
df_zandaka_last_monthly_periods_forplot = pd.read_csv(PATH_INPUT+'zandaka_last_monthly_periods_forplot.csv', index_col=0)
df_zandaka_last_futu_monthly_periods_forplot = pd.read_csv(PATH_INPUT+'zandaka_last_futu_monthly_periods_forplot.csv', index_col=0)
df_zandaka_last_toza_monthly_periods_forplot = pd.read_csv(PATH_INPUT+'zandaka_last_toza_monthly_periods_forplot.csv', index_col=0)
df_zandaka_last_freq = pd.read_csv(PATH_INPUT+'zandaka_last_freq.csv', index_col=0)
df_zandaka_last_futu_freq = pd.read_csv(PATH_INPUT+'zandaka_last_futu_freq.csv', index_col=0)
df_zandaka_last_toza_freq = pd.read_csv(PATH_INPUT+'zandaka_last_toza_freq.csv', index_col=0)
# Month-end balances, normalized
df_zandaka_last_futu_monthly_periods_forplot_norm = pd.read_csv(PATH_INPUT+'zandaka_last_futu_monthly_periods_forplot_norm.csv', index_col=0)
# df_zandaka_last_futu_freq_norm = pd.read_csv(PATH_INPUT+'zandaka_last_futu_freq_norm.csv', index_col=0)
# Month-end balances, standardized
df_zandaka_last_futu_monthly_periods_forplot_stand = pd.read_csv(PATH_INPUT+'zandaka_last_futu_monthly_periods_forplot_stand.csv', index_col=0)
# NOTE(review): the file name in the disabled line below ends in '.csv_stand', which
# looks like a typo for '_stand.csv' - fix before re-enabling it.
# df_zandaka_last_futu_freq_stand = pd.read_csv(PATH_INPUT+'zandaka_last_futu_freq.csv_stand', index_col=0)
# Deposits / withdrawals via exchange transactions (kawase)
df_kawase_nyukin_amount_monthly_periods_forplot = pd.read_csv(PATH_INPUT+'kawase_nyukin_amount_monthly_periods_forplot.csv', index_col=0)
df_kawase_shukkin_amount_monthly_periods_forplot = pd.read_csv(PATH_INPUT+'kawase_shukkin_amount_monthly_periods_forplot.csv', index_col=0)
df_kawase_nyukin_count_monthly_periods_forplot = pd.read_csv(PATH_INPUT+'kawase_nyukin_count_monthly_periods_forplot.csv', index_col=0)
df_kawase_shukkin_count_monthly_periods_forplot = pd.read_csv(PATH_INPUT+'kawase_shukkin_count_monthly_periods_forplot.csv', index_col=0)
# NOTE(review): unlike every other read in this cell, these two do not pass index_col=0.
# Confirm whether these files really have no index column.
df_kawase_nyukin_freq = pd.read_csv(PATH_INPUT+'kawase_nyukin_freq.csv')
df_kawase_shukkin_freq = pd.read_csv(PATH_INPUT+'kawase_shukkin_freq.csv')

## 格付けデータの取得

In [52]:
# Read the rating table (first CSV column as index), then drop rows without a
# GODCUSTCD and cast the remaining customer codes to int64.
df_rating_zaimu = godcustcd_preprocessing(pd.read_csv(PATH_INPUT_RATING+'rating.csv', index_col=0))

In [53]:
# Sanity check: (rows, columns) of the rating table, followed by its first rows.
print(df_rating_zaimu.shape)
df_rating_zaimu.head()

(6725, 9)


Unnamed: 0,GODCUSTCD,RECKSYM,recksym_kakuzuke_zenki,recksym_kakuzuke,recksym_henkobi,recksym_kakuzuke_zenki_kigo,recksym_kakuzuke_kigo,rekka_flg,fold_id
0,6316050371,201605,14,14,20161202,A4,A4,False,3
1,6316050371,201705,14,15,20171030,A4,A5,False,3
2,6316050371,201805,15,13,20181030,A5,A3,False,3
3,6316050390,201603,13,13,20160816,A3,A3,False,3
4,6316050390,201703,13,15,20171002,A3,A5,False,3


## インプットデータ作成

In [54]:
# zaimu_lag is the parameter that decides from which month the most recent 12 months
# of transaction data are taken. It is the month offset applied to RECKSYM below.
zaimu_lag = -3
# RECKSYM is an integer YYYYMM; appending day 01 turns it into a parseable YYYYMMDD date.
df_zaimu_date = pd.to_datetime(df_rating_zaimu['RECKSYM']*100+1, format='%Y%m%d')
df_zaimu_date = df_zaimu_date + pd.DateOffset(months=zaimu_lag)
# Convert back to an integer YYYYMM. RECKTRANS is the month key used to join the
# transaction feature tables (with zaimu_lag = -3, RECKSYM 201605 becomes 201602).
df_rating_zaimu['RECKTRANS'] = df_zaimu_date.map(lambda x : 100*x.year + x.month)

In [102]:
# Build the data set used for model training.
# df_list selects which feature tables are joined onto the rating table; the
# commented-out entries are tables excluded from this run.
df_list = [
#     df_nyukin_amount_monthly_periods_forplot,
#     df_nyukin_count_monthly_periods_forplot,
#     df_shukkin_amount_monthly_periods_forplot,
#     df_shukkin_count_monthly_periods_forplot,
    df_nyukin_freq,
    df_shukkin_freq,
    df_nyukin_amount_monthly_periods_forplot_norm,
    df_nyukin_count_monthly_periods_forplot_norm,
    df_shukkin_amount_monthly_periods_forplot_norm,
    df_shukkin_count_monthly_periods_forplot_norm,
#     df_nyukin_amount_monthly_periods_forplot_stand,
#     df_nyukin_count_monthly_periods_forplot_stand,
#     df_shukkin_amount_monthly_periods_forplot_stand,
#     df_shukkin_count_monthly_periods_forplot_stand,
#     df_nyukin_freq_norm,
#     df_shukkin_freq_norm,
#     df_nyukin_tekiyo_amount_monthly_periods_forplot,
#     df_nyukin_tekiyo_count_monthly_periods_forplot,
#     df_shukkin_tekiyo_amount_monthly_periods_forplot,
#     df_shukkin_tekiyo_count_monthly_periods_forplot,
#     df_zandaka_last_monthly_periods_forplot,
#     df_zandaka_last_futu_monthly_periods_forplot,
    df_zandaka_last_futu_monthly_periods_forplot_norm,
#     df_zandaka_last_futu_monthly_periods_forplot_stand,
#     df_zandaka_last_toza_monthly_periods_forplot,
#     df_zandaka_last_freq,
#     df_zandaka_last_futu_freq_norm,
#     df_zandaka_last_futu_freq_norm,
#     df_zandaka_last_toza_freq,
#     df_kawase_nyukin_amount_monthly_periods_forplot,
#     df_kawase_shukkin_amount_monthly_periods_forplot,
#     df_kawase_nyukin_count_monthly_periods_forplot,
#     df_kawase_shukkin_count_monthly_periods_forplot,
#     df_kawase_nyukin_freq,
#     df_kawase_shukkin_freq
]
# Rename the month column of each feature table from YYYYMM to RECKTRANS, then left-join
# every table onto the rating table on (GODCUSTCD, RECKTRANS); unmatched cells become 0.
df_all = merge_input(df_rating_zaimu, convert_trans(df_list, 'YYYYMM', 'RECKTRANS'), ['GODCUSTCD','RECKTRANS'])

In [21]:
# Run this cell only when the previous-period rating should be included as a feature.
# kigo[i] is encoded as flg[i]: '0' -> 7 and 'A0'..'A6' -> 0..6.
# kigo.index(x) raises ValueError for any symbol that is not listed in kigo.
kigo = ['0','A0','A1','A2','A3','A4','A5','A6']
flg = [7,0,1,2,3,4,5,6]
# The new column is appended as the last column of df_all.
df_all['前期格付'] = df_all['recksym_kakuzuke_zenki_kigo'].apply(lambda x: flg[kigo.index(x)])

In [22]:
# Row-count check followed by a preview of the merged frame.
print(df_all.shape[0]) # before adding ratings: 4346, after adding ratings: 6725
df_all.head()

6725


Unnamed: 0,GODCUSTCD,RECKSYM,recksym_kakuzuke_zenki,recksym_kakuzuke,recksym_henkobi,recksym_kakuzuke_zenki_kigo,recksym_kakuzuke_kigo,rekka_flg,fold_id,RECKTRANS,YOKIN_RECEIVE_AMOUNT_MEAN12,YOKIN_RECEIVE_COUNT_MEAN12,YOKIN_PAY_AMOUNT_MEAN12,YOKIN_PAY_COUNT_MEAN12,0_預金_入金_金額,1_預金_入金_金額,2_預金_入金_金額,3_預金_入金_金額,4_預金_入金_金額,5_預金_入金_金額,6_預金_入金_金額,7_預金_入金_金額,8_預金_入金_金額,9_預金_入金_金額,10_預金_入金_金額,11_預金_入金_金額,0_預金_入金_件数,1_預金_入金_件数,2_預金_入金_件数,3_預金_入金_件数,4_預金_入金_件数,5_預金_入金_件数,6_預金_入金_件数,7_預金_入金_件数,8_預金_入金_件数,9_預金_入金_件数,10_預金_入金_件数,11_預金_入金_件数,0_預金_出金_金額,1_預金_出金_金額,2_預金_出金_金額,3_預金_出金_金額,4_預金_出金_金額,5_預金_出金_金額,6_預金_出金_金額,7_預金_出金_金額,8_預金_出金_金額,9_預金_出金_金額,10_預金_出金_金額,11_預金_出金_金額,0_預金_出金_件数,1_預金_出金_件数,2_預金_出金_件数,3_預金_出金_件数,4_預金_出金_件数,5_預金_出金_件数,6_預金_出金_件数,7_預金_出金_件数,8_預金_出金_件数,9_預金_出金_件数,10_預金_出金_件数,11_預金_出金_件数,0_月末残高_普通,1_月末残高_普通,2_月末残高_普通,3_月末残高_普通,4_月末残高_普通,5_月末残高_普通,6_月末残高_普通,7_月末残高_普通,8_月末残高_普通,9_月末残高_普通,10_月末残高_普通,11_月末残高_普通,ZANDAKA_LAST_FUTU_MEAN12,前期格付
0,6316050371,201605,14,14,20161202,A4,A4,False,3,201602,142544500.0,63.416667,134542500.0,65.083333,0.309379,0.402722,0.413674,0.237665,0.415987,0.188809,1.0,0.501447,0.052528,0.228235,0.114766,0.233668,0.930233,1.0,0.755814,0.651163,0.604651,0.802326,0.697674,0.732558,0.732558,0.72093,0.593023,0.627907,0.293356,0.450809,0.401411,0.309526,0.395084,0.259709,1.0,0.250077,0.185786,0.256498,0.144042,0.241772,0.942857,0.957143,1.0,0.928571,0.957143,0.942857,0.957143,0.885714,0.857143,0.928571,0.885714,0.914286,0.001008,0.001008,0.001008,0.001008,0.001008,0.001008,0.001008,0.999947,0.999947,0.999947,0.999947,1.0,39769070.0,4
1,6316050371,201705,14,15,20171030,A4,A5,False,3,201702,145470100.0,57.25,151695500.0,61.0,1.0,0.903294,0.964535,0.389743,0.611038,0.090167,0.16111,0.276521,0.387339,0.157787,0.526023,0.264512,0.786517,1.0,0.617978,0.550562,0.505618,0.629213,0.629213,0.550562,0.674157,0.505618,0.662921,0.606742,0.766804,1.0,0.496115,0.853482,0.362827,0.493238,0.404793,0.241279,0.159293,0.379775,0.449613,0.219638,0.925373,0.925373,1.0,0.985075,0.925373,0.985075,0.925373,0.910448,0.791045,0.880597,0.835821,0.835821,1.0,1.0,1.0,0.001084,0.001084,0.001094,0.0,0.0,0.0,0.0,0.0,0.0,23854730.0,4
2,6316050371,201805,15,13,20181030,A5,A3,False,3,201802,128841200.0,55.083333,130480100.0,61.5,0.553654,0.356657,0.403896,0.652494,0.520228,0.18314,1.0,0.147444,0.215441,0.217957,0.274237,0.229957,0.771429,1.0,0.685714,0.571429,0.7,0.857143,0.914286,0.7,0.814286,0.785714,0.928571,0.714286,0.745448,0.644158,0.679285,0.851732,0.642907,0.580256,0.784878,1.0,0.248427,0.31372,0.440284,0.365769,0.880597,0.910448,1.0,0.970149,0.895522,0.940299,0.850746,0.940299,0.910448,0.895522,0.940299,0.880597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
3,6316050390,201603,13,13,20160816,A3,A3,False,3,201512,279093800.0,60.583333,276167700.0,47.083333,0.64791,1.0,0.366253,0.357559,0.760441,0.498097,0.573377,0.38287,0.475667,0.523557,0.600425,0.253839,0.706667,0.84,0.853333,0.813333,0.733333,0.853333,0.853333,0.826667,0.826667,0.586667,1.0,0.8,0.476503,1.0,0.400116,0.343557,0.736927,0.495561,0.50898,0.37903,0.387537,0.615696,0.588922,0.212709,0.810345,0.948276,0.844828,0.793103,0.775862,1.0,0.844828,0.758621,0.793103,0.741379,0.827586,0.603448,0.127506,0.693794,0.564204,0.428313,0.756837,0.428812,0.311388,0.641909,0.530847,0.66141,1.0,0.439224,11596750.0,3
4,6316050390,201703,13,15,20171002,A3,A5,False,3,201612,210089000.0,57.5,208680100.0,44.25,0.341328,0.42741,0.559791,0.275985,0.461727,0.208327,0.236491,1.0,0.76112,0.332709,0.462249,0.639205,0.852941,0.882353,1.0,0.823529,0.970588,0.852941,0.735294,0.867647,0.897059,0.75,0.661765,0.852941,0.394524,0.437878,0.497611,0.207538,0.305822,0.207334,0.28797,0.356993,1.0,0.520794,0.421213,0.556813,0.803922,1.0,0.901961,0.72549,1.0,0.862745,0.666667,1.0,0.882353,0.901961,0.803922,0.862745,0.342659,0.161558,0.128547,0.910732,1.0,0.737946,0.643671,0.398333,0.267657,0.175903,0.158956,0.374822,9912672.0,3


## インプットデータ足切り

In [103]:
# Cutoff: keep only rows whose 12-month mean payment count AND 12-month mean
# receipt count are both at least th_num.
th_num = 0.999
has_enough_payments = df_all['YOKIN_PAY_COUNT_MEAN12'] >= th_num
has_enough_receipts = df_all['YOKIN_RECEIVE_COUNT_MEAN12'] >= th_num
df_all_selected = df_all[has_enough_payments & has_enough_receipts]
# Then discard any row that still contains a missing value.
df_all_selected = df_all_selected[~df_all_selected.isnull().any(axis=1)]

### ※預金の年次集計値を使用しない場合に実行

In [104]:
# Remove the four 12-month deposit aggregates so they are not used as model features.
# errors='ignore' keeps this cell idempotent: re-running it after the columns are
# already gone no longer raises KeyError.
df_all_selected = df_all_selected.drop(['YOKIN_PAY_AMOUNT_MEAN12', 'YOKIN_RECEIVE_AMOUNT_MEAN12',
                                        'YOKIN_PAY_COUNT_MEAN12', 'YOKIN_RECEIVE_COUNT_MEAN12'],
                                       axis=1, errors='ignore')

## 足切り後インプットデータ

In [105]:
# Split once by the deterioration flag; the two subsets are reused below.
df_seijo = df_all_selected[df_all_selected['rekka_flg']==False]
df_rekka = df_all_selected[df_all_selected['rekka_flg']==True]

# Number of records (total / normal / deteriorated)
print('データ数:{0}'.format(len(df_all_selected)))
print('正常データ数:{0}'.format(len(df_seijo)))
print('劣化データ数:{0}'.format(len(df_rekka)))

# Number of distinct companies (total / normal / deteriorated)
print('企業数:{0}'.format(len(df_all_selected['GODCUSTCD'].unique())))
print('正常企業数:{0}'.format(len(df_seijo['GODCUSTCD'].unique())))
print('劣化企業数:{0}'.format(len(df_rekka['GODCUSTCD'].unique())))

# Everything from column position 10 onward is what the model cells use as features.
feature_view = df_all_selected.iloc[:,10:]
print(feature_view.shape)
display(df_all_selected.head())
display(feature_view.head())

データ数:3710
正常データ数:3550
劣化データ数:160
企業数:1678
正常企業数:1606
劣化企業数:159
(3710, 60)


Unnamed: 0,GODCUSTCD,RECKSYM,recksym_kakuzuke_zenki,recksym_kakuzuke,recksym_henkobi,recksym_kakuzuke_zenki_kigo,recksym_kakuzuke_kigo,rekka_flg,fold_id,RECKTRANS,0_預金_入金_金額,1_預金_入金_金額,2_預金_入金_金額,3_預金_入金_金額,4_預金_入金_金額,5_預金_入金_金額,6_預金_入金_金額,7_預金_入金_金額,8_預金_入金_金額,9_預金_入金_金額,10_預金_入金_金額,11_預金_入金_金額,0_預金_入金_件数,1_預金_入金_件数,2_預金_入金_件数,3_預金_入金_件数,4_預金_入金_件数,5_預金_入金_件数,6_預金_入金_件数,7_預金_入金_件数,8_預金_入金_件数,9_預金_入金_件数,10_預金_入金_件数,11_預金_入金_件数,0_預金_出金_金額,1_預金_出金_金額,2_預金_出金_金額,3_預金_出金_金額,4_預金_出金_金額,5_預金_出金_金額,6_預金_出金_金額,7_預金_出金_金額,8_預金_出金_金額,9_預金_出金_金額,10_預金_出金_金額,11_預金_出金_金額,0_預金_出金_件数,1_預金_出金_件数,2_預金_出金_件数,3_預金_出金_件数,4_預金_出金_件数,5_預金_出金_件数,6_預金_出金_件数,7_預金_出金_件数,8_預金_出金_件数,9_預金_出金_件数,10_預金_出金_件数,11_預金_出金_件数,0_月末残高_普通,1_月末残高_普通,2_月末残高_普通,3_月末残高_普通,4_月末残高_普通,5_月末残高_普通,6_月末残高_普通,7_月末残高_普通,8_月末残高_普通,9_月末残高_普通,10_月末残高_普通,11_月末残高_普通
0,6316050371,201605,14,14,20161202,A4,A4,False,3,201602,0.309379,0.402722,0.413674,0.237665,0.415987,0.188809,1.0,0.501447,0.052528,0.228235,0.114766,0.233668,0.930233,1.0,0.755814,0.651163,0.604651,0.802326,0.697674,0.732558,0.732558,0.72093,0.593023,0.627907,0.293356,0.450809,0.401411,0.309526,0.395084,0.259709,1.0,0.250077,0.185786,0.256498,0.144042,0.241772,0.942857,0.957143,1.0,0.928571,0.957143,0.942857,0.957143,0.885714,0.857143,0.928571,0.885714,0.914286,0.001008,0.001008,0.001008,0.001008,0.001008,0.001008,0.001008,0.999947,0.999947,0.999947,0.999947,1.0
1,6316050371,201705,14,15,20171030,A4,A5,False,3,201702,1.0,0.903294,0.964535,0.389743,0.611038,0.090167,0.16111,0.276521,0.387339,0.157787,0.526023,0.264512,0.786517,1.0,0.617978,0.550562,0.505618,0.629213,0.629213,0.550562,0.674157,0.505618,0.662921,0.606742,0.766804,1.0,0.496115,0.853482,0.362827,0.493238,0.404793,0.241279,0.159293,0.379775,0.449613,0.219638,0.925373,0.925373,1.0,0.985075,0.925373,0.985075,0.925373,0.910448,0.791045,0.880597,0.835821,0.835821,1.0,1.0,1.0,0.001084,0.001084,0.001094,0.0,0.0,0.0,0.0,0.0,0.0
2,6316050371,201805,15,13,20181030,A5,A3,False,3,201802,0.553654,0.356657,0.403896,0.652494,0.520228,0.18314,1.0,0.147444,0.215441,0.217957,0.274237,0.229957,0.771429,1.0,0.685714,0.571429,0.7,0.857143,0.914286,0.7,0.814286,0.785714,0.928571,0.714286,0.745448,0.644158,0.679285,0.851732,0.642907,0.580256,0.784878,1.0,0.248427,0.31372,0.440284,0.365769,0.880597,0.910448,1.0,0.970149,0.895522,0.940299,0.850746,0.940299,0.910448,0.895522,0.940299,0.880597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6316050390,201603,13,13,20160816,A3,A3,False,3,201512,0.64791,1.0,0.366253,0.357559,0.760441,0.498097,0.573377,0.38287,0.475667,0.523557,0.600425,0.253839,0.706667,0.84,0.853333,0.813333,0.733333,0.853333,0.853333,0.826667,0.826667,0.586667,1.0,0.8,0.476503,1.0,0.400116,0.343557,0.736927,0.495561,0.50898,0.37903,0.387537,0.615696,0.588922,0.212709,0.810345,0.948276,0.844828,0.793103,0.775862,1.0,0.844828,0.758621,0.793103,0.741379,0.827586,0.603448,0.127506,0.693794,0.564204,0.428313,0.756837,0.428812,0.311388,0.641909,0.530847,0.66141,1.0,0.439224
4,6316050390,201703,13,15,20171002,A3,A5,False,3,201612,0.341328,0.42741,0.559791,0.275985,0.461727,0.208327,0.236491,1.0,0.76112,0.332709,0.462249,0.639205,0.852941,0.882353,1.0,0.823529,0.970588,0.852941,0.735294,0.867647,0.897059,0.75,0.661765,0.852941,0.394524,0.437878,0.497611,0.207538,0.305822,0.207334,0.28797,0.356993,1.0,0.520794,0.421213,0.556813,0.803922,1.0,0.901961,0.72549,1.0,0.862745,0.666667,1.0,0.882353,0.901961,0.803922,0.862745,0.342659,0.161558,0.128547,0.910732,1.0,0.737946,0.643671,0.398333,0.267657,0.175903,0.158956,0.374822


Unnamed: 0,0_預金_入金_金額,1_預金_入金_金額,2_預金_入金_金額,3_預金_入金_金額,4_預金_入金_金額,5_預金_入金_金額,6_預金_入金_金額,7_預金_入金_金額,8_預金_入金_金額,9_預金_入金_金額,10_預金_入金_金額,11_預金_入金_金額,0_預金_入金_件数,1_預金_入金_件数,2_預金_入金_件数,3_預金_入金_件数,4_預金_入金_件数,5_預金_入金_件数,6_預金_入金_件数,7_預金_入金_件数,8_預金_入金_件数,9_預金_入金_件数,10_預金_入金_件数,11_預金_入金_件数,0_預金_出金_金額,1_預金_出金_金額,2_預金_出金_金額,3_預金_出金_金額,4_預金_出金_金額,5_預金_出金_金額,6_預金_出金_金額,7_預金_出金_金額,8_預金_出金_金額,9_預金_出金_金額,10_預金_出金_金額,11_預金_出金_金額,0_預金_出金_件数,1_預金_出金_件数,2_預金_出金_件数,3_預金_出金_件数,4_預金_出金_件数,5_預金_出金_件数,6_預金_出金_件数,7_預金_出金_件数,8_預金_出金_件数,9_預金_出金_件数,10_預金_出金_件数,11_預金_出金_件数,0_月末残高_普通,1_月末残高_普通,2_月末残高_普通,3_月末残高_普通,4_月末残高_普通,5_月末残高_普通,6_月末残高_普通,7_月末残高_普通,8_月末残高_普通,9_月末残高_普通,10_月末残高_普通,11_月末残高_普通
0,0.309379,0.402722,0.413674,0.237665,0.415987,0.188809,1.0,0.501447,0.052528,0.228235,0.114766,0.233668,0.930233,1.0,0.755814,0.651163,0.604651,0.802326,0.697674,0.732558,0.732558,0.72093,0.593023,0.627907,0.293356,0.450809,0.401411,0.309526,0.395084,0.259709,1.0,0.250077,0.185786,0.256498,0.144042,0.241772,0.942857,0.957143,1.0,0.928571,0.957143,0.942857,0.957143,0.885714,0.857143,0.928571,0.885714,0.914286,0.001008,0.001008,0.001008,0.001008,0.001008,0.001008,0.001008,0.999947,0.999947,0.999947,0.999947,1.0
1,1.0,0.903294,0.964535,0.389743,0.611038,0.090167,0.16111,0.276521,0.387339,0.157787,0.526023,0.264512,0.786517,1.0,0.617978,0.550562,0.505618,0.629213,0.629213,0.550562,0.674157,0.505618,0.662921,0.606742,0.766804,1.0,0.496115,0.853482,0.362827,0.493238,0.404793,0.241279,0.159293,0.379775,0.449613,0.219638,0.925373,0.925373,1.0,0.985075,0.925373,0.985075,0.925373,0.910448,0.791045,0.880597,0.835821,0.835821,1.0,1.0,1.0,0.001084,0.001084,0.001094,0.0,0.0,0.0,0.0,0.0,0.0
2,0.553654,0.356657,0.403896,0.652494,0.520228,0.18314,1.0,0.147444,0.215441,0.217957,0.274237,0.229957,0.771429,1.0,0.685714,0.571429,0.7,0.857143,0.914286,0.7,0.814286,0.785714,0.928571,0.714286,0.745448,0.644158,0.679285,0.851732,0.642907,0.580256,0.784878,1.0,0.248427,0.31372,0.440284,0.365769,0.880597,0.910448,1.0,0.970149,0.895522,0.940299,0.850746,0.940299,0.910448,0.895522,0.940299,0.880597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.64791,1.0,0.366253,0.357559,0.760441,0.498097,0.573377,0.38287,0.475667,0.523557,0.600425,0.253839,0.706667,0.84,0.853333,0.813333,0.733333,0.853333,0.853333,0.826667,0.826667,0.586667,1.0,0.8,0.476503,1.0,0.400116,0.343557,0.736927,0.495561,0.50898,0.37903,0.387537,0.615696,0.588922,0.212709,0.810345,0.948276,0.844828,0.793103,0.775862,1.0,0.844828,0.758621,0.793103,0.741379,0.827586,0.603448,0.127506,0.693794,0.564204,0.428313,0.756837,0.428812,0.311388,0.641909,0.530847,0.66141,1.0,0.439224
4,0.341328,0.42741,0.559791,0.275985,0.461727,0.208327,0.236491,1.0,0.76112,0.332709,0.462249,0.639205,0.852941,0.882353,1.0,0.823529,0.970588,0.852941,0.735294,0.867647,0.897059,0.75,0.661765,0.852941,0.394524,0.437878,0.497611,0.207538,0.305822,0.207334,0.28797,0.356993,1.0,0.520794,0.421213,0.556813,0.803922,1.0,0.901961,0.72549,1.0,0.862745,0.666667,1.0,0.882353,0.901961,0.803922,0.862745,0.342659,0.161558,0.128547,0.910732,1.0,0.737946,0.643671,0.398333,0.267657,0.175903,0.158956,0.374822


In [106]:
# Expected counts after the rating data was added. These are hard-coded regression
# checks that match the summary printed by the previous cell (records, then distinct
# companies: total / normal / deteriorated); they must be updated whenever the input
# data or the cutoff changes.
assert len(df_all_selected) == 3710
assert len(df_all_selected[df_all_selected['rekka_flg']==False]) == 3550
assert len(df_all_selected[df_all_selected['rekka_flg']==True]) == 160
assert len(df_all_selected['GODCUSTCD'].unique()) == 1678
assert len(df_all_selected[df_all_selected['rekka_flg']==False]['GODCUSTCD'].unique()) == 1606
assert len(df_all_selected[df_all_selected['rekka_flg']==True]['GODCUSTCD'].unique()) == 159

## モデル作成

In [107]:
# Train / test split
def split_train_test(df, train_ids, under_sample_seed=None, feature_start=10):
    '''Split a frame into train and test parts by its ``fold_id`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``fold_id`` and ``rekka_flg`` columns; feature columns start
        at column position ``feature_start``.
    train_ids : list
        ``fold_id`` values that form the training set; every other row goes to
        the test set.
    under_sample_seed : int or None, optional
        When not None, the training rows are under-sampled with
        ``train_under_sample`` using this seed.
    feature_start : int, optional
        Position of the first feature column (default 10, the value that was
        previously hard-coded).

    Returns
    -------
    tuple
        ``(df_train, df_test, X_train, y_train, X_test, y_test)``.
    '''
    is_train = df['fold_id'].isin(train_ids)
    # Explicit copies: callers add columns to df_test (e.g. prediction scores), which on
    # a bare slice is chained assignment and raises SettingWithCopyWarning.
    df_train = df[is_train].copy()
    if under_sample_seed is not None:
        df_train = train_under_sample(df_train, under_sample_seed)
    df_test = df[~is_train].copy()

    X_train = df_train.iloc[:, feature_start:]
    y_train = df_train['rekka_flg']
    X_test = df_test.iloc[:, feature_start:]
    y_test = df_test['rekka_flg']

    return df_train, df_test, X_train, y_train, X_test, y_test

# Under-sampling
def train_under_sample(df_train, random_seed=1):
    '''Balance the training set by under-sampling the normal class.

    All deteriorated rows (``rekka_flg == 1``) are kept, and the same number of
    normal rows (``rekka_flg == 0``) is drawn without replacement.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows with a ``rekka_flg`` column.
    random_seed : int, optional
        Seed for the sampling (default 1).

    Returns
    -------
    pandas.DataFrame
        Deteriorated rows followed by the sampled normal rows.

    Raises
    ------
    ValueError
        If there are more deteriorated rows than normal rows (raised by
        ``choice`` with ``replace=False``).
    '''
    df_train_seijo = df_train[df_train['rekka_flg']==0]
    df_train_rekka = df_train[df_train['rekka_flg']==1]
    # A local RandomState draws exactly the same sample as seeding the global generator
    # with this value, but does not reset the global NumPy RNG that other code shares.
    rng = np.random.RandomState(random_seed)
    choice = rng.choice(len(df_train_seijo), len(df_train_rekka), replace=False)
    df_train_sample = pd.concat([df_train_rekka, df_train_seijo.iloc[choice,:]])
    return df_train_sample

In [108]:
# 4-fold tuning
# Each entry lists the fold_ids used for training; the remaining fold is the test set.
kfolds=[[1,2,3],[2,3,4],[3,4,1],[4,1,2]]
models = []
X_tests = []
fold_results = []
for train_ids in kfolds:
    df_train, df_test, X_train, y_train, X_test, y_test = split_train_test(df_all_selected, train_ids)
    model = lgb.LGBMClassifier(class_weight='balanced')
    model.fit(X_train.values, y_train.values)
    models.append(model)
    X_tests.append(X_test)
    # Copy before adding the score column so the assignment never targets a slice of
    # df_all_selected (chained assignment).
    df_test = df_test.copy()
    # Column 1 of predict_proba is the probability of the positive (deteriorated) class.
    df_test['rekka_score'] = model.predict_proba(X_test.values)[:,1]
    fold_results.append(df_test)
# Concatenate once after the loop instead of growing a frame from an empty DataFrame
# on every iteration.
df_results = pd.concat(fold_results)

In [109]:
# Preview the test-fold predictions; rekka_score is appended as the last column.
df_results.head()

Unnamed: 0,GODCUSTCD,RECKSYM,recksym_kakuzuke_zenki,recksym_kakuzuke,recksym_henkobi,recksym_kakuzuke_zenki_kigo,recksym_kakuzuke_kigo,rekka_flg,fold_id,RECKTRANS,0_預金_入金_金額,1_預金_入金_金額,2_預金_入金_金額,3_預金_入金_金額,4_預金_入金_金額,5_預金_入金_金額,6_預金_入金_金額,7_預金_入金_金額,8_預金_入金_金額,9_預金_入金_金額,10_預金_入金_金額,11_預金_入金_金額,0_預金_入金_件数,1_預金_入金_件数,2_預金_入金_件数,3_預金_入金_件数,4_預金_入金_件数,5_預金_入金_件数,6_預金_入金_件数,7_預金_入金_件数,8_預金_入金_件数,9_預金_入金_件数,10_預金_入金_件数,11_預金_入金_件数,0_預金_出金_金額,1_預金_出金_金額,2_預金_出金_金額,3_預金_出金_金額,4_預金_出金_金額,5_預金_出金_金額,6_預金_出金_金額,7_預金_出金_金額,8_預金_出金_金額,9_預金_出金_金額,10_預金_出金_金額,11_預金_出金_金額,0_預金_出金_件数,1_預金_出金_件数,2_預金_出金_件数,3_預金_出金_件数,4_預金_出金_件数,5_預金_出金_件数,6_預金_出金_件数,7_預金_出金_件数,8_預金_出金_件数,9_預金_出金_件数,10_預金_出金_件数,11_預金_出金_件数,0_月末残高_普通,1_月末残高_普通,2_月末残高_普通,3_月末残高_普通,4_月末残高_普通,5_月末残高_普通,6_月末残高_普通,7_月末残高_普通,8_月末残高_普通,9_月末残高_普通,10_月末残高_普通,11_月末残高_普通,rekka_score
12,6316050673,201703,15,16,20180327,A5,A6,False,4,201612,0.362464,0.362472,0.342327,0.402738,0.32219,0.342327,0.354409,1.0,0.0,0.100684,0.362464,0.374546,0.333333,0.666667,0.333333,0.333333,0.333333,0.333333,0.333333,1.0,0.0,0.333333,0.333333,0.333333,0.947796,0.932069,0.942641,0.949982,0.946721,0.921796,0.93344,0.935842,0.92068,1.0,0.987052,0.982767,0.4,0.4,0.4,1.0,0.4,0.4,0.4,0.4,0.4,0.6,0.4,0.4,0.027911,0.039542,0.013712,0.077176,0.017729,0.004203,0.002529,1.0,0.45658,0.022387,0.001553,0.001974,0.057582
20,6316050959,201601,13,13,20160527,A3,A3,False,4,201510,0.54604,0.562954,0.478738,0.469202,1.0,0.635681,0.497207,0.425892,0.63674,0.605929,0.800565,0.749204,0.875536,1.0,0.83691,0.845494,0.832618,0.922747,0.76824,0.875536,0.841202,0.88412,0.888412,0.832618,0.398085,0.397884,0.463554,0.278969,1.0,0.373191,0.357261,0.446218,0.304221,0.494931,0.500195,0.527298,0.80303,0.939394,0.909091,0.80303,0.924242,0.954545,0.848485,0.969697,0.848485,0.939394,0.909091,1.0,0.893746,0.935279,0.832683,0.887573,0.666934,0.773275,0.775071,0.742343,0.892454,0.862091,0.968398,1.0,0.005382
21,6316050959,201701,13,13,20170602,A3,A3,False,4,201610,0.841629,1.0,0.46398,0.727851,0.827257,0.833377,0.692647,0.493261,0.767295,0.799495,0.875649,0.850043,0.90678,0.995763,0.800847,0.860169,0.877119,0.851695,0.834746,0.809322,0.868644,1.0,0.923729,0.889831,0.526337,1.0,0.557275,0.577379,0.748574,0.549599,0.697236,0.322021,0.606085,0.445783,0.869735,0.631521,0.895522,1.0,0.850746,0.940299,0.80597,0.910448,0.910448,0.925373,0.850746,0.970149,0.895522,0.955224,1.0,0.832975,0.805947,0.802213,0.780083,0.834102,0.749461,0.844597,0.853469,0.991134,0.92832,0.981181,0.001787
22,6316050959,201801,13,13,20180530,A3,A3,False,4,201710,0.685414,0.626635,0.565953,0.462439,1.0,0.748207,0.701138,0.626312,0.747924,0.797571,0.859976,0.806836,0.891304,0.982609,0.882609,0.917391,1.0,0.934783,0.947826,0.986957,0.969565,0.956522,0.904348,0.926087,0.646449,0.526491,0.629606,0.429215,0.823174,0.774601,0.847812,0.339936,0.77255,0.598616,1.0,0.683795,0.861111,0.902778,0.847222,0.875,0.847222,0.958333,1.0,0.875,0.902778,0.819444,0.833333,0.972222,0.95249,0.977659,0.932287,0.967612,0.938082,0.964455,0.832441,1.0,0.918528,0.971441,0.888843,0.883092,0.000574
32,6316051308,201602,14,14,20160901,A4,A4,False,4,201511,0.673329,0.665315,0.494656,0.605254,0.473558,0.660351,0.81056,0.584803,0.481162,0.658124,0.55602,1.0,0.793814,0.740206,1.0,0.795876,0.752577,0.713402,0.789691,0.769072,0.853608,0.709278,0.727835,0.76701,0.82505,0.68717,0.705994,0.484747,0.858837,0.692362,1.0,0.714213,0.710223,0.702456,0.684848,0.736853,0.866667,0.741667,0.95,0.808333,0.683333,0.658333,1.0,0.775,0.7,0.758333,0.55,0.7,0.051611,0.260291,0.095393,0.515546,0.057538,0.247733,0.234867,0.237508,0.038574,0.207737,0.200752,1.0,0.027037


## 前期格付のみの精度

In [29]:
# Transition table: previous-period rating symbol (rows) versus current rating symbol (columns).
pd.crosstab(df_all_selected['recksym_kakuzuke_zenki_kigo'], df_all_selected['recksym_kakuzuke_kigo'])

recksym_kakuzuke_kigo,0,A1,A2,A3,A4,A5,A6,B1,B2,B3,B4,C,D,E
recksym_kakuzuke_zenki_kigo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,5,0,0,1,34,28,77,15,4,11,4,10,11,0
A1,0,23,0,0,0,0,0,0,0,0,0,0,0,0
A2,0,1,211,1,0,0,0,0,0,0,0,0,0,0
A3,0,1,13,559,35,13,3,0,0,0,0,0,0,0
A4,0,0,0,35,580,97,35,3,0,0,0,0,0,0
A5,0,0,0,10,105,357,96,4,2,3,1,0,0,0
A6,4,0,0,2,51,115,1058,49,7,29,5,1,0,1


In [30]:
# Baseline AUC when the encoded previous-period rating alone is used as the score.
# Requires the '前期格付' column, which exists only if the optional cell that encodes
# recksym_kakuzuke_zenki_kigo was run before df_all_selected was built.
round(roc_auc_score(df_all_selected['rekka_flg'], df_all_selected['前期格付']),10)

0.82843838030000005

## 全体の精度

In [110]:
from scipy.optimize import bisect

# Find the threshold at which recall on the normal class reaches 90%
def calc_diff_90percent(df, target_recall=0.9):
    '''Build the objective function used to search for the score threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``rekka_flg`` (0/False = normal) and ``rekka_score`` columns.
        It is not modified.
    target_recall : float, optional
        Desired recall on the normal class (default 0.9).

    Returns
    -------
    callable
        ``f(thres)`` returning ``normal recall at thres - target_recall``, where
        a row is predicted deteriorated when ``rekka_score > thres``.

    Raises
    ------
    ZeroDivisionError
        When the returned function is called and ``df`` has no normal rows.
    '''
    # Only the scores of the normal rows matter for normal-class recall. Extracting them
    # once avoids writing a 'pred_rekka' column into the caller's frame on every call.
    normal_scores = df.loc[df['rekka_flg'] == 0, 'rekka_score'].values
    n_normal = len(normal_scores)

    def _curried(thres):
        # A normal row is correctly classified when it is NOT flagged (score > thres).
        n_correct = int((~(normal_scores > thres)).sum())
        seijo_recall = n_correct / n_normal
        return seijo_recall - target_recall
    return _curried

# Threshold computation
def calc_thres(df):
    '''Return the score threshold that gives 90% recall on the normal class.

    The root of ``calc_diff_90percent(df)`` is located by bisection on the
    interval [0, 1] and rounded to 6 decimal places.
    '''
    recall_gap = calc_diff_90percent(df)
    root = bisect(recall_gap, 0, 1)
    return round(root, 6)

In [111]:
# AUC of the test-fold scores, computed separately for each fold_id.
fold_queries = ('fold_id==1', 'fold_id==2', 'fold_id==3', 'fold_id==4')
auc_lis = []
for fold_query in fold_queries:
    fold_rows = df_results.query(fold_query)
    auc_lis.append(round(roc_auc_score(fold_rows['rekka_flg'], fold_rows['rekka_score']),10))
auc_fold_1, auc_fold_2, auc_fold_3, auc_fold_4 = auc_lis

print('AUC fold_id 1: {0}'.format(auc_fold_1))
print('AUC fold_id 2: {0}'.format(auc_fold_2))
print('AUC fold_id 3: {0}'.format(auc_fold_3))
print('AUC fold_id 4: {0}'.format(auc_fold_4))
# Mean and (population) standard deviation across the four folds.
print('AUC mean : {0}'.format(np.mean(auc_lis)))
print('AUC std : {0}'.format(np.std(auc_lis)))

AUC fold_id 1: 0.6050125184
AUC fold_id 2: 0.6102234163
AUC fold_id 3: 0.6245764029
AUC fold_id 4: 0.6004024541
AUC mean : 0.610053697925
AUC std : 0.009076036614219097


In [112]:
# Per-fold threshold (90% recall on the normal class) and classification report.
fold_queries = ('fold_id==1', 'fold_id==2', 'fold_id==3', 'fold_id==4')
fold_thresholds = [calc_thres(df_results.query(fold_query)) for fold_query in fold_queries]
thres_fold_1, thres_fold_2, thres_fold_3, thres_fold_4 = fold_thresholds

# NOTE(review): calc_diff_90percent flags a row with `score > thres`, while the reports
# below use `score >= thres`; rows scoring exactly at the threshold are treated differently.
fold_reports = []
for fold_query, fold_thres in zip(fold_queries, fold_thresholds):
    fold_rows = df_results.query(fold_query)
    fold_reports.append(
        classification_report(fold_rows['rekka_flg'], fold_rows['rekka_score']>=fold_thres, digits=3))
cr_fold_1, cr_fold_2, cr_fold_3, cr_fold_4 = fold_reports

print('Threshold: {0}, \nClassification Report fold_id 1: \n{1}'.format(thres_fold_1, cr_fold_1))
print('Threshold: {0}, \nClassification Report fold_id 2: \n{1}'.format(thres_fold_2, cr_fold_2))
print('Threshold: {0}, \nClassification Report fold_id 3: \n{1}'.format(thres_fold_3, cr_fold_3))
print('Threshold: {0}, \nClassification Report fold_id 4: \n{1}'.format(thres_fold_4, cr_fold_4))

Threshold: 0.037759, 
Classification Report fold_id 1: 
             precision    recall  f1-score   support

      False      0.959     0.900     0.928       901
       True      0.082     0.186     0.113        43

avg / total      0.919     0.868     0.891       944

Threshold: 0.033245, 
Classification Report fold_id 2: 
             precision    recall  f1-score   support

      False      0.964     0.900     0.931       884
       True      0.102     0.250     0.145        40

avg / total      0.926     0.872     0.897       924

Threshold: 0.035403, 
Classification Report fold_id 3: 
             precision    recall  f1-score   support

      False      0.966     0.901     0.932       919
       True      0.062     0.171     0.091        35

avg / total      0.933     0.874     0.902       954

Threshold: 0.028518, 
Classification Report fold_id 4: 
             precision    recall  f1-score   support

      False      0.957     0.900     0.927       846
       True      0.086  

In [113]:
# Feature importances of the four fold models, highest first; show the top 5 of each.
feature_names = df_all_selected.iloc[:,10:].columns
fold_importances = []
for model_pos in (0, 1, 2, 3):
    importance = pd.Series(models[model_pos].feature_importances_, index=feature_names)
    fold_importances.append(importance.sort_values(ascending=False))
sr_fi_1, sr_fi_2, sr_fi_3, sr_fi_4 = fold_importances

for sr_fi in fold_importances:
    display(sr_fi.head(5))

8_預金_出金_金額    92
8_月末残高_普通     90
5_月末残高_普通     78
7_預金_出金_金額    77
0_預金_出金_金額    76
dtype: int64

8_預金_出金_金額     96
7_預金_入金_件数     80
7_月末残高_普通      78
11_預金_出金_金額    78
7_預金_出金_金額     75
dtype: int64

8_預金_出金_金額    102
7_預金_入金_件数     78
8_月末残高_普通      73
1_預金_入金_件数     69
7_預金_出金_金額     69
dtype: int64

11_預金_出金_金額    100
8_月末残高_普通       86
5_月末残高_普通       73
8_預金_出金_金額      73
10_月末残高_普通      68
dtype: int64

## 口座利用頻度別の精度

In [73]:
# Build the data set
df_results_cnt = df_results.copy()
# The 12-month count columns are removed from df_all_selected (and therefore from
# df_results) by the optional "drop yearly deposit aggregates" cell, so reading them
# from df_results raises KeyError whenever that cell has run. Look them up in df_all by
# row index instead: df_results rows keep the index labels they had in df_all.
# NOTE(review): this assumes df_all has a unique index - confirm if the merge step changes.
count_cols = ['YOKIN_PAY_COUNT_MEAN12', 'YOKIN_RECEIVE_COUNT_MEAN12']
usage_counts = df_all[count_cols].reindex(df_results_cnt.index)
df_results_cnt['sum_cnt'] = usage_counts['YOKIN_PAY_COUNT_MEAN12'] + usage_counts['YOKIN_RECEIVE_COUNT_MEAN12']
# Split the rows into three equally populated account-usage bands.
df_results_cnt['cut_sum_cnt'] = pd.qcut(df_results_cnt['sum_cnt'], 3)

In [74]:
# Number of records per account-usage band, split into normal and deteriorated.
cnt_seijo_count = df_results_cnt.groupby('cut_sum_cnt').apply(lambda x: len(x[x['rekka_flg']==False]))
cnt_rekka_count = df_results_cnt.groupby('cut_sum_cnt').apply(lambda x: len(x[x['rekka_flg']==True]))
cnt_count = pd.DataFrame({'seijo':cnt_seijo_count, 'rekka':cnt_rekka_count})
# `color` is the supported keyword for bar colours (`colors` is not).
cnt_count.plot(kind='bar',y=['seijo','rekka'],figsize=(12,8),stacked=True,grid=True,legend=False,color=['b','r'],rot=0)
display(cnt_count)
plt.xlabel('口座利用頻度区分（月平均）')
# The y axis shows record counts; the label used to read 'AUC', copied from the AUC plot.
plt.ylabel('データ数')
plt.title('口座利用頻度区分ごとのデータ数')
plt.show()

In [75]:
# AUC per account-usage band
usage_groups = df_results_cnt.groupby('cut_sum_cnt')
cnt_auc = usage_groups.apply(
    lambda grp: round(roc_auc_score(grp['rekka_flg'], grp['rekka_score']),10))
cnt_auc.plot(kind='bar',figsize=(12,8),grid=True,legend=False,color='b',rot=0)
display(cnt_auc)
plt.xlabel('口座利用頻度区分（月平均）')
plt.ylabel('AUC')
plt.title('口座利用頻度区分ごとのAUC')
plt.show()

In [76]:
# Classification report per account-usage band; each band gets its own threshold
# from calc_thres.
def _report_for_band(band_rows):
    band_thres = calc_thres(band_rows)
    return classification_report(band_rows['rekka_flg'], band_rows['rekka_score']>=band_thres, digits=3)

cnt_cr = df_results_cnt.groupby('cut_sum_cnt').apply(_report_for_band)
for report_text in cnt_cr:
    print(report_text)

## ファイル出力

In [89]:
# Persist the test-fold predictions of all folds.
df_results.to_csv(PATH_OUTPUT+'results.csv')
# Persist one model and its matching test features. Index 2 is the third entry of
# kfolds (trained on fold_ids [3, 4, 1]).
# NOTE(review): the model path is hard-coded instead of being built from a PATH_* constant.
with open('../models/model.pickle', mode='wb') as fp:
    pickle.dump(models[2], fp)
with open(PATH_OUTPUT+'X_test.pickle', mode='wb') as fp:
    pickle.dump(X_tests[2], fp)