In [1]:
#貸し倒れの有無を予測する2値分類タスク
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import re
import pickle
import gc

from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.model_selection import StratifiedKFold
#AUC：2値分類タスクの評価指標。1に近いほど精度が高く、0.5だとバラバラ
from sklearn.metrics import roc_auc_score#AUCスコア計算用

import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

In [3]:
#ファイルの読み込み、データの確認
application_train = pd.read_csv("/kaggle/input/home-credit-default-risk/application_train.csv")
print(application_train.shape)
application_train.head()

(307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
#各カラムのデータ型を最適化し、メモリ使用量を削減する関数
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            pass

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of after optimization is {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100* (start_mem - end_mem) / start_mem))

    return df

In [5]:
#上記関数を実行
application_train = reduce_mem_usage(application_train)

Memory usage of dataframe is 286.23 MB
Memory usage of after optimization is 92.38 MB
Decreased by 67.7%


In [6]:
#データセットの作成（目的変数は"TARGET"）
x_train = application_train.drop(columns=["TARGET","SK_ID_CURR"])
y_train = application_train["TARGET"]
id_train = application_train[["SK_ID_CURR"]]#予測値にIDを付与するため

#カテゴリ変数をcategory型に変換する
for col in x_train.columns:
    if x_train[col].dtype=="object":
        x_train[col] = x_train[col].astype("category")

In [7]:
#バリテーション設計（今回はクロスバリデーション）

#1の割合、それぞれの件数を確認
print("mean:{:.4f}".format(y_train.mean()))
y_train.value_counts()

mean:0.0807


TARGET
0    282686
1     24825
Name: count, dtype: int64

In [8]:
#層化分割したバリデーションのindexのリスト作成
cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(x_train,y_train))

#indexの確認
print("train:",cv[0][0])#fold=0の学習データ
print("valid:",cv[0][1])#fold=1の検証データ

train: [     0      1      3 ... 307508 307509 307510]
valid: [     2     11     22 ... 307488 307495 307497]


In [9]:
#0fold目のindexのリスト取得
nfold = 0
idx_tr, idx_va = cv[nfold][0],cv[nfold][1]

#学習データと検証データに分離
x_tr, y_tr, id_tr = x_train.loc[idx_tr,:], y_train[idx_tr], id_train.loc[idx_tr,:]
x_va, y_va, id_va = x_train.loc[idx_va,:], y_train[idx_va], id_train.loc[idx_va,:]
print(x_tr.shape,y_train.shape,id_train.shape)
print(x_va.shape,y_va.shape,id_va.shape)

(246008, 120) (307511,) (307511, 1)
(61503, 120) (61503,) (61503, 1)


In [10]:
#モデル学習（lightGBM使用）
from lightgbm import early_stopping, log_evaluation

params = {
    'boosting_type':'gbdt',
    'objective':'binary',
    'metric':'auc',
    'learning_rate':0.05,
    'num_leaves':32,
    'n_estimators':100000,
    "random_state":123,
    "importance_type":"gain",
}

#モデルの学習
model = lgb.LGBMClassifier(**params)
verbose_eval = 100
model.fit(x_tr,
         y_tr,
         eval_set=[(x_tr,y_tr),(x_va,y_va)],
         callbacks=[
        early_stopping(stopping_rounds=100),  # 早期停止のコールバック
        log_evaluation(verbose_eval)]          # ログ表示のコールバック
         )

#モデルの保存
with open("model_lgb_fold0.pickle","wb") as f:
    pickle.dump(model,f,protocol=4)

[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.102545 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11367
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.782506	valid_1's auc: 0.755903
[200]	training's auc: 0.808961	valid_1's auc: 0.758356
[300]	training's auc: 0.829245	valid_1's auc: 0.757774
Early stopping, best iteration is:
[217]	training's auc: 0.812578	valid_1's auc: 0.758595


In [11]:
#モデル評価
#学習データの推論値取得とROC計算
y_tr_pred = model.predict_proba(x_tr)[:,1]
metric_tr = roc_auc_score(y_tr,y_tr_pred)

#検証データの推論値取得とROC計算
y_va_pred = model.predict_proba(x_va)[:,1]
metric_va = roc_auc_score(y_va,y_va_pred)

#評価値を入れる変数の作成
metrics = []

#評価値を格納
metrics.append([nfold,metric_tr,metric_va])

#結果を出力
print("[auc] tr:{:.4f}, va:{:.4f}".format(metric_tr,metric_va))

[auc] tr:0.8126, va:0.7586


In [12]:
#OOFの推論値を入れる変数の作成
train_oof = np.zeros(len(x_train))

#検証データのindexに推論値を格納
train_oof[idx_va] = y_va_pred

In [13]:
#説明変数の重要度の取得（foldごと）
imp_fold = pd.DataFrame({"col":x_train.columns,"imp":model.feature_importances_,"nfold":nfold})
#確認(重要度の上位10個)
display(imp_fold.sort_values("imp",ascending=False)[:10])

#重要度を格納する5-fold用データフレームの作成
imp = pd.DataFrame()
#imp_foldを5fold用データフレームに結合
imp = pd.concat([imp,imp_fold])

Unnamed: 0,col,imp,nfold
41,EXT_SOURCE_3,66225.020483,0
40,EXT_SOURCE_2,52568.833805,0
38,ORGANIZATION_TYPE,20218.523523,0
39,EXT_SOURCE_1,19776.252288,0
6,AMT_CREDIT,8111.321247,0
8,AMT_GOODS_PRICE,7120.960365,0
15,DAYS_BIRTH,7042.223005,0
7,AMT_ANNUITY,6992.551795,0
16,DAYS_EMPLOYED,5236.51412,0
26,OCCUPATION_TYPE,4376.651746,0


In [14]:
#モデル評価（全foldのサマリ）
#リスト型をarray型に変換
metrics = np.array(metrics)
print(metrics)

#学習/検証データの評価値の平均と標準偏差を算出
print("[cv] tr:{:.4f}+-{:.4f}, va:{:.4f}+-{:.4f}".format(metrics[:,1].mean(),metrics[:,1].std(),
                                                        metrics[:,2].mean(),metrics[:,2].std()))
#oofの評価値を算出
print("[oof]{:.4f}".format(roc_auc_score(y_train,train_oof)))

[[0.         0.81257796 0.75859528]]
[cv] tr:0.8126+-0.0000, va:0.7586+-0.0000
[oof]0.5103


In [15]:
#oofデータの推論値取得
train_oof = pd.concat([id_train,pd.DataFrame({"true":y_train,"pred":train_oof}),],axis=1)
train_oof.head()

#説明変数の重要度取得（全foldのサマリ）
imp = imp.groupby("col")["imp"].agg(["mean","std"]).reset_index(drop=False)
imp.columns = ["col","imp","imp_std"]
imp.head()

Unnamed: 0,col,imp,imp_std
0,AMT_ANNUITY,6992.551795,
1,AMT_CREDIT,8111.321247,
2,AMT_GOODS_PRICE,7120.960365,
3,AMT_INCOME_TOTAL,1595.740609,
4,AMT_REQ_CREDIT_BUREAU_DAY,128.842901,


In [16]:
#ここまでの処理の流れ（ベースライン）を関数化
def train_lgb(input_x,
              input_y,
              input_id,
             params,
             list_nfold=[0,1,2,3,4],
             n_splits=5,
             ):
    train_oof = np.zeros(len(input_x))
    metrics = []
    imp = pd.DataFrame()

    #cross-valitation
    cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(input_x,input_y))
    for nfold in list_nfold:
        print("-"*20,nfold,"-"*20)

        #make dataset
        idx_tr, idx_va = cv[nfold][0],cv[nfold][1]
        x_tr, y_tr, id_tr = input_x.loc[idx_tr,:], input_y[idx_tr], input_id.loc[idx_tr,:]
        x_va, y_va, id_va = input_x.loc[idx_va,:], input_y[idx_va], input_id.loc[idx_va,:]
        print(x_tr.shape,x_va.shape)

        #train
        model = lgb.LGBMClassifier(**params)
        verbose_eval = 100
        model.fit(x_tr,
                 y_tr,
                 eval_set=[(x_tr,y_tr),(x_va,y_va)],
                 callbacks=[
                early_stopping(stopping_rounds=100),  # 早期停止のコールバック
                log_evaluation(verbose_eval)]          # ログ表示のコールバック
                 )

        fname_lgb = "model_lgb_fold{}.pickle".format(nfold)
        #モデルの保存
        with open(fname_lgb,"wb") as f:
            pickle.dump(model,f,protocol=4)

        #evaluate
        y_tr_pred = model.predict_proba(x_tr)[:,1]
        y_va_pred = model.predict_proba(x_va)[:,1]
        metric_tr = roc_auc_score(y_tr,y_tr_pred)
        metric_va = roc_auc_score(y_va,y_va_pred)
        metrics.append([nfold,metric_tr,metric_va])
        print("[auc] tr:{:.4f}, va:{:.4f}".format(metric_tr,metric_va))

        #oof
        train_oof[idx_va] = y_va_pred

        #imp
        imp_fold = pd.DataFrame({"col":input_x.columns,"imp":model.feature_importances_,"nfold":nfold})
        imp = pd.concat([imp,imp_fold])

    print("-"*20,"result","-"*20)
    #metric
    metrics = np.array(metrics)
    print(metrics)
    print("[cv] tr:{:.4f}+-{:.4f}, va:{:.4f}+-{:.4f}".format(metrics[:,1].mean(),metrics[:,1].std(),
                                                            metrics[:,2].mean(),metrics[:,2].std()))
    print("[oof]{:.4f}".format(roc_auc_score(input_y,train_oof)))

    #oof
    train_oof = pd.concat([input_id,pd.DataFrame({"pred":train_oof})],axis=1)

    #importance
    imp = imp.groupby("col")["imp"].agg(["mean","std"]).reset_index(drop=False)
    imp.columns = ["col","imp","imp_std"]

    return train_oof, imp, metrics

In [17]:
#関数を用いて学習処理を実行
params = {
    'boosting_type':'gbdt',
    'objective':'binary',
    'metric':'auc',
    'learning_rate':0.05,
    'num_leaves':32,
    'n_estimators':100000,
    "random_state":123,
    "importance_type":"gain",
}

train_oof,imp,metrics = train_lgb(x_train,
                                 y_train,
                                 id_train,
                                 params,
                                 list_nfold=[0,1,2,3,4],
                                 n_splits=5,
                                 )

-------------------- 0 --------------------
(246008, 120) (61503, 120)
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.186603 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11367
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.782506	valid_1's auc: 0.755903
[200]	training's auc: 0.808961	valid_1's auc: 0.758356
[300]	training's auc: 0.829245	valid_1's auc: 0.757774
Early stopping, best iteration is:
[217]	training's auc: 0.812578	valid_1's auc: 0.758595
[auc] tr:0.8126, va:0.7586
-------------------- 1 

In [18]:
#説明変数の重要度確認
imp.sort_values("imp",ascending=False)[:10]

Unnamed: 0,col,imp,imp_std
38,EXT_SOURCE_3,65353.907478,1558.201212
37,EXT_SOURCE_2,54545.388309,1251.798934
102,ORGANIZATION_TYPE,21441.917474,1450.24619
36,EXT_SOURCE_1,20051.934248,685.852224
1,AMT_CREDIT,8263.228728,410.384434
22,DAYS_BIRTH,7645.58911,689.458833
2,AMT_GOODS_PRICE,7263.054566,405.837031
0,AMT_ANNUITY,6762.95364,479.302045
23,DAYS_EMPLOYED,5810.288375,552.93773
101,OCCUPATION_TYPE,5502.675859,831.872392


In [20]:
#モデル推論

#ファイルの読み込み
application_test = pd.read_csv("/kaggle/input/home-credit-default-risk/application_test.csv")
application_test = reduce_mem_usage(application_test)

#データセットの作成
x_test = application_test.drop(columns=["SK_ID_CURR"])
id_test = application_test[["SK_ID_CURR"]]

#カテゴリ変数をcategory型に変換
for col in x_test.columns:
    if x_test[col].dtype=="object":
        x_test[col] = x_test[col].astype("category")

Memory usage of dataframe is 45.00 MB
Memory usage of after optimization is 14.60 MB
Decreased by 67.6%


In [21]:
#推論用の関数を定義
def predict_lgb(input_x,
               input_id,
               list_nfold=[0,1,2,3,4],
               ):
    #推論値を格納する変数を作成
    pred = np.zeros((len(input_x),len(list_nfold)))
    for nfold in list_nfold:
        print("-"*20,nfold,"-"*20)
        fname_lgb = "model_lgb_fold{}.pickle".format(nfold)
        with open(fname_lgb,"rb") as f:#学習済みモデルの読み込み
            model = pickle.load(f)
        pred[:,nfold] = model.predict_proba(input_x)[:,1]

    #推論用データセットを作成
    pred = pd.concat([
        input_id,
        pd.DataFrame({"pred":pred.mean(axis=1)}),#推論値の平均値を計算
    ],axis=1)

    print("Done.")

    return pred

In [22]:
#関数を使って推論処理の実行
test_pred = predict_lgb(x_test,
                       id_test,
                       list_nfold=[0,1,2,3,4],
                       )

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done.


In [23]:
#提出ファイルの作成
df_submit = test_pred.rename(columns={"pred":"TARGET"})
print(df_submit.shape)
display(df_submit.head())

#ファイル出力
df_submit.to_csv("submission_baseline.csv",index=None)

(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.04181
1,100005,0.1264
2,100013,0.022495
3,100028,0.03968
4,100038,0.156628


In [None]:
#ここから特徴量エンジニアリング
