In [1]:
# # ライブラリの読み込み
# # Google Colabで実行する場合はコメントアウトを外して実行してください
# !pip install numpy==1.23.1
# !pip install pandas==1.5.3
# !pip install sklearn==1.2.2
# !pip install optuna==3.2.0
# !pip install xgboost==1.7.4

import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import optuna
import xgboost
import xgboost as xgb
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
# ローカルでの実行環境: M1 Macbook Air 8コアCPU 7コアGPU 16GBメモリ
print(np.__version__) # 1.23.1
print(pd.__version__) # 1.5.3
print(sklearn.__version__) # 1.2.2
print(optuna.__version__) # 3.2.0
print(xgboost.__version__) # 1.7.4

1.23.1
1.5.3
1.2.2
3.2.0
1.7.4


### データ読み込み

In [3]:
# # Google Colaboratoryで作業する場合はこちらも実行してください。
# from google.colab import drive
# drive.mount('/content/drive')
# # %cd 以降にこのnotebookを置いているディレクトリを指定してください。
# %cd "/content/drive/MyDrive/Colab Notebooks/GCI_HomeCredit"

In [4]:
# データの読み込み
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")
df = pd.concat([train, test], axis=0)
df.reset_index(drop=True)

# 結合したことでfloatになった"TARGET"をIntに変換
df["TARGET"] = df["TARGET"].astype("Int64")

# 列を省略せずに表示
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
df.head(10)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,0,0,Cash loans,F,N,N,0,112500.0,755190.0,36328.5,675000.0,Unaccompanied,Working,Higher education,Married,House / apartment,0.010032,-9233,-878,-333.0,-522,,1,1,1,1,0,0,Core staff,2.0,2,2,0,1,1,0,1,1,School,,0.372591,,0.0,0.0,0.0,0.0,-292.0,,,,
1,1,0,Cash loans,F,N,Y,0,225000.0,585000.0,16893.0,585000.0,Unaccompanied,Pensioner,Secondary / secondary special,Married,House / apartment,0.008019,-20148,365243,-4469.0,-3436,,1,0,0,1,0,0,,2.0,2,2,0,0,0,0,0,0,XNA,,0.449567,0.553165,0.0,0.0,0.0,0.0,-617.0,0.0,0.0,0.0,1.0
2,2,0,Cash loans,F,N,Y,0,54000.0,334152.0,18256.5,270000.0,Family,State servant,Secondary / secondary special,Married,House / apartment,0.00496,-18496,-523,-3640.0,-2050,,1,1,1,1,1,0,Core staff,2.0,2,2,0,0,0,0,0,0,Postal,,0.569503,,4.0,0.0,4.0,0.0,-542.0,,,,
3,3,0,Cash loans,F,N,Y,0,67500.0,152820.0,8901.0,135000.0,Children,Pensioner,Lower secondary,Widow,House / apartment,0.005002,-24177,365243,-4950.0,-3951,,1,0,0,1,1,0,,1.0,3,3,0,0,0,0,0,0,XNA,,0.105235,0.767523,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,1,Cash loans,M,N,N,0,157500.0,271066.5,21546.0,234000.0,Unaccompanied,Commercial associate,Secondary / secondary special,Married,With parents,0.006296,-10685,-697,-5101.0,-3226,,1,1,1,1,0,0,Drivers,2.0,3,3,0,0,0,0,1,1,Business Entity Type 3,0.342344,0.20249,0.669057,0.0,0.0,0.0,0.0,-1243.0,0.0,0.0,0.0,4.0
5,5,1,Cash loans,F,N,Y,0,135000.0,247500.0,19971.0,247500.0,Unaccompanied,Commercial associate,Secondary / secondary special,Married,House / apartment,0.018209,-16795,-425,-2796.0,-324,,1,1,0,1,0,0,Cleaning staff,2.0,3,3,0,0,0,0,0,0,Transport: type 4,,0.552871,,0.0,0.0,0.0,0.0,-307.0,,,,
6,6,0,Cash loans,F,N,Y,0,189000.0,332946.0,17127.0,238500.0,Unaccompanied,Commercial associate,Secondary / secondary special,Married,House / apartment,0.011657,-14519,-4089,-8110.0,-5499,,1,1,0,1,0,0,Cooking staff,2.0,1,1,0,1,1,0,0,0,Trade: type 3,,0.658005,0.399676,6.0,0.0,6.0,0.0,-1918.0,0.0,0.0,0.0,3.0
7,7,1,Cash loans,M,N,Y,0,157500.0,203760.0,24309.0,180000.0,Unaccompanied,Working,Higher education,Single / not married,House / apartment,0.006671,-8624,-379,-1793.0,-1192,,1,1,0,1,0,0,,1.0,2,2,0,0,0,0,0,0,Business Entity Type 3,,0.287687,,0.0,0.0,0.0,0.0,0.0,,,,
8,8,0,Cash loans,M,Y,N,1,324000.0,808650.0,22234.5,675000.0,Unaccompanied,State servant,Higher education,Married,House / apartment,0.010643,-14614,-612,-5137.0,-4040,1.0,1,1,0,1,0,0,Managers,3.0,2,2,0,0,0,0,1,1,Military,,0.707878,0.629674,1.0,0.0,1.0,0.0,-1.0,0.0,0.0,0.0,0.0
9,9,1,Cash loans,F,N,Y,1,180000.0,207117.0,16492.5,171000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.020713,-17604,-5443,-455.0,-916,,1,1,0,1,0,0,Sales staff,3.0,3,3,0,0,0,1,1,0,Self-employed,0.509716,0.277001,,1.0,0.0,1.0,0.0,-905.0,,,,


In [5]:
print("trainのサイズ:", train.shape)
print("testのサイズ:", test.shape)
print("dfのサイズ:", df.shape)

trainのサイズ: (171202, 51)
testのサイズ: (61500, 50)
dfのサイズ: (232702, 51)


### 欠損値

In [6]:
# 欠損の数
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
df_miss = pd.concat([train.isnull().sum(), test.isnull().sum().astype('Int64'), df.isnull().sum()], axis=1)
df_miss.columns = ['Train', 'Test', "All"]
df_miss

Unnamed: 0,Train,Test,All
SK_ID_CURR,0,0.0,0
TARGET,0,,61500
NAME_CONTRACT_TYPE,0,0.0,0
CODE_GENDER,0,0.0,0
FLAG_OWN_CAR,34241,12334.0,46575
FLAG_OWN_REALTY,34241,12334.0,46575
CNT_CHILDREN,0,0.0,0
AMT_INCOME_TOTAL,0,0.0,0
AMT_CREDIT,0,0.0,0
AMT_ANNUITY,6,3.0,9


### 特徴量エンジニアリング・欠損値補完
- 参考: https://www.kaggle.com/c/home-credit-default-risk/discussion/64821
- 引用1: https://www.kaggle.com/code/hikmetsezen/base-model-with-0-804-auc-on-home-credit
- 引用2: https://github.com/keiju1120/GCI2020-Winter/blob/master/Competition2/competition2-sub.ipynb
- 引用3: https://github.com/Hirochon/GCI2020-Summer/blob/master/Competition2/3rd_place_solution.ipynb


In [7]:
def create_new_features(df):

    # --オリジナル--
    # 'AMT_INCOME_TOTAL' が 20000000 以上の場合、欠損値 (NaN) に置き換える
    df.loc[df['AMT_INCOME_TOTAL'] >= 20000000, 'AMT_INCOME_TOTAL'] = np.nan
    # annuity loan（定期的に支払うローン金額）に対して何倍の与信額が与えられているのか
    df["Credit_Annuity_Ratio"] = df["AMT_CREDIT"] / df["AMT_ANNUITY"]
    # 融資対象となる商品の値段に対して何倍の与信額が与えられているのか
    df["Credit_Goods_Ratio"] = df["AMT_CREDIT"] / df["AMT_GOODS_PRICE"]
    # 申し込み時点での年齢
    df["Age"] = -df["DAYS_BIRTH"] / 365
    # 生まれた日と雇われた日の倍率
    df['Days_Employed_Birth_Ratio'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    # 与信額に対する収入額の倍率
    df["Income_Annuity_Ratio"] = df["AMT_INCOME_TOTAL"] / df["AMT_ANNUITY"]
    # 与信額に対する収入の倍率
    df["Income_Credit_Ratio"] = df["AMT_INCOME_TOTAL"] / df["AMT_CREDIT"]
    # 家族の人数と収入
    df["Income_Per_Person"] = df["AMT_INCOME_TOTAL"] / df["CNT_FAM_MEMBERS"]
    # Ageのビニング(最小値20, 最大70の間で11分割) →若いほど債券不履行になりやすい / ラベルエンコーディング
    # df['Age_Binning'] = pd.cut(df['Age'], bins = np.linspace(20, 70, num = 11))
    df['Age_Binning'] = pd.cut(df['Age'], [27, 40, 50, 65, 99])
    df['Age_Bin_Code'] = LabelEncoder().fit_transform(df['Age_Binning'])
    # 収入のビニング/ラベルエンコーディング
    df['Income_Binning'] = pd.cut(df['AMT_INCOME_TOTAL'], [0, 1e+05, 3e+05, 5e+05, 1e+06, 2e+08])
    df['Income_Bin_Code'] = LabelEncoder().fit_transform(df['Income_Binning'])
    # --オリジナル終わり--

    # --引用1--
    df['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan, inplace=True) # set null value
    # 各種収入の割合
    df['Income_Employed_Ratio'] = df['AMT_INCOME_TOTAL'] / df['DAYS_EMPLOYED']
    df['Income_Birth_Ratio'] = df['AMT_INCOME_TOTAL'] / df['DAYS_BIRTH']
    # 各種Daysに関する割合
    df['Id_Birth_Ratio'] = df['DAYS_ID_PUBLISH'] / df['DAYS_BIRTH']
    df['Car_Birth_Ratio'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH']
    df['Car_Employed_Ratio'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED']
    df['Phone_Birth_Ratio'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_BIRTH']
    # 外部データの積
    df["Ext_Sources_Prod"] = df["EXT_SOURCE_1"] * df["EXT_SOURCE_2"] * df["EXT_SOURCE_3"]
    # 外部データの重みづけ積
    df["Ext_Weighted_Prod"] = df["EXT_SOURCE_1"]*2 + df["EXT_SOURCE_2"]*1 + df["EXT_SOURCE_3"]*3
    # --引用1終わり--

    #　--引用2--
    # 外部データの最小値、最大値、平均、中央値、分散を計算し、新たな特徴量を作成
    for function_name in ['min', 'max', 'mean', 'nanmedian', 'var']:
        feature_name = 'Ext_Sources_{}'.format(function_name.upper())
        df[feature_name] = eval('np.{}'.format(function_name))(
            df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)
    # --引用2終わり--

    # --引用3--
    df['EXT_23_mean'] = (df['EXT_SOURCE_2'] + df['EXT_SOURCE_3']) / 2
    df['EXT_12_mean'] = (df['EXT_SOURCE_1'] + df['EXT_SOURCE_2']) / 2
    df['EXT_13_mean'] = (df['EXT_SOURCE_1'] + df['EXT_SOURCE_3']) / 2
    df['EXT_23_sabun'] = abs(df['EXT_SOURCE_2'] - df['EXT_SOURCE_3'])
    df['EXT_12_sabun'] = abs(df['EXT_SOURCE_1'] - df['EXT_SOURCE_2'])
    df['EXT_13_sabun'] = abs(df['EXT_SOURCE_1'] - df['EXT_SOURCE_3'])
    # --引用3終わり--
    return df

### LabelEncoder関数

In [8]:
def label_encoder(df, categorical_columns=None):
    """Encode categorical values as integers (0,1,2,3...) with pandas.factorize. """
    if not categorical_columns:
        categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    for col in categorical_columns:
        df[col], uniques = pd.factorize(df[col])
    return df, categorical_columns

### 使わないカラムを指定する関数

In [9]:
def drop_columns(df):
    drop_list = [
    "FLAG_OWN_CAR", "FLAG_OWN_REALTY", "CNT_CHILDREN", "NAME_TYPE_SUITE", "NAME_HOUSING_TYPE",
    "FLAG_MOBIL", "FLAG_EMP_PHONE", "FLAG_CONT_MOBILE", "FLAG_PHONE", "FLAG_EMAIL", "REGION_RATING_CLIENT",
    "REG_REGION_NOT_LIVE_REGION", "REG_REGION_NOT_WORK_REGION", "LIVE_REGION_NOT_WORK_REGION", "REG_CITY_NOT_LIVE_CITY",
    "REG_CITY_NOT_WORK_CITY", "LIVE_CITY_NOT_WORK_CITY", "AMT_REQ_CREDIT_BUREAU_HOUR", "AMT_REQ_CREDIT_BUREAU_MON",
    "Age_Binning", "Income_Binning", "Income_Bin_Code"
        ]
    droped_df = df.drop(columns=drop_list)
    return droped_df

### データ形成

In [10]:
added_features_df = create_new_features(df)
dropped_df = drop_columns(added_features_df)
label_encoded_df, encoded_columns = label_encoder(dropped_df, None)
df = label_encoded_df
train = df[df.loc[:, 'SK_ID_CURR'] < 171202]
test = df[df.loc[:, 'SK_ID_CURR'] > 171201]
train_x = train.drop(columns=["TARGET", "SK_ID_CURR"])
train_y = train["TARGET"]
test_x = test.drop(columns=["TARGET", "SK_ID_CURR"])
X_values = train_x.values
y_values = train_y.values
y_values = y_values.astype(int)

df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_WORK_PHONE,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT_W_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,Credit_Annuity_Ratio,Credit_Goods_Ratio,Age,Days_Employed_Birth_Ratio,Income_Annuity_Ratio,Income_Credit_Ratio,Income_Per_Person,Age_Bin_Code,Income_Employed_Ratio,Income_Birth_Ratio,Id_Birth_Ratio,Car_Birth_Ratio,Car_Employed_Ratio,Phone_Birth_Ratio,Ext_Sources_Prod,Ext_Weighted_Prod,Ext_Sources_MIN,Ext_Sources_MAX,Ext_Sources_MEAN,Ext_Sources_NANMEDIAN,Ext_Sources_VAR,EXT_23_mean,EXT_12_mean,EXT_13_mean,EXT_23_sabun,EXT_12_sabun,EXT_13_sabun
0,0,0,0,0,112500.0,755190.0,36328.5,675000.0,0,0,0,0.010032,-9233,-878,-333.0,-522,,1,0,2.0,2,0,,0.372591,,0.0,0.0,0.0,0.0,-292.0,,,20.787811,1.1188,25.29589,0.095094,3.096742,0.148969,56250.0,4,-128.132118,-12.184555,0.056536,,,0.031626,,,0.372591,0.372591,0.372591,0.372591,0.0,,,,,,
1,1,0,0,0,225000.0,585000.0,16893.0,585000.0,1,1,0,0.008019,-20148,365243,-4469.0,-3436,,0,-1,2.0,2,1,,0.449567,0.553165,0.0,0.0,0.0,0.0,-617.0,0.0,1.0,34.629728,1.0,55.2,-18.128003,13.319126,0.384615,112500.0,2,0.616028,-11.167362,0.170538,,,0.030623,,,0.449567,0.553165,0.501366,0.501366,0.002683,0.501366,,,0.103598,,
2,2,0,0,0,54000.0,334152.0,18256.5,270000.0,2,1,0,0.00496,-18496,-523,-3640.0,-2050,,1,0,2.0,2,2,,0.569503,,4.0,0.0,4.0,0.0,-542.0,,,18.30318,1.2376,50.673973,0.028276,2.957851,0.161603,27000.0,2,-103.250478,-2.91955,0.110835,,,0.029304,,,0.569503,0.569503,0.569503,0.569503,0.0,,,,,,
3,3,0,0,0,67500.0,152820.0,8901.0,135000.0,1,2,1,0.005002,-24177,365243,-4950.0,-3951,,0,-1,1.0,3,1,,0.105235,0.767523,0.0,0.0,0.0,0.0,,0.0,0.0,17.168857,1.132,66.238356,-15.107044,7.583418,0.441696,67500.0,3,0.184808,-2.79191,0.16342,,,,,,0.105235,0.767523,0.436379,0.436379,0.109657,0.436379,,,0.662289,,
4,4,1,0,1,157500.0,271066.5,21546.0,234000.0,3,1,0,0.006296,-10685,-697,-5101.0,-3226,,1,1,2.0,3,3,0.342344,0.20249,0.669057,0.0,0.0,0.0,0.0,-1243.0,0.0,4.0,12.580827,1.158404,29.273973,0.065232,7.309942,0.581038,78750.0,0,-225.968436,-14.74029,0.301919,,,0.116331,0.04638,2.894348,0.20249,0.669057,0.40463,0.342344,0.038221,0.435773,0.272417,0.5057,0.466567,0.139854,0.326713


In [11]:
# 欠損の数
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
df_miss = pd.concat([train.isnull().sum(), test.isnull().sum().astype('Int64'), df.isnull().sum()], axis=1)
df_miss.columns = ['Train', 'Test', "All"]
df_miss

Unnamed: 0,Train,Test,All
SK_ID_CURR,0,0,0
TARGET,0,61500,61500
NAME_CONTRACT_TYPE,0,0,0
CODE_GENDER,0,0,0
AMT_INCOME_TOTAL,0,1,1
AMT_CREDIT,0,0,0
AMT_ANNUITY,6,3,9
AMT_GOODS_PRICE,163,53,216
NAME_INCOME_TYPE,0,0,0
NAME_EDUCATION_TYPE,0,0,0


In [12]:
print("trainのサイズ:", train.shape)
print("testのサイズ:", test.shape)
print("dfのサイズ:", df.shape)

trainのサイズ: (171202, 59)
testのサイズ: (61500, 59)
dfのサイズ: (232702, 59)


### 特徴量重要度

In [13]:
# 特徴量表示
# 下記コードを実行する際は、特徴量を一度全て使用して特徴量の重要度を確認してください。

fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
df_test_score = pd.DataFrame()
df_train_score = pd.DataFrame()
# 学習させた訓練モデルを格納する辞書。ここではデータを5分割するから一つのモデル名に対して5つの学習モデルが生成され、格納される。
dict_kfold_trained_models = {}
list_models = [XGBClassifier()]
for model in list_models:
    # modelの名前をstrとして変換し、printする
    model_name = type(model).__name__
    for i, (train_index, test_index) in enumerate(fold.split(X_values, y_values)):
        X_train, y_train = X_values[train_index], y_values[train_index]
        X_valid, y_valid = X_values[test_index], y_values[test_index]
        model.fit(X_train, y_train)
        # 学習済みモデルを辞書に格納
        # もしmodel_nameをキーとしてそのモデルが存在していなかったらmodel_nameをキー、fitさせたmodelを値として追加する。
        if model_name not in dict_kfold_trained_models:
            dict_kfold_trained_models[model_name] = [model]
        # もしすでにmodel_nameをキーとしてそのモデルが存在していたら、model_nameをキー、fitさせたmodelを値として追加する。
        # キー = model_nameに対して値 = リストとすることで、一つのmodel_nameに対して複数のmodel(n_splits個数分)を格納できる
        else:
            dict_kfold_trained_models[model_name].append(model)
        test_score = model.score(X_valid, y_valid)
        train_score = model.score(X_train, y_train)
        df_test_score.at[i, model_name] = test_score
        df_train_score.at[i, model_name] = train_score
        # print(f"test score:{test_score:.3f}\t train score:{train_score:.3f}")
# 特徴量の重要度をdataframeにまとめる
# Feature importanceを格納するDataFrame
df_feature_importance = pd.DataFrame(index=train_x.columns)

for index, (model_name, models) in enumerate(dict_kfold_trained_models.items()):
        model = models[i]
        if model_name in ['XGBClassifier']:
            try:
                # 特徴量の重要度を取得
                feature_importance = model.feature_importances_
            except AttributeError:
            # LightGBMの場合は、feature_importances_の代わりにfeature_importance()を使う
                feature_importance = model.feature_importance(importance_type='gain')
            # DataFrameに格納
            df_feature_importance[model_name] = feature_importance

# 特徴量の高い順に表示
df_feature_importance.sort_values(by="XGBClassifier", ascending=False)

Unnamed: 0,XGBClassifier
Ext_Sources_MEAN,0.162073
Age_Bin_Code,0.034425
Ext_Sources_NANMEDIAN,0.03242
NAME_EDUCATION_TYPE,0.025482
NAME_CONTRACT_TYPE,0.02468
CODE_GENDER,0.024399
Credit_Annuity_Ratio,0.023293
REGION_RATING_CLIENT_W_CITY,0.021839
EXT_23_mean,0.021698
Credit_Goods_Ratio,0.020039


### 交差検証・Optunaパラメタチューニング

In [14]:
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cv = list(fold.split(X_values, y_values))

In [15]:
def fit_xgb(X, y, cv, params: dict=None, verbose=100):
    # out of foldでの予測値を保存するための配列
    oof_preds = np.zeros(X.shape[0])
    # if params is None:
    #     params = {'tree_method': 'gpu_hist'}  # <-- GPUサポートを追加 （Optunaでのパラメータ探索のためGoogle ColabのGPUを使う時に使用）
    # else:
    #     params['tree_method'] = 'gpu_hist'  # <-- GPUサポートを追加（Optunaでのパラメータ探索のためGoogle ColabのGPUを使う時に使用）
    if params is None:
        params = {}

    models = []

    for i, (idx_train, idx_valid) in enumerate(cv):
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]

        clf = XGBClassifier(**params, random_state=0, n_estimators=10000)
        clf.fit(x_train, y_train,
                eval_set=[(x_valid, y_valid)],
                early_stopping_rounds=100,
                eval_metric='auc',
                verbose=verbose)

        models.append(clf)
        oof_preds[idx_valid] = clf.predict_proba(x_valid)[:, 1]
        print('Fold %2d AUC: %.6f' % (i + 1, roc_auc_score(y_valid, oof_preds[idx_valid])))

    score = roc_auc_score(y, oof_preds)
    print('Final AUC score: %.6f' % score)
    return oof_preds, models

In [16]:
# Optunaによるパラメータ探索
def objective_xgb(trial):
    params ={
        # 'tree_method': 'gpu_hist',  # <-- GPUサポートを追加 （Optunaでのパラメータ探索のためGoogle ColabのGPUを使う時に使用）
        'max_depth':trial.suggest_int("max_depth",3,7),
        'min_child_weight':trial.suggest_int('min_child_weight',1,5),
        'gamma':trial.suggest_uniform('gamma',0,1),
        'subsample':trial.suggest_uniform('subsample',0.4,1),
        'colsample_bytree':trial.suggest_uniform('colsample_bytree',0.1,0.8),
        'reg_alpha':trial.suggest_loguniform('reg_alpha',1,5),
        'reg_lambda':trial.suggest_loguniform('reg_lambda',1e-2,0.1),
        'learning_rate':trial.suggest_uniform('learning_rate',0.01,0.03)
        }
    auc_scores = []
    for i, (idx_train, idx_valid) in enumerate(cv):
        X_train, y_train = X_values[idx_train], y_values[idx_train]
        X_valid, y_valid = X_values[idx_valid], y_values[idx_valid]

        clf = XGBClassifier(**params, use_label_encoder=False, n_estimators=10000)
        clf.fit(X_train, y_train,
                eval_set=[(X_valid, y_valid)],
                early_stopping_rounds=100,
                eval_metric='auc',
                verbose=False)
        y_valid_pred = clf.predict_proba(X_valid)[:, 1]
        auc = roc_auc_score(y_valid, y_valid_pred)
        auc_scores.append(auc)
    return np.mean(auc_scores)


In [17]:
# Optunaによるパラメータチューニングによって探索したパラメータ
params1 = {'max_depth': 7, 'min_child_weight': 4, 'gamma': 0.22829762987848945, 'subsample': 0.8517472109774532, 'colsample_bytree': 0.20586154299632436, 'reg_alpha': 1.4817187990395, 'reg_lambda': 0.05840996799805297, 'learning_rate': 0.011946775847112156}

# # Optunaを使ってパラメータチューニングするときはこちらを使う
# study = optuna.create_study(direction='maximize')
# study.optimize(objective_xgb, n_trials=5)
# optuna_best_params = study.best_params

oof, models = fit_xgb(X_values, y_values, cv=cv, params = params1)

[0]	validation_0-auc:0.61232
[100]	validation_0-auc:0.74417
[200]	validation_0-auc:0.74761
[300]	validation_0-auc:0.75163
[400]	validation_0-auc:0.75596
[500]	validation_0-auc:0.75936
[600]	validation_0-auc:0.76163
[700]	validation_0-auc:0.76340
[800]	validation_0-auc:0.76472
[900]	validation_0-auc:0.76512
[1000]	validation_0-auc:0.76544
[1100]	validation_0-auc:0.76586
[1200]	validation_0-auc:0.76590
[1300]	validation_0-auc:0.76612
[1400]	validation_0-auc:0.76619
[1475]	validation_0-auc:0.76603
Fold  1 AUC: 0.766315
[0]	validation_0-auc:0.60566
[100]	validation_0-auc:0.73947
[200]	validation_0-auc:0.74364
[300]	validation_0-auc:0.74792
[400]	validation_0-auc:0.75216
[500]	validation_0-auc:0.75599
[600]	validation_0-auc:0.75860
[700]	validation_0-auc:0.76053
[800]	validation_0-auc:0.76203
[900]	validation_0-auc:0.76296
[1000]	validation_0-auc:0.76361
[1100]	validation_0-auc:0.76451
[1200]	validation_0-auc:0.76493
[1300]	validation_0-auc:0.76524
[1400]	validation_0-auc:0.76539
[1500]	val

In [18]:
# # Optunaによる最適パラメータ探索の結果を表示
# print(study.best_params)

### CSVファイルへの出力

In [19]:
pred = np.array([model.predict_proba(test_x.values)[:, 1] for model in models])
pred = np.mean(pred, axis=0)
submission['TARGET'] = pred
submission.to_csv('xgboost_5fold.csv', index=False)