In [None]:
!pip install catboost

In [None]:
import os

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the data.
# Set INPUT_DIR to the directory that contains train.csv and the other input files.
INPUT_DIR = "./input/"

# os.path.join inserts the path separator only when it is missing, so
# INPUT_DIR works with or without a trailing slash.
data = pd.read_csv(os.path.join(INPUT_DIR, "train.csv"))
test_data = pd.read_csv(os.path.join(INPUT_DIR, "test.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR, "sample_submission.csv"))

y = data['TARGET']
# Keep the SK_ID_CURR values of the test rows; the column itself is dropped
# from the feature frame below.
test_ids = test_data['SK_ID_CURR']

# Columns removed from both train and test. The list is defined once so the
# two frames cannot drift apart and always expose the same feature columns.
DROP_COLUMNS = [
    'SK_ID_CURR',
    'FLAG_MOBIL',
    'REG_REGION_NOT_LIVE_REGION',
    'FLAG_CONT_MOBILE',
    'FLAG_EMAIL',
    'FLAG_EMP_PHONE',
    'LIVE_CITY_NOT_WORK_CITY',
    'LIVE_REGION_NOT_WORK_REGION',
    'REG_REGION_NOT_WORK_REGION',
    'REG_CITY_NOT_WORK_CITY',
    'AMT_REQ_CREDIT_BUREAU_HOUR',
    'FLAG_OWN_REALTY',
    'NAME_TYPE_SUITE',
    'CNT_CHILDREN',
    'FLAG_PHONE',
]

# The train frame additionally loses the label column.
X = data.drop(columns=['TARGET'] + DROP_COLUMNS)
X_test = test_data.drop(columns=DROP_COLUMNS)

# Stack train rows first, then test rows, so later preprocessing is applied to
# both at once. The original row labels are kept, so the index of all_data
# contains duplicate labels; select rows by position, not by label.
all_data = pd.concat([X, X_test], axis=0)

In [None]:
# Outlier handling.
# Cap OWN_CAR_AGE at 60 for every row of the combined train+test frame.
# The cap is computed from all_data itself: a boolean mask taken from the
# train-only frame `data` is matched to all_data by index label, and because
# all_data carries the train and test labels stacked on top of each other,
# such a mask would cap test rows according to unrelated train rows (or fail
# to align when the test set is longer). clip() leaves NaN values untouched.
all_data["OWN_CAR_AGE"] = all_data["OWN_CAR_AGE"].clip(upper=60)

# Treat the value 365243 in DAYS_EMPLOYED as missing.
# The result is assigned back explicitly: calling replace(..., inplace=True)
# on a column selection acts on a temporary object and does not reliably
# update all_data (it is a no-op under pandas Copy-on-Write).
all_data['DAYS_EMPLOYED'] = all_data['DAYS_EMPLOYED'].replace(365243, np.nan)

In [None]:
# Feature engineering.
# NOTE: this cell overwrites DAYS_BIRTH in place, so it must be run exactly
# once per kernel session; running it again would transform the column twice.

# Replace DAYS_BIRTH by its floor division by -360.
# NOTE(review): presumably this converts a negative day count into an age in
# years; confirm that 360 (rather than 365) is the intended divisor.
all_data['DAYS_BIRTH'] = all_data['DAYS_BIRTH'].floordiv(-360)

# Averages over pairs / the triple of the EXT_SOURCE_* columns. A row gets NaN
# whenever any of the columns involved in that average is NaN.
ext_1 = all_data['EXT_SOURCE_1']
ext_2 = all_data['EXT_SOURCE_2']
ext_3 = all_data['EXT_SOURCE_3']
all_data['EXT_23_mean'] = (ext_2 + ext_3) / 2
all_data['EXT_12_mean'] = (ext_1 + ext_2) / 2
all_data['EXT_13_mean'] = (ext_1 + ext_3) / 2
all_data['EXT_123_mean'] = (ext_1 + ext_2 + ext_3) / 3

income_total = all_data['AMT_INCOME_TOTAL']
credit_amount = all_data['AMT_CREDIT']
annuity_amount = all_data['AMT_ANNUITY']

# DAYS_EMPLOYED divided by DAYS_BIRTH. DAYS_BIRTH has already been replaced
# above, so the denominator is the floor-divided value, not the raw day count.
all_data['DAYS_EMPLOYED_PERC'] = all_data['DAYS_EMPLOYED'] / all_data['DAYS_BIRTH']
# Total income / credit amount
all_data['INCOME_CREDIT_PERC'] = income_total / credit_amount
# Total income / number of family members
all_data['INCOME_PER_PERSON'] = income_total / all_data['CNT_FAM_MEMBERS']
# Annuity (repayment amount) / total income
all_data['ANNUITY_INCOME_PERC'] = annuity_amount / income_total
# Annuity (repayment amount) / credit amount
all_data['PAYMENT_RATE'] = annuity_amount / credit_amount


In [None]:
# Label-encode the categorical (object dtype) columns of the combined frame.
# The fitted encoder of every column is kept in label_encoders, keyed by the
# column name. obj_columns is reused later as the list of categorical features.
# NOTE: once this cell has run the columns are no longer of object dtype, so
# running it a second time would produce an empty obj_columns.
obj_columns = all_data.select_dtypes(include=['object']).columns.tolist()
print(obj_columns)

label_encoders = {}
for column in obj_columns:
    # Cast to str first so missing values become an ordinary string label
    # instead of mixing float NaN with strings inside the encoder.
    column_as_text = all_data[column].astype(str)
    encoder = LabelEncoder()
    encoder.fit(column_as_text)
    all_data[column] = encoder.transform(column_as_text)
    label_encoders[column] = encoder

['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'ORGANIZATION_TYPE']


In [None]:
# Split the preprocessed frame back into train and test by position:
# the first n_train_rows rows are the train rows, everything after is test.
n_train_rows = X.shape[0]
X = all_data.iloc[:n_train_rows]
X_test = all_data.iloc[n_train_rows:]

# Model hyperparameters, passed as keyword arguments to CatBoostClassifier.
# NOTE(review): loss_function is 'MultiClass' although the predictions are used
# as a binary score further down (column 1 of predict_proba, scored with
# roc_auc_score) — confirm whether a binary objective such as 'Logloss' was
# intended before changing it, since the other values may be tuned for it.
params = dict(
    iterations=2200,
    learning_rate=0.1019777059686133,
    depth=7,
    eval_metric='AUC',
    loss_function='MultiClass',
    random_seed=42,  # fixed seed for reproducible training
    verbose=500,
)

# Stratified K-fold configuration (shuffled, fixed seed).
n_splits = 8
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Run cross-validation: one model per fold, each scored on its held-out fold.
auc_scores = []
models = []

for train_idx, valid_idx in skf.split(X, y):
    # The splitter yields positional indices, hence iloc.
    X_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]
    X_valid = X.iloc[valid_idx]
    y_valid = y.iloc[valid_idx]

    model = CatBoostClassifier(**params)
    # The held-out fold doubles as the evaluation set for early stopping.
    model.fit(
        X_train,
        y_train,
        early_stopping_rounds=200,
        eval_set=(X_valid, y_valid),
        cat_features=obj_columns,
    )
    models.append(model)

    # Column 1 of predict_proba is used as the score for the AUC.
    y_pred = model.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, y_pred)
    print('-------------------')
    print(f'AUC: {auc}')
    auc_scores.append(auc)

# Report the mean AUC over all folds.
mean_auc = sum(auc_scores) / n_splits
print(f'Mean AUC: {mean_auc}')


# Predict the test data with every fold model (column 1 of predict_proba).
fold_test_preds = [model.predict_proba(X_test)[:, 1] for model in models]

# Average the fold predictions: add them up in fold order, then divide by the
# number of models.
test_preds = fold_test_preds[0]
for fold_pred in fold_test_preds[1:]:
    test_preds += fold_pred
test_preds /= len(models)

# Save the result: one row per test id with the averaged prediction.
submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_preds})
submission.to_csv('submission.csv', index=False)