<a href="https://colab.research.google.com/github/nahyunryou/ML_Project/blob/main/Catboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import warnings
warnings.filterwarnings('ignore')

import glob
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.metrics import log_loss
from sklearn.preprocessing import OneHotEncoder
import random

import optuna
from optuna.samplers import TPESampler

In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#Data Load & Preprocessing
  * 훈련에 필요없는 index 컬럼 삭제.
  * missing value를 모두 NAN 문자열로 대체
  * 범주형 특성 중 일부를 Ordinal하게 바꿈
  * 동일 인물이 시간순서에 따라 Credit이 바뀐다고 가정하였으므로 동일인물을 구분할 주민등록번호 같은 고유코드를 만듬

In [18]:
train = pd.read_csv('/content/drive/MyDrive/ml_project/train.csv')
train = train.drop(['index'], axis=1)
train.fillna('NAN',inplace=True)

test = pd.read_csv('/content/drive/MyDrive/ml_project/test.csv')
test = test.drop(['index'],axis=1)
test.fillna('NAN',inplace=True)

submit = pd.read_csv('/content/drive/MyDrive/ml_project/sample_submission.csv')

In [19]:
edu_dict = {'Lower secondary':1,
 'Secondary / secondary special':2,
 'Incomplete higher':3,
 'Higher education':4,
 'Academic degree':5}

train['edu_type'].replace(edu_dict,inplace=True)
test['edu_type'].replace(edu_dict,inplace=True)

In [20]:
house_dict = {'With parents':3,
 'Co-op apartment':2,
 'Municipal apartment':1,
 'Rented apartment':4,
 'Office apartment':5,
 'House / apartment':6}

train['house_type'].replace(house_dict,inplace=True)
test['house_type'].replace(house_dict,inplace=True)

In [21]:
train['CODE'] = train['gender']+train['DAYS_BIRTH'].apply(str)+train['income_total'].apply(str)+train['income_type'].apply(str)
test['CODE'] = test['gender']+test['DAYS_BIRTH'].apply(str)+test['income_total'].apply(str)+test['income_type'].apply(str)

In [22]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gender         26457 non-null  object 
 1   car            26457 non-null  object 
 2   reality        26457 non-null  object 
 3   child_num      26457 non-null  int64  
 4   income_total   26457 non-null  float64
 5   income_type    26457 non-null  object 
 6   edu_type       26457 non-null  int64  
 7   family_type    26457 non-null  object 
 8   house_type     26457 non-null  int64  
 9   DAYS_BIRTH     26457 non-null  int64  
 10  DAYS_EMPLOYED  26457 non-null  int64  
 11  FLAG_MOBIL     26457 non-null  int64  
 12  work_phone     26457 non-null  int64  
 13  phone          26457 non-null  int64  
 14  email          26457 non-null  int64  
 15  occyp_type     26457 non-null  object 
 16  family_size    26457 non-null  float64
 17  begin_month    26457 non-null  float64
 18  credit

#Training
  * Optuna를 사용하여 best parameter 추출(Optuna를 K-Fold에 바로 적용하고 싶었으나 Colab에서 메모리 한계로 학습이 진행되지 않아 일반 데이터셋을 split하여 best parameter만 추출한 후 그것을 통해 Catboost 모델을 재학습함)
  * 데이터 분리는 StratifiedKFold를 사용하여 y값 분포를 비슷하게 분리시킴 -> 5-fold
  * CatBoostClasffier 사용(Optuna에서 추출한 best parameter 사용)
  * 35번 이상 개선 없을 경우 조기종료
  * 각 5개의 fold를 훈련하여 저장

In [23]:
#Optuna용 Train셋
X = train.drop(['credit'],axis=1)
y = train['credit']
X_test = test.copy()

In [24]:
def objective(trial):
  param = {
      "random_state":42,
      'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 0.3),
      'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
      "n_estimators":trial.suggest_int("n_estimators", 1000, 10000),
      "max_depth":trial.suggest_int("max_depth", 4, 16),
      'random_strength' :trial.suggest_int('random_strength', 0, 100),
      "colsample_bylevel":trial.suggest_float("colsample_bylevel", 0.4, 1.0),
      "l2_leaf_reg":trial.suggest_float("l2_leaf_reg",1e-8,3e-5),
      "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
      "max_bin": trial.suggest_int("max_bin", 200, 500),
      'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
  }

  X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=0.2)
  
  cat_features =[0,1,2,5,6,7,8,15,18]
  cat = CatBoostClassifier(**param)
  cat.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_valid,y_valid)],
          early_stopping_rounds=35,cat_features=cat_features,
          verbose=100)
  cat_pred = cat.predict_proba(X_valid)
  log_score = log_loss(y_valid, cat_pred)

  return log_score

In [25]:
sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name = 'cat_parameter_opt',
    direction = 'minimize',
    sampler = sampler,
)
study.optimize(objective, n_trials=10)
print("Best Score:",study.best_value)
print("Best trial",study.best_trial.params)

[32m[I 2022-11-30 10:15:58,901][0m A new study created in memory with name: cat_parameter_opt[0m


0:	learn: 1.0828369	test: 1.0828369	test1: 1.0830686	best: 1.0830686 (0)	total: 58ms	remaining: 7m 20s
100:	learn: 0.8813354	test: 0.8813354	test1: 0.8875935	best: 0.8875935 (100)	total: 820ms	remaining: 1m


[32m[I 2022-11-30 10:16:00,331][0m Trial 0 finished with value: 0.8875591245663368 and parameters: {'learning_rate': 0.03574712922600244, 'bagging_temperature': 63.512210106407046, 'n_estimators': 7588, 'max_depth': 11, 'random_strength': 15, 'colsample_bylevel': 0.49359671220172163, 'l2_leaf_reg': 1.7519275289243016e-06, 'min_child_samples': 88, 'max_bin': 380, 'od_type': 'IncToDec'}. Best is trial 0 with value: 0.8875591245663368.[0m


Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.8875591246
bestIteration = 116

Shrink model to first 117 iterations.
0:	learn: 0.9741562	test: 0.9741562	test1: 0.9735132	best: 0.9735132 (0)	total: 42.6ms	remaining: 2m 3s
100:	learn: 0.7225655	test: 0.7393259	test1: 0.7978773	best: 0.7977820 (98)	total: 4.9s	remaining: 2m 16s
200:	learn: 0.6427462	test: 0.6788736	test1: 0.7794359	best: 0.7792192 (197)	total: 9.57s	remaining: 2m 9s
300:	learn: 0.5810492	test: 0.6305959	test1: 0.7690978	best: 0.7688184 (294)	total: 14s	remaining: 2m 1s
400:	learn: 0.5023314	test: 0.4527164	test1: 0.7139884	best: 0.7130368 (382)	total: 18.1s	remaining: 1m 53s


[32m[I 2022-11-30 10:16:19,383][0m Trial 1 finished with value: 0.7130368254561391 and parameters: {'learning_rate': 0.2708160864249968, 'bagging_temperature': 21.368329072358772, 'n_estimators': 2911, 'max_depth': 6, 'random_strength': 18, 'colsample_bylevel': 0.5825453457757226, 'l2_leaf_reg': 1.5747445384650815e-05, 'min_child_samples': 46, 'max_bin': 287, 'od_type': 'IncToDec'}. Best is trial 1 with value: 0.7130368254561391.[0m


Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.7130368255
bestIteration = 382

Shrink model to first 383 iterations.
0:	learn: 1.0867188	test: 1.0867529	test1: 1.0864894	best: 1.0864894 (0)	total: 50.2ms	remaining: 4m 16s
100:	learn: 0.8050453	test: 0.7102993	test1: 0.7780996	best: 0.7780996 (100)	total: 11.6s	remaining: 9m 34s
200:	learn: 0.7572037	test: 0.5917983	test1: 0.7215075	best: 0.7215075 (200)	total: 25.8s	remaining: 10m 29s
300:	learn: 0.7362899	test: 0.5482681	test1: 0.7047416	best: 0.7047416 (300)	total: 46.5s	remaining: 12m 22s
400:	learn: 0.7152266	test: 0.5160743	test1: 0.6901818	best: 0.6901818 (400)	total: 1m 9s	remaining: 13m 35s
500:	learn: 0.6109001	test: 0.4786337	test1: 0.6759109	best: 0.6755877 (496)	total: 2m 9s	remaining: 19m 49s
Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.6747371939
bestIteration = 528

Shrink model to first 529 iterations.


[32m[I 2022-11-30 10:19:33,551][0m Trial 2 finished with value: 0.6747371939040755 and parameters: {'learning_rate': 0.027010527749605478, 'bagging_temperature': 0.2920433847181412, 'n_estimators': 5105, 'max_depth': 14, 'random_strength': 20, 'colsample_bylevel': 0.708540663048167, 'l2_leaf_reg': 1.7776512920172654e-05, 'min_child_samples': 9, 'max_bin': 382, 'od_type': 'IncToDec'}. Best is trial 2 with value: 0.6747371939040755.[0m


0:	learn: 0.9992323	test: 0.9992323	test1: 1.0030961	best: 1.0030961 (0)	total: 10.7ms	remaining: 1m 28s


[32m[I 2022-11-30 10:19:34,161][0m Trial 3 finished with value: 0.8931389702534419 and parameters: {'learning_rate': 0.2521267904777921, 'bagging_temperature': 72.86653737491046, 'n_estimators': 8276, 'max_depth': 7, 'random_strength': 9, 'colsample_bylevel': 0.8105398159072941, 'l2_leaf_reg': 1.3210173287250643e-05, 'min_child_samples': 16, 'max_bin': 349, 'od_type': 'Iter'}. Best is trial 2 with value: 0.6747371939040755.[0m


Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.8931389703
bestIteration = 12

Shrink model to first 13 iterations.
0:	learn: 1.0877374	test: 1.0878935	test1: 1.0877731	best: 1.0877731 (0)	total: 71ms	remaining: 4m 30s
100:	learn: 0.7667518	test: 0.6476960	test1: 0.7438608	best: 0.7438608 (100)	total: 6.77s	remaining: 4m 8s
200:	learn: 0.7210036	test: 0.5603584	test1: 0.7050759	best: 0.7050759 (200)	total: 14.2s	remaining: 4m 14s
300:	learn: 0.6926311	test: 0.5193043	test1: 0.6925235	best: 0.6925235 (300)	total: 22.8s	remaining: 4m 24s
400:	learn: 0.6658337	test: 0.4982845	test1: 0.6873335	best: 0.6873225 (395)	total: 30.8s	remaining: 4m 21s
500:	learn: 0.6087089	test: 0.4772132	test1: 0.6847708	best: 0.6846024 (477)	total: 41.8s	remaining: 4m 35s


[32m[I 2022-11-30 10:20:22,835][0m Trial 4 finished with value: 0.6844135834612116 and parameters: {'learning_rate': 0.024112898115291985, 'bagging_temperature': 4.467752817973908, 'n_estimators': 3805, 'max_depth': 10, 'random_strength': 55, 'colsample_bylevel': 0.5109126733153162, 'l2_leaf_reg': 2.9087842986659113e-05, 'min_child_samples': 79, 'max_bin': 482, 'od_type': 'IncToDec'}. Best is trial 2 with value: 0.6747371939040755.[0m


Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.6844135835
bestIteration = 525

Shrink model to first 526 iterations.
0:	learn: 1.0068801	test: 1.0068801	test1: 1.0095365	best: 1.0095365 (0)	total: 16.2ms	remaining: 44.6s
100:	learn: 0.7148489	test: 0.4996758	test1: 0.6866187	best: 0.6755712 (76)	total: 2.47s	remaining: 1m 5s


[32m[I 2022-11-30 10:20:25,862][0m Trial 5 finished with value: 0.6755711975928083 and parameters: {'learning_rate': 0.22999586428143728, 'bagging_temperature': 0.022592797420156956, 'n_estimators': 2764, 'max_depth': 4, 'random_strength': 32, 'colsample_bylevel': 0.6332063738136893, 'l2_leaf_reg': 8.147757462899138e-06, 'min_child_samples': 84, 'max_bin': 307, 'od_type': 'Iter'}. Best is trial 2 with value: 0.6747371939040755.[0m


Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.6755711976
bestIteration = 76

Shrink model to first 77 iterations.
0:	learn: 1.0865877	test: 1.0865877	test1: 1.0892923	best: 1.0892923 (0)	total: 2.72s	remaining: 1h 15m 44s
100:	learn: 0.5328258	test: 0.5866714	test1: 0.7921598	best: 0.7921598 (100)	total: 4m 30s	remaining: 1h 10m 9s
200:	learn: 0.3504514	test: 0.4538978	test1: 0.7692114	best: 0.7687087 (178)	total: 9m 10s	remaining: 1h 7m 6s
Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.7687086611
bestIteration = 178

Shrink model to first 179 iterations.


[32m[I 2022-11-30 10:30:13,200][0m Trial 6 finished with value: 0.7687086611234646 and parameters: {'learning_rate': 0.016149614799999188, 'bagging_temperature': 16.172900811143155, 'n_estimators': 1671, 'max_depth': 16, 'random_strength': 77, 'colsample_bylevel': 0.5192294089205034, 'l2_leaf_reg': 1.7560829253683595e-07, 'min_child_samples': 83, 'max_bin': 412, 'od_type': 'Iter'}. Best is trial 2 with value: 0.6747371939040755.[0m


0:	learn: 1.0928729	test: 1.0928729	test1: 1.0929246	best: 1.0929246 (0)	total: 15.3ms	remaining: 31.2s
100:	learn: 0.8710061	test: 0.8224832	test1: 0.8623075	best: 0.8623075 (100)	total: 14.1s	remaining: 4m 30s
200:	learn: 0.8286465	test: 0.7384830	test1: 0.8119751	best: 0.8119751 (200)	total: 28.4s	remaining: 4m 20s
300:	learn: 0.8018515	test: 0.6680194	test1: 0.7791920	best: 0.7791920 (300)	total: 56.6s	remaining: 5m 27s
400:	learn: 0.7869539	test: 0.6319669	test1: 0.7611875	best: 0.7611875 (400)	total: 1m 9s	remaining: 4m 44s
500:	learn: 0.7788715	test: 0.6079239	test1: 0.7522653	best: 0.7522653 (500)	total: 1m 27s	remaining: 4m 29s
600:	learn: 0.7680890	test: 0.5813565	test1: 0.7416871	best: 0.7416871 (600)	total: 1m 48s	remaining: 4m 20s
700:	learn: 0.7591783	test: 0.5638031	test1: 0.7337279	best: 0.7337279 (700)	total: 2m 14s	remaining: 4m 16s
800:	learn: 0.7501072	test: 0.5469933	test1: 0.7247489	best: 0.7247489 (800)	total: 2m 31s	remaining: 3m 54s
900:	learn: 0.7313032	test: 

[32m[I 2022-11-30 10:38:13,533][0m Trial 7 finished with value: 0.6763703233450424 and parameters: {'learning_rate': 0.012863908101989912, 'bagging_temperature': 0.27155819552829413, 'n_estimators': 2042, 'max_depth': 15, 'random_strength': 62, 'colsample_bylevel': 0.5985388149115896, 'l2_leaf_reg': 1.9161149250778487e-06, 'min_child_samples': 34, 'max_bin': 297, 'od_type': 'IncToDec'}. Best is trial 2 with value: 0.6747371939040755.[0m


0:	learn: 1.0159494	test: 1.0159524	test1: 1.0180829	best: 1.0180829 (0)	total: 30.1ms	remaining: 1m 2s
100:	learn: 0.3346389	test: 0.4361284	test1: 0.7394030	best: 0.6993701 (72)	total: 26.1s	remaining: 8m 31s


[32m[I 2022-11-30 10:38:44,010][0m Trial 8 finished with value: 0.6993700589281642 and parameters: {'learning_rate': 0.20441878352493792, 'bagging_temperature': 0.7742116473996251, 'n_estimators': 2076, 'max_depth': 13, 'random_strength': 76, 'colsample_bylevel': 0.7367663185416977, 'l2_leaf_reg': 2.3131305726837285e-05, 'min_child_samples': 52, 'max_bin': 357, 'od_type': 'IncToDec'}. Best is trial 2 with value: 0.6747371939040755.[0m


Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.6993700589
bestIteration = 72

Shrink model to first 73 iterations.
0:	learn: 1.0921013	test: 1.0921450	test1: 1.0922042	best: 1.0922042 (0)	total: 122ms	remaining: 13m 40s
100:	learn: 0.8688952	test: 0.8262831	test1: 0.8630202	best: 0.8630202 (100)	total: 4.28s	remaining: 4m 40s
200:	learn: 0.8293842	test: 0.7404107	test1: 0.8140955	best: 0.8140955 (200)	total: 8.48s	remaining: 4m 35s
300:	learn: 0.8061092	test: 0.6751043	test1: 0.7827051	best: 0.7827051 (300)	total: 12.8s	remaining: 4m 32s
400:	learn: 0.7905250	test: 0.6260303	test1: 0.7620892	best: 0.7620890 (399)	total: 18s	remaining: 4m 44s
500:	learn: 0.7799512	test: 0.5969967	test1: 0.7499649	best: 0.7499649 (500)	total: 23.3s	remaining: 4m 48s
600:	learn: 0.7713141	test: 0.5797212	test1: 0.7413908	best: 0.7413908 (600)	total: 28.5s	remaining: 4m 51s
700:	learn: 0.7588057	test: 0.5565381	test1: 0.7286432	best: 0.7286432 (700)	total: 33.5s	remaining: 4m 47s
800:	

[32m[I 2022-11-30 10:40:18,501][0m Trial 9 finished with value: 0.6803320350932119 and parameters: {'learning_rate': 0.01443340240633889, 'bagging_temperature': 0.0133572404119741, 'n_estimators': 6728, 'max_depth': 8, 'random_strength': 51, 'colsample_bylevel': 0.9445398843556558, 'l2_leaf_reg': 7.486273952174759e-06, 'min_child_samples': 44, 'max_bin': 427, 'od_type': 'IncToDec'}. Best is trial 2 with value: 0.6747371939040755.[0m


Best Score: 0.6747371939040755
Best trial {'learning_rate': 0.027010527749605478, 'bagging_temperature': 0.2920433847181412, 'n_estimators': 5105, 'max_depth': 14, 'random_strength': 20, 'colsample_bylevel': 0.708540663048167, 'l2_leaf_reg': 1.7776512920172654e-05, 'min_child_samples': 9, 'max_bin': 382, 'od_type': 'IncToDec'}


In [26]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds = []
for train_idx, valid_idx in skf.split(train, train['credit']):
  folds.append((train_idx,valid_idx))

In [27]:
random.seed(42)
cat_models={}

cat_features =[0,1,2,5,6,7,8,15,18]

for fold in range(5):
  print(f'===================================={fold+1}============================================')
  train_idx, valid_idx = folds[fold]
  X_train = train.drop(['credit'],axis=1).iloc[train_idx].values 
  X_valid = train.drop(['credit'],axis=1).iloc[valid_idx].values
  y_train = train['credit'][train_idx].values
  y_valid = train['credit'][valid_idx].values

  cat = CatBoostClassifier(**study.best_params)
  cat.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_valid,y_valid)],
          early_stopping_rounds=35,cat_features=cat_features,
          verbose=100)
  cat_models[fold] = cat
  print(f'================================================================================\n\n')

0:	learn: 1.0867045	test: 1.0867045	test1: 1.0867133	best: 1.0867133 (0)	total: 9.63ms	remaining: 49.1s
100:	learn: 0.8209018	test: 0.7588055	test1: 0.8078730	best: 0.8078730 (100)	total: 13.5s	remaining: 11m 10s
200:	learn: 0.7655642	test: 0.6160920	test1: 0.7356912	best: 0.7356912 (200)	total: 30.4s	remaining: 12m 21s
300:	learn: 0.7428103	test: 0.5657018	test1: 0.7094751	best: 0.7094751 (300)	total: 47.8s	remaining: 12m 42s
400:	learn: 0.7220111	test: 0.5323016	test1: 0.6913722	best: 0.6913722 (399)	total: 1m 7s	remaining: 13m 7s
500:	learn: 0.6233379	test: 0.4886209	test1: 0.6713989	best: 0.6713989 (500)	total: 2m 5s	remaining: 19m 10s
Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.6673257765
bestIteration = 556

Shrink model to first 557 iterations.


0:	learn: 1.0867045	test: 1.0867045	test1: 1.0867133	best: 1.0867133 (0)	total: 11.5ms	remaining: 58.7s
100:	learn: 0.8109825	test: 0.7379882	test1: 0.7993746	best: 0.7993746 (100)	total: 15.4s	remaining: 12m 41s

#Test inference
  * 각 fold를 훈련시킨 catboost model로 predict
  * 해당 대회는 logloss score를 겨루는 것이기 때문에 각  class의 probability를 얻어야함
  * 대부분의 머신러닝 모델에서 predict, predict_proba를 구분하여 사용함
  * predict는 class 출력을 해주고 predict_probal는 class별 probability를 출력해줌
  * predict_proba를 사용하여 예측한 것을 5-fold 더하여 평균내어 앙상블함

In [28]:
submit.iloc[:,1:]=0
for fold in range(5):
  submit.iloc[:,1:] += cat_models[fold].predict_proba(test)/5

In [31]:
import datetime
now = datetime.datetime.now()
created_time = now.strftime('%m%d-%H%M')

submit.to_csv(f'/content/drive/MyDrive/ml_project/{created_time}_submit.csv',index=False)

In [32]:
submit.head(20)

Unnamed: 0,index,0,1,2
0,26457,0.113632,0.153429,0.732939
1,26458,0.260412,0.257944,0.481644
2,26459,0.034347,0.086973,0.87868
3,26460,0.060618,0.084505,0.854877
4,26461,0.092354,0.208302,0.699344
5,26462,0.046762,0.149603,0.803635
6,26463,0.592821,0.383685,0.023494
7,26464,0.058617,0.108954,0.832429
8,26465,0.056959,0.10509,0.837951
9,26466,0.059942,0.306479,0.63358
