# 1. 라이브러리

In [1]:
import pandas as pd
import numpy as np

from pycaret.regression import *

from catboost import CatBoostRegressor
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, StratifiedKFold

# 2. 데이터 로드

In [5]:
train = pd.read_csv('data/train0723(5).3.csv')
test = pd.read_csv('data/test0723.fi.csv')
train = train.set_index("code")
test = test.set_index("code")

# 3. 모델링

In [13]:
X = train.drop(columns = ['target'])
y = train['target']

- 파라미터 튜닝

In [14]:
def objective(trial: Trial) -> float:
    params_cat = {
        "random_state": 42,
        "learning_rate": 0.05,
        "n_estimators": 10000,
        "verbose" : 1,
        "objective" : "MAE",
        "max_depth": trial.suggest_int("max_depth", 1, 16),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.8, 1.0),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
    }
    
    X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2)

    model = CatBoostRegressor(**params_cat)
    model.fit(
        X_tr,
        y_tr,
        eval_set=[(X_tr, y_tr), (X_val, y_val)],
        early_stopping_rounds=10,
        verbose=False,
    )

    cat_pred = model.predict(X_val)
    log_score = mean_absolute_error(y_val, cat_pred)
    
    return log_score

In [15]:
sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name="cat_opt",
    direction="minimize",
    sampler=sampler,
)
study.optimize(objective, n_trials=10)
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

[32m[I 2021-07-25 22:57:23,822][0m A new study created in memory with name: cat_opt[0m
[32m[I 2021-07-25 22:57:24,205][0m Trial 0 finished with value: 107.7370610808347 and parameters: {'max_depth': 6, 'colsample_bylevel': 0.9901428612819833, 'subsample': 0.8123957592679836, 'min_child_samples': 62, 'max_bin': 246}. Best is trial 0 with value: 107.7370610808347.[0m
[32m[I 2021-07-25 22:57:24,431][0m Trial 1 finished with value: 104.71779739849916 and parameters: {'max_depth': 3, 'colsample_bylevel': 0.8116167224336399, 'subsample': 0.9063233020424546, 'min_child_samples': 62, 'max_bin': 413}. Best is trial 1 with value: 104.71779739849916.[0m
[32m[I 2021-07-25 22:57:24,526][0m Trial 2 finished with value: 149.0360384399287 and parameters: {'max_depth': 1, 'colsample_bylevel': 0.9939819704323989, 'subsample': 0.8827098485602951, 'min_child_samples': 25, 'max_bin': 254}. Best is trial 1 with value: 104.71779739849916.[0m
[32m[I 2021-07-25 22:57:24,727][0m Trial 3 finished w

Best Score: 104.71779739849916
Best trial: {'max_depth': 3, 'colsample_bylevel': 0.8116167224336399, 'subsample': 0.9063233020424546, 'min_child_samples': 62, 'max_bin': 413}


In [16]:
cat_p = study.best_trial.params
cat = CatBoostRegressor(**cat_p)

- StratifiedK-Fold for Regression

In [17]:
y_cat = pd.cut(y, 10, labels=range(10))
skf = StratifiedKFold(5)

preds = []
for tr_id, val_id in skf.split(X, y_cat) : 
    X_tr = X.iloc[tr_id]
    y_tr = y.iloc[tr_id]
    
    cat.fit(X_tr, y_tr, verbose = 0)
    
    pred = cat.predict(test)
    preds.append(pred)
cat_pred = np.mean(preds, axis = 0)

In [18]:
sample = pd.read_csv('data/sample_submission.csv')
sample['num'] = cat_pred
sample.to_csv('sub/cat0725.csv', index=False)

In [20]:
sample

Unnamed: 0,code,num
0,C1072,748.718583
1,C1128,1245.517642
2,C1456,516.457748
3,C1840,510.716498
4,C1332,1158.628067
...,...,...
145,C2456,218.693726
146,C1266,425.015487
147,C2152,60.238010
148,C1267,345.978692
