# Import

In [1]:
import pandas as pd
import numpy as np
import os
import random

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from catboost import Pool, CatBoostRegressor, cv

import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from optuna.integration.mlflow import MLflowCallback

import warnings
warnings.filterwarnings('ignore')

# Seed hold

In [2]:
def seed_all(seed):

    print("Using Seed Number {}".format(seed))

    os.environ["PYTHONHASHSEED"] = str(seed)  
    np.random.seed(seed) 
    random.seed(seed)
    
seed = 1215
seed_all(seed=seed)

Using Seed Number 1215


# Data load

In [3]:
def load_data(df):

    # make month & day
    df["month"] = df["base_date"].apply(lambda x : str(x)[4:6])
    df["day"] = df["base_date"].apply(lambda x : str(x)[6:8])

    # set category
    globals()['cat_features'] = ['base_date', 'day_of_week', 'base_hour', 'lane_count', 'road_rating', 
                'multi_linked', 'connect_code', 'maximum_speed_limit', 'weight_restricted', 
                'height_restricted', 'road_type', 'start_turn_restricted',
                'end_turn_restricted', 'holidays', 'month', 'day', 'road_name', 'start_node_name', 'end_node_name']

    # set numeric          
    globals()['numerial_features'] = ['start_latitude', 'start_longitude', 'end_latitude', 'end_longitude', '제주 강수량', 
                        '제주 적설', '고산 강수량', '고산 적설', '성산 강수량', '성산 적설', 
                        '서귀포 강수량', '서귀포 적설',]

    # to datetime
    df['시간'] = pd.to_datetime(df['시간'])

    # cat feature to string
    for col in cat_features:
        df[col] = df[col].astype(str)
    
    return df

train = load_data(pd.read_csv('train_for_cbr.csv'))
test = load_data(pd.read_csv('test_for_cbr.csv'))

# seperate X, y
X_train = train.drop(['target'], axis=1)
y_train = train['target']

# CBR Optuna

## hyperparameter search

hyper parameter 찾고 feature importance로 feature 버려보기, engineering

### using cross validation(데이터의 양이 많아 너무 많은 시간 소요)

#### make pool

In [4]:
# make Pool
train_pool = Pool(X_train, y_train, cat_features=cat_features, timestamp=X_train['시간'])
test_pool = Pool(test, cat_features=cat_features, timestamp=test['시간'])

#### search

In [14]:
class Objective(object):
    def __init__(self, pool):
        self.pool = pool

    def __call__(self, trial):
        pool = self.pool
        
        CBR_params = {# prefix
                 'task_type' : 'GPU',
                 'random_seed' : seed,
                 'iterations' : 3000,
                 'early_stopping_rounds' : 50,

                 # found
                 # 'loss_function' : trial.suggest_categorical("loss_function", ['MAPE', 'Poisson', 'RMSE']),
                 'loss_function' : 'RMSE',

                 # trial
                 'learning_rate' : trial.suggest_uniform("learning_rate", 1e-4, 3e-1),
                 'depth' : trial.suggest_int("depth", 2, 16),
                 'l2_leaf_reg' : trial.suggest_float("l2_leaf_reg", 1e-5, 5e-1, log=True),
                 }
        
        cv_result = cv(pool=pool,
                       params=CBR_params,
                       nfold=5,
                       verbose=False,)
        
        rmsle = cv_result["test-RMSE-mean"].min()
        
        return rmsle

CBR_objective = Objective(train_pool)

# random sampler
sampler = TPESampler(seed=seed)

CBR_study = optuna.create_study(
    direction="minimize",
    pruner=optuna.pruners.HyperbandPruner(max_resource="auto"),
    sampler=sampler
    )

[32m[I 2022-10-14 04:14:47,086][0m A new study created in memory with name: CBR0[0m


In [15]:
def make_mlflow_callback():
    cb = MLflowCallback(
        tracking_uri="mlruns",
        metric_name="RMSLE"
    )
    return cb

mlflow_cb = make_mlflow_callback()
CBR_study.optimize(CBR_objective, n_trials=30, callbacks=[mlflow_cb])

# data 폴더로 가서 mlflow ui 실행
# http://localhost:5000/ 로 접속

Training on fold [0/5]
bestTest = 4.888740913
bestIteration = 2999
Training on fold [1/5]
bestTest = 4.916484654
bestIteration = 2998
Training on fold [2/5]
bestTest = 4.980818262
bestIteration = 2999
Training on fold [3/5]
bestTest = 4.970064378
bestIteration = 2999
Training on fold [4/5]


[32m[I 2022-10-14 06:40:59,890][0m Trial 0 finished with value: 4.942627197981751 and parameters: {'learning_rate': 0.1344064063808722, 'depth': 4, 'l2_leaf_reg': 0.0008103931299344912}. Best is trial 0 with value: 4.942627197981751.[0m
2022/10/14 06:40:59 INFO mlflow.tracking.fluent: Experiment with name 'CBR0' does not exist. Creating a new experiment.


bestTest = 4.957027783
bestIteration = 2999
Training on fold [0/5]


[33m[W 2022-10-14 06:41:21,614][0m Trial 1 failed because of the following error: CatBoostError("C:/Program Files (x86)/Go Agent/pipelines/BuildMaster/catboost.git/catboost/cuda/methods/boosting_progress_tracker.cpp:171: Saved model's params are different from current model's params")[0m
Traceback (most recent call last):
  File "c:\Users\nehcr\miniconda3\envs\ml\lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\nehcr\AppData\Local\Temp\ipykernel_23356\3352166716.py", line 24, in __call__
    cv_result = cv(pool=pool,
  File "c:\Users\nehcr\miniconda3\envs\ml\lib\site-packages\catboost\core.py", line 6650, in cv
    return _cv(params, pool, fold_count, inverted, partition_random_seed, shuffle, stratified,
  File "_catboost.pyx", line 5494, in _catboost._cv
  File "_catboost.pyx", line 5526, in _catboost._cv
_catboost.CatBoostError: C:/Program Files (x86)/Go Agent/pipelines/BuildMaster/catboost.git/catboost/cuda/m

CatBoostError: C:/Program Files (x86)/Go Agent/pipelines/BuildMaster/catboost.git/catboost/cuda/methods/boosting_progress_tracker.cpp:171: Saved model's params are different from current model's params

In [None]:
CBR_study.best_params

{'loss_function': 'RMSE',
 'learning_rate': 0.29668232997002686,
 'l2_leaf_reg': 32.44296096601971,
 'use_best_model': False,
 'depth': 16}

### using validation set

#### make pool

In [4]:
# train test split
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=seed)

# make Pool
train_pool = Pool(X_train, y_train, cat_features=cat_features, timestamp=X_train['시간'])
valid_pool = Pool(X_valid, y_valid, cat_features=cat_features, timestamp=X_valid['시간'])
test_pool = Pool(test, cat_features=cat_features, timestamp=test['시간'])

#### search

In [5]:
class Objective(object):
    def __init__(self, pool):
        self.pool = pool

    def __call__(self, trial):
        pool = self.pool
        
        CBR_params = {# prefix
                     'task_type' : 'GPU',
                     'random_seed' : seed,

                     # found
                     # 'loss_function' : trial.suggest_categorical('loss_function', ['MAPE', 'Poisson', 'RMSE']),
                     'loss_function' : 'RMSE',
                     # 'iterations' : trial.suggest_int('iterations', 1000, 10000),
                     'iterations' : 3000,
                     # 'max_depth' : trial.suggest_int("max_depth", 2, 16),
                     'depth' : 16,
                     # 'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 0.3),
                     'learning_rate' : 0.05041997866257283,
                     # "l2_leaf_reg" : trial.suggest_float("l2_leaf_reg", 1e-8, 3e-5),
                     'l2_leaf_reg' : 3.354771769479418e-05,

                     # trial
                     'random_strength' :trial.suggest_int('random_strength', 0, 100),
                     'bagging_temperature' : trial.suggest_float("bagging_temperature", 0.01, 100.00),
                     # 'min_child_samples' : trial.suggest_int('min_child_samples', 5, 100),
                     # 'max_bin': trial.suggest_int('max_bin', 200, 500),
                      }

        # Generate model
        model = CatBoostRegressor(**CBR_params)
        model.fit(train_pool, eval_set=valid_pool, verbose=False, metric_period=12, early_stopping_rounds=24)

        # 평가지표 원하는 평가 지표가 있을 시 바꾸어 준다.
        rmse = model.get_best_score()["validation"]["RMSE"]
        return rmse

CBR_objective = Objective(train_pool)

# random sampler
sampler = TPESampler(seed=seed)

CBR_study = optuna.create_study(direction="minimize", pruner=optuna.pruners.HyperbandPruner(max_resource="auto"), sampler=sampler)

[32m[I 2022-10-15 14:23:40,897][0m A new study created in memory with name: no-name-85f97f37-6c47-4554-a265-a86332dd3f73[0m


In [6]:
def make_mlflow_callback():
    cb = MLflowCallback(
        tracking_uri="mlruns",
        metric_name="RMSLE"
    )
    return cb

mlflow_cb = make_mlflow_callback()
CBR_study.optimize(CBR_objective, n_trials=10, callbacks=[mlflow_cb])

# data 폴더로 가서 mlflow ui 실행
# http://localhost:5000/ 로 접속

[33m[W 2022-10-15 14:33:51,974][0m Trial 0 failed because of the following error: KeyboardInterrupt('')[0m
Traceback (most recent call last):
  File "c:\Users\nehcr\miniconda3\envs\ml\lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\nehcr\AppData\Local\Temp\ipykernel_20268\1875062269.py", line 33, in __call__
    model.fit(train_pool, eval_set=valid_pool, verbose=False, metric_period=12, early_stopping_rounds=24)
  File "c:\Users\nehcr\miniconda3\envs\ml\lib\site-packages\catboost\core.py", line 5730, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline,
  File "c:\Users\nehcr\miniconda3\envs\ml\lib\site-packages\catboost\core.py", line 2355, in _fit
    self._train(
  File "c:\Users\nehcr\miniconda3\envs\ml\lib\site-packages\catboost\core.py", line 1759, in _train
    self._object._train(train_pool, test_pool, params, allow_clea

KeyboardInterrupt: 

In [None]:
CBR_study.best_params

{'learning_rate': 0.05041997866257283,
 'depth': 16,
 'l2_leaf_reg': 3.354771769479418e-05}

#### visualization

In [None]:
# 시각화
optuna.visualization.plot_optimization_history(CBR_study)

In [None]:
# 파라미터들관의 관계
optuna.visualization.plot_parallel_coordinate(CBR_study)

In [None]:
# 하이퍼파라미터 중요도
optuna.visualization.plot_param_importances(CBR_study)

## fit & predict CV

In [None]:
# remake train
X_train = train.drop(['target'], axis=1)
y_train = train['target']

cv = KFold(n_splits=5, shuffle=True, random_state=seed)

scores = []
models = []

is_holdout = False
for tri, vai in cv.split(X_train):
    print("="*50)
    #Create an instance with tuned hyperparameters
    model = CatBoostRegressor(# prefix
                              task_type = 'GPU',
                              random_seed = seed,

                              # found
                              loss_function = 'RMSE',
                              iterations = 5000,
                              depth = 16,
                              learning_rate = 0.05041997866257283,
                              l2_leaf_reg = 3.354771769479418e-05,
                              
                              # trial
                              random_strength = CBR_study.best_params['random_strength'],
                              bagging_temperature = CBR_study.best_params['bagging_temperature'],
                              )
    # make pool
    train_pool_for_cv = Pool(X_train.iloc[tri], y_train.iloc[tri], cat_features=cat_features, timestamp=X_train.iloc[tri]['시간'])
    valid_pool_for_cv = Pool(X_train.iloc[vai], y_train.iloc[vai], cat_features=cat_features, timestamp=X_train.iloc[vai]['시간'])

    model.fit(train_pool_for_cv, eval_set=valid_pool_for_cv, verbose=96, metric_period=12, early_stopping_rounds=24, cat_features=cat_features)
    
    models.append(model)
    scores.append(model.get_best_score()["validation"]["RMSE"])
    
    if is_holdout:
        break    



CatBoostError: bad allocation

## CV result

In [None]:
print(scores)
print(np.mean(scores))

[4.286733894665547, 4.257945545899376, 4.261384909421786]
4.268688116662236


# Predict

In [21]:
pred_list = []

for model in models:
    pred_list.append(model.predict(test_pool))

pred = np.mean(pred_list, axis=0)

# 결과 저장
sample_submission = pd.read_csv('./sample_submission.csv')
sample_submission['target'] = pred
sample_submission.to_csv("./submit.csv", index = False)

sample_submission

Unnamed: 0,id,target
0,TEST_000000,25.987267
1,TEST_000001,42.851339
2,TEST_000002,65.389003
3,TEST_000003,38.053313
4,TEST_000004,42.750778
...,...,...
291236,TEST_291236,48.468274
291237,TEST_291237,50.976631
291238,TEST_291238,21.946309
291239,TEST_291239,22.809484


# feature importance

In [25]:
dict(zip(X_train.columns, models[0].get_feature_importance()))

array([3.82787449e+00, 4.11971650e+00, 1.51434580e+01, 3.11903761e+00,
       1.77136321e+00, 6.03743555e+00, 8.09366131e-02, 1.11399567e-02,
       5.45658015e+00, 1.63895670e+00, 0.00000000e+00, 3.45061333e-01,
       8.99269635e+00, 3.91613743e+00, 6.22290830e+00, 3.70155939e-01,
       1.09834517e+01, 5.30380825e+00, 6.54581500e+00, 2.64585279e-01,
       5.37592476e+00, 3.29397531e-01, 3.90144092e-01, 3.09327456e-01,
       1.13840575e-01, 2.62375365e-01, 1.48617780e-01, 2.62014620e-01,
       1.37743072e-01, 1.35904753e+00, 3.32228024e+00, 3.83816861e+00])