# Setup

In [1]:
!pip install catboost



In [2]:
!pip install optuna




In [3]:
import os

import numpy as np
import optuna
import pandas as pd
from catboost import CatBoostRegressor
from optuna.samplers import TPESampler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Load Data

In [4]:
# Directory holding the competition CSV files. Override it by setting the
# JEJU_DATA_DIR environment variable; the default is the original location,
# so existing runs keep reading exactly the same files.
DATA_DIR = os.environ.get('JEJU_DATA_DIR', '/Users/namwoo/Desktop/jeju')

# Paths to the submission template and the preprocessed train/test tables.
df_submission_path = os.path.join(DATA_DIR, 'sample_submission.csv')
df_train_path = os.path.join(DATA_DIR, 'preprocessed_df_train_26.csv')
df_test_path = os.path.join(DATA_DIR, 'preprocessed_df_test_26.csv')

In [5]:
# Read the preprocessed training and test tables (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (df_train_path, df_test_path)
)

# Preprocessing

In [6]:
# The regression target is the 'target' column; the feature matrix is every
# remaining column except 'base_date'.
y = df_train['target']
X = df_train.drop(['target', 'base_date'], axis=1)

In [7]:
# Columns stored with the "object" dtype are the ones later passed to CatBoost
# as categorical features.
is_object_column = X.dtypes == "object"
cat_cols = is_object_column[is_object_column].index.tolist()
print("Number of Categorical features: ", len(cat_cols))

Number of Categorical features:  36


In [8]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
split_options = {"test_size": 0.1, "random_state": 2022}
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

# Optuna

In [9]:
# Passed to CatBoost's fit() as `early_stopping_rounds` for every trial.
EARLY_STOPPING_ROUND = 100


def objective(trial):
    """Train one CatBoost candidate and return its validation RMSE.

    Optuna calls this once per trial. The hyper-parameters are sampled from
    the trial, a CatBoostRegressor is fit on (X_train, y_train) with
    (X_valid, y_valid) as the evaluation set, and the root mean squared error
    on the validation set is returned.

    Args:
        trial: the optuna trial used to sample hyper-parameters.

    Returns:
        The RMSE of the fitted model on the validation split (lower is better).
    """
    # The order of the suggest_* calls is kept stable so a seeded sampler
    # draws the same sequence of values as before.
    param = {
        "n_estimators": 1000,
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        # and samples from the same log-uniform distribution.
        "learning_rate": trial.suggest_float("learning_rate", 0.1, 0.2, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.01, 100.00, log=True),
        "max_depth": trial.suggest_int("max_depth", 4, 16),
        "random_strength": trial.suggest_int("random_strength", 0, 100),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.4, 1.0),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 3e-5),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 100, 500),
    }

    regressor = CatBoostRegressor(**param,
                                  task_type='CPU',
                                  loss_function='RMSE',
                                  use_best_model=True)

    regressor.fit(X_train.copy(), y_train.copy(),
                  eval_set=[(X_valid.copy(), y_valid.copy())],
                  cat_features=cat_cols,
                  early_stopping_rounds=EARLY_STOPPING_ROUND,
                  verbose=10)

    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated and absent from newer scikit-learn.
    valid_predictions = regressor.predict(X_valid.copy())
    loss = float(np.sqrt(mean_squared_error(y_valid, valid_predictions)))

    return loss

In [10]:
# Seeded TPE sampler so the sequence of sampled hyper-parameters is repeatable.
sampler = TPESampler(seed=2022)

# In-memory study; `direction='minimize'` is Optuna's default, spelled out here
# because the objective returns an error metric.
study = optuna.create_study(
    study_name='catboost',
    direction='minimize',
    sampler=sampler,
)

study.optimize(objective, n_trials=30)

# Report the best validation score and the hyper-parameters that produced it.
print(study.best_value)
print(study.best_params)

[32m[I 2022-10-22 23:06:28,499][0m A new study created in memory with name: catboost[0m


0:	learn: 14.7709513	test: 14.7448715	best: 14.7448715 (0)	total: 6.14s	remaining: 1h 42m 15s
10:	learn: 8.6817121	test: 8.7031852	best: 8.7031852 (10)	total: 38.5s	remaining: 57m 43s
20:	learn: 7.1215323	test: 7.1575731	best: 7.1575731 (20)	total: 1m 10s	remaining: 55m 6s
30:	learn: 6.7278783	test: 6.7431912	best: 6.7431912 (30)	total: 1m 45s	remaining: 54m 47s
40:	learn: 6.5539809	test: 6.5507704	best: 6.5507704 (40)	total: 2m 23s	remaining: 55m 52s
50:	learn: 6.4521423	test: 6.4393387	best: 6.4393387 (50)	total: 3m 4s	remaining: 57m 16s
60:	learn: 6.3828520	test: 6.3704936	best: 6.3704936 (60)	total: 3m 39s	remaining: 56m 13s
70:	learn: 6.3268218	test: 6.3128484	best: 6.3128484 (70)	total: 4m 15s	remaining: 55m 37s
80:	learn: 6.2640672	test: 6.2462690	best: 6.2462690 (80)	total: 4m 52s	remaining: 55m 20s
90:	learn: 6.2114049	test: 6.1909813	best: 6.1909813 (90)	total: 5m 28s	remaining: 54m 36s
100:	learn: 6.1739952	test: 6.1506509	best: 6.1506509 (100)	total: 6m 4s	remaining: 54m
11

KeyboardInterrupt: 

In [None]:
# Fit the final model with the best hyper-parameters found by the study.
# `param` only exists inside objective(), so referencing it here raised a
# NameError; the tuned values are read from `study.best_params` instead.
# n_estimators was fixed at 1000 inside objective() and is therefore not part
# of best_params, so it is supplied explicitly.
# NOTE(review): the study tuned with loss_function='RMSE' while this fit uses
# 'MAE' — confirm that the mismatch is intentional.
final_params = {"n_estimators": 1000, **study.best_params}

regressor = CatBoostRegressor(**final_params,
                              task_type='CPU',
                              loss_function='MAE',
                              use_best_model=True)

regressor.fit(X_train.copy(), y_train.copy(),
              eval_set=[(X_valid.copy(), y_valid.copy())],
              cat_features=cat_cols,
              early_stopping_rounds=EARLY_STOPPING_ROUND,
              verbose=10)

In [None]:
# Load the submission template; every column after the first is overwritten
# with the model's predictions.
sample_submission = pd.read_csv(df_submission_path)

# `df_testb` was a typo for `df_test` and raised a NameError.
# Select the columns of the training feature matrix `X`, in the same order, so
# the model sees exactly the features it was fit on (X excludes 'base_date').
# Assumes the test table contains every training feature column — a missing
# column raises a KeyError here.
test_features = df_test[X.columns]

sample_submission.iloc[:, 1:] = regressor.predict(test_features)

In [None]:
# Write the filled-in submission to the working directory without the index column.
sample_submission.to_csv(
    "./optuna_Data_26_weight.csv",
    index=False,
)

In [None]:
# Display the submission frame as the cell output for a visual check.
sample_submission