# Setup

In [None]:
!pip install catboost

In [None]:
!pip install optuna


In [None]:
import pandas as pd
import numpy as np

from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error

import optuna
from optuna.samplers import TPESampler

# Load Data

In [None]:
# Locations of the input CSV files; all three live in the same directory.
data_dir = '/Users/namwoo/Desktop/jeju'

df_submission_path = f'{data_dir}/sample_submission.csv'
df_train_path = f'{data_dir}/preprocessed_df_train_26.csv'
df_test_path = f'{data_dir}/preprocessed_df_test_26.csv'

In [None]:
# Read the training and test CSV files into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (df_train_path, df_test_path)
)

# Preprocessing

In [None]:
# The label is the 'target' column; the features are every other column
# except 'base_date'.
y = df_train['target']
X = df_train.drop(['target', 'base_date'], axis=1)

In [None]:
# Collect the names of the object-dtype columns; they are later passed to
# CatBoost as categorical features.
cat_cols = [
    column for column, dtype in X.dtypes.items() if dtype == "object"
]
n_categorical = len(cat_cols)
print("Number of Categorical features: ", n_categorical)

In [None]:
# Hold out 10% of the rows for validation; the fixed random_state keeps the
# split repeatable between runs.
split_kwargs = {"test_size": 0.1, "random_state": 2022}
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_kwargs)

# Optuna

In [None]:
# Number of iterations without validation improvement before training stops.
EARLY_STOPPING_ROUND = 100


def objective(trial):
    """Fit one CatBoost candidate and return its RMSE on the validation split.

    Hyperparameters are sampled from ``trial``; the model is trained on
    ``(X_train, y_train)`` and evaluated on ``(X_valid, y_valid)`` with early
    stopping, keeping the best iteration.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The validation RMSE as a float (lower is better).
    """
    # The suggest_* calls keep their original order so a seeded sampler
    # draws the same sequence of values.
    param = {
        # Fixed upper bound on trees; early stopping picks the actual count.
        "n_estimators": 1000,
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform and samples from the same distribution.
        "learning_rate": trial.suggest_float(
            "learning_rate", 0.1, 0.2, log=True),
        "bagging_temperature": trial.suggest_float(
            "bagging_temperature", 0.01, 100.00, log=True),
        "max_depth": trial.suggest_int("max_depth", 4, 16),
        "random_strength": trial.suggest_int("random_strength", 0, 100),
        "colsample_bylevel": trial.suggest_float(
            "colsample_bylevel", 0.4, 1.0),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 3e-5),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 100, 500),
    }

    regressor = CatBoostRegressor(**param,
                                  task_type='CPU',
                                  loss_function='RMSE',
                                  use_best_model=True)

    # NOTE(review): the .copy() calls look defensive; confirm whether they
    # are still needed before removing them.
    regressor.fit(X_train.copy(), y_train.copy(),
                  eval_set=[(X_valid.copy(), y_valid.copy())],
                  cat_features=cat_cols,
                  early_stopping_rounds=EARLY_STOPPING_ROUND,
                  verbose=10)

    # Take the square root of the MSE explicitly: the `squared=False`
    # argument of mean_squared_error is deprecated/removed in newer
    # scikit-learn releases.
    predictions = regressor.predict(X_valid.copy())
    loss = float(np.sqrt(mean_squared_error(y_valid, predictions)))

    return loss

In [None]:
# Fix the sampler seed so the sequence of suggested parameters is repeatable.
sampler = TPESampler(seed=2022)

# No direction is passed, so Optuna's default direction applies.
study = optuna.create_study(study_name='catboost', sampler=sampler)

# Evaluate objective() for 30 trials.
study.optimize(objective, n_trials=30)

# Report the best objective value, then the parameters that produced it.
for best_result in (study.best_value, study.best_params):
    print(best_result)

In [None]:
# Refit a model with the best hyperparameters found by the study.
# `param` exists only inside objective(), so the tuned values are read from
# the study; n_estimators was fixed (not tuned) there and is re-added here.
best_params = {"n_estimators": 1000, **study.best_params}

# NOTE(review): the search optimised with loss_function='RMSE' while this
# final fit uses 'MAE' — confirm the mismatch is intended.
regressor = CatBoostRegressor(**best_params,
                              task_type='CPU',
                              loss_function='MAE',
                              use_best_model=True)

regressor.fit(X_train.copy(), y_train.copy(),
              eval_set=[(X_valid.copy(), y_valid.copy())],
              cat_features=cat_cols,
              early_stopping_rounds=EARLY_STOPPING_ROUND,
              verbose=10)

In [None]:
sample_submission = pd.read_csv(df_submission_path)

# Predict on the test set using exactly the columns (and column order) the
# model was trained on; X had 'target' and 'base_date' removed.
test_features = df_test[X.columns]

# Write the predictions into every column after the first one.
sample_submission.iloc[:, 1:] = regressor.predict(test_features)

In [None]:
# Save the submission next to the notebook, without the DataFrame index.
submission_path = "./optuna_Data_26_weight.csv"
sample_submission.to_csv(submission_path, index=False)

In [None]:
# Show the submission DataFrame as the cell's output for a visual check.
sample_submission