In [1]:
!pip install catboost

import catboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import warnings
warnings.filterwarnings(action='ignore')

import pandas as pd
from tqdm.auto import tqdm
import numpy as np
from datetime import date, timedelta
import time
from google.colab import drive
drive.mount('/content/drive')
import random
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, PowerTransformer,QuantileTransformer ,LabelEncoder, OneHotEncoder, MinMaxScaler
import seaborn as sns
! pip install prince
import prince

! pip install optuna
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/20/37/bc4e0ddc30c07a96482abf1de7ed1ca54e59bba2026a33bca6d2ef286e5b/catboost-0.24.4-cp36-none-manylinux1_x86_64.whl (65.7MB)
[K     |████████████████████████████████| 65.8MB 71kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.4
Mounted at /content/drive
Collecting prince
  Downloading https://files.pythonhosted.org/packages/94/6c/491a3fabfd1ce75e285a4fe4200fccde5d83664733541a3a74c0b02e77fb/prince-0.7.1-py3-none-any.whl
Installing collected packages: prince
Successfully installed prince-0.7.1
Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/91/88/9c53460b97c61bce926dfe9dce51e4887c283416ff89ed30af0b73f44efa/optuna-2.5.0-py3-none-any.whl (287kB)
[K     |████████████████████████████████| 296kB 13.1MB/s 
Collecting alembic
[?25l  Downloading https://files.pythonhosted.org/packages/a9/53/daab5c96e22e9ed1c9f8ca4e3256e72213ade42d519b6254c32e59610967/al

In [2]:
from sklearn.metrics import mean_squared_error

In [3]:
path = '/content/drive/Shareddrives/dacon/Playground/'
train = pd.read_csv(path + 'train.csv', index_col=['id'])
test = pd.read_csv(path + 'test.csv', index_col=['id'])
submission = pd.read_csv(path + 'sample_submission.csv', index_col=['id'])

train_data = train.iloc[:,:-1]
train_target = train.iloc[:, -1]
test_data = test.copy()

In [4]:
cat_features = [feature for feature in train_data.columns if 'cat' in feature]
cont_features = [feature for feature in train_data.columns if 'cont' in feature]
for feature in cat_features:
    train_data[feature] = train_data[feature].astype('category')
    test_data[feature] = test_data[feature].astype('category')

In [9]:
def objective_CAT(trial: Trial, X, y):

  params = {} #initialize parameters
  params['random_state'] = 42
  params['iterations'] = 500
  params['subsample'] = trial.suggest_loguniform('subsample', 0.3, 1)
  params['max_depth'] = trial.suggest_int('max_depth', 3, 10)
  params['l2_leaf_reg'] = trial.suggest_loguniform('l2_leaf_reg', 0.3, 100)
  params['objective'] = trial.suggest_categorical('objective', ['MAE', 'MAPE', 'Poisson', 'RMSE'])
  params['bootstrap_type'] = trial.suggest_categorical('bootstrap_type', ['Bernoulli', 'MVS', 'Poisson'])
  params['min_data_in_leaf'] = trial.suggest_int('min_data_in_leaf', 1, 200)
  # params['max_leaves'] = trial.suggest_int('max_leaves', 20, 200)
  params['cat_features'] = cat_features
  params['task_type'] = 'GPU'
  params['max_bin'] = 36
  params['learning_rate'] = 0.01


  # train_data = X.iloc[:, :-1]
  # train_target = X.iloc[:,-1]
  X_train, X_valid, Y_train,  Y_valid = train_test_split(X, y, test_size=0.20, random_state=42)
  # print('#############################    BEGIN   #######################################')
  model = catboost.CatBoostRegressor(**params)
  model.fit(X_train, Y_train,eval_set=[(X_valid, Y_valid)], early_stopping_rounds=500, verbose=False)
  # print('#############################    END   #######################################')

  pred = model.predict(train_data)
  # score = cv_evaluation_regression(X, model)
  score = mean_squared_error(pred, train_target)

  return score

In [10]:
study = optuna.create_study(direction = 'minimize')
study.optimize(lambda trial : objective_CAT(trial, train_data, train_target), n_trials=200)

[32m[I 2021-02-11 14:12:22,479][0m A new study created in memory with name: no-name-58a1221e-66c7-4047-96d8-d101fb069884[0m
[32m[I 2021-02-11 14:12:39,547][0m Trial 0 finished with value: 0.7902204452569493 and parameters: {'subsample': 0.5082696741792726, 'max_depth': 9, 'l2_leaf_reg': 12.044556297864183, 'objective': 'MAPE', 'bootstrap_type': 'Bernoulli', 'min_data_in_leaf': 141}. Best is trial 0 with value: 0.7902204452569493.[0m
[32m[I 2021-02-11 14:12:54,303][0m Trial 1 finished with value: 0.79881175658621 and parameters: {'subsample': 0.5902417978315306, 'max_depth': 7, 'l2_leaf_reg': 2.462268350464739, 'objective': 'MAPE', 'bootstrap_type': 'MVS', 'min_data_in_leaf': 25}. Best is trial 0 with value: 0.7902204452569493.[0m
[32m[I 2021-02-11 14:13:07,274][0m Trial 2 finished with value: 0.746300176580536 and parameters: {'subsample': 0.5003940523955505, 'max_depth': 5, 'l2_leaf_reg': 20.00710127448401, 'objective': 'RMSE', 'bootstrap_type': 'Bernoulli', 'min_data_in_le

In [11]:
optuna.visualization.plot_optimization_history(study)

In [12]:
study.best_params

{'bootstrap_type': 'Bernoulli',
 'l2_leaf_reg': 0.5925500443911146,
 'max_depth': 10,
 'min_data_in_leaf': 95,
 'objective': 'RMSE',
 'subsample': 0.9965285997680801}