In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import json

import catboost as cb
from sklearn.model_selection import StratifiedGroupKFold

print("Catboost version:", cb.__version__)

import optuna
from optuna.visualization import (
    plot_edf
    , plot_optimization_history
    , plot_parallel_coordinate
    , plot_param_importances
    , plot_slice
)

print("Optuna version:", optuna.__version__)

# local modules
import sys
sys.path.append("../src")
from preproc import process_train_data

Catboost version: 1.2.7


  from .autonotebook import tqdm as notebook_tqdm


Optuna version: 4.0.0


***
### load and preprocess data

In [2]:
# Project directories, all relative to the notebook's working directory.
path_raw, path_processed, path_results = (
    Path("../data/raw"),
    Path("../data/processed"),
    Path("../data/results"),
)

# Read the raw train/test tables from the raw-data directory.
df_train, df_test = (
    pd.read_csv(path_raw / csv_name) for csv_name in ("train.csv", "test.csv")
)

# Display the raw training frame.
df_train

Unnamed: 0,Id,GameRulesetName,agent1,agent2,Properties,Format,Time,Discrete,Realtime,Turns,...,DoLudeme,Trigger,PlayoutsPerSecond,MovesPerSecond,EnglishRules,LudRules,num_wins_agent1,num_draws_agent1,num_losses_agent1,utility_agent1
0,0,00Y,MCTS-ProgressiveHistory-0.1-MAST-false,MCTS-ProgressiveHistory-0.6-Random200-false,1,1,1,1,0,1,...,0,1,298.07,18877.17,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",4,0,11,-0.466667
1,1,00Y,MCTS-ProgressiveHistory-0.1-MAST-false,MCTS-UCB1GRAVE-0.6-NST-true,1,1,1,1,0,1,...,0,1,298.07,18877.17,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",5,0,10,-0.333333
2,2,00Y,MCTS-ProgressiveHistory-0.1-MAST-true,MCTS-UCB1-0.1-NST-false,1,1,1,1,0,1,...,0,1,298.07,18877.17,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",7,0,8,-0.066667
3,3,00Y,MCTS-ProgressiveHistory-0.1-MAST-true,MCTS-UCB1-0.6-NST-false,1,1,1,1,0,1,...,0,1,298.07,18877.17,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",5,0,10,-0.333333
4,4,00Y,MCTS-ProgressiveHistory-0.1-MAST-true,MCTS-UCB1GRAVE-1.41421356237-NST-false,1,1,1,1,0,1,...,0,1,298.07,18877.17,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",5,0,10,-0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233229,233229,Zuz_Mel_7x7,MCTS-UCB1Tuned-1.41421356237-NST-false,MCTS-ProgressiveHistory-1.41421356237-Random20...,1,1,1,1,0,1,...,0,0,157.52,157174.58,7x7 board. 24 pieces per player. Pieces begin ...,"(game ""Zuz Mel (7x7)"" (players 2) (equipment {...",2,0,13,-0.733333
233230,233230,Zuz_Mel_7x7,MCTS-UCB1Tuned-1.41421356237-Random200-false,MCTS-UCB1-0.6-MAST-false,1,1,1,1,0,1,...,0,0,157.52,157174.58,7x7 board. 24 pieces per player. Pieces begin ...,"(game ""Zuz Mel (7x7)"" (players 2) (equipment {...",9,1,5,0.266667
233231,233231,Zuz_Mel_7x7,MCTS-UCB1Tuned-1.41421356237-Random200-false,MCTS-UCB1GRAVE-1.41421356237-NST-false,1,1,1,1,0,1,...,0,0,157.52,157174.58,7x7 board. 24 pieces per player. Pieces begin ...,"(game ""Zuz Mel (7x7)"" (players 2) (equipment {...",11,3,1,0.666667
233232,233232,Zuz_Mel_7x7,MCTS-UCB1Tuned-1.41421356237-Random200-false,MCTS-UCB1GRAVE-1.41421356237-NST-true,1,1,1,1,0,1,...,0,0,157.52,157174.58,7x7 board. 24 pieces per player. Pieces begin ...,"(game ""Zuz Mel (7x7)"" (players 2) (equipment {...",24,2,4,0.666667


In [3]:
# Read the previously saved feature-selection result (a JSON object).
feature_selection = json.loads(
    Path('../feat_selection/select_optuna_lgbm.json').read_text()
)

# Pull out the two feature lists; a missing key falls back to an empty list.
numerical_cols, categorical_cols = (
    feature_selection.get(kind, []) for kind in ('numerical', 'categorical')
)

print(f"Numerical features: {len(numerical_cols)}")
print(f"Categorical features: {len(categorical_cols)}")

Numerical features: 296
Categorical features: 5


In [4]:
# Run the project's preprocessing on the training frame without scaling.
# NOTE: this cell rebinds df_train, numerical_cols and categorical_cols, so
# re-running it feeds the already-processed values back into
# process_train_data. Re-run the loading cells first if in doubt.
(
    df_train,
    numerical_cols,
    categorical_cols,
    encoder,
    scaler,
) = process_train_data(
    df_train,
    numerical_cols=numerical_cols,
    categorical_cols=categorical_cols,
    scale=False,
)

# Report how many columns of each kind came back from preprocessing.
print(f"Numerical Columns: {len(numerical_cols)}")
print(f"Categorical Columns: {len(categorical_cols)}")

number of all nan cols:  0
number of constant cols:  0
Numerical Columns: 296
Categorical Columns: 5


***
### optimization



In [5]:
# Cross-validation layout: 5 folds, grouped by ruleset name and stratified on
# the rank column (presumably added by process_train_data; verify).
num_folds = 5
group_col = "GameRulesetName"
y_col = "utility_agent1_rank"

gkf = StratifiedGroupKFold(n_splits=num_folds, shuffle=True, random_state=2114)

# Materialise the (train_index, val_index) pairs once so every trial is scored
# on exactly the same folds.
split_list = [
    *gkf.split(
        X=df_train,
        y=df_train[y_col],
        groups=df_train[group_col],
    )
]



In [6]:
def train_and_score(params):
    """Cross-validate a CatBoost regressor and return its mean fold RMSE.

    For every (train, validation) index pair in the module-level
    ``split_list``, a fresh ``cb.CatBoostRegressor(**params)`` is fitted on the
    training rows of ``df_train`` and evaluated on the validation rows.
    Predictions are clipped to [-1, 1] before the RMSE is computed.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded unchanged to ``cb.CatBoostRegressor``.

    Returns
    -------
    float
        Mean of the per-fold RMSE values.
    """
    target = 'utility_agent1'
    # The feature list is identical for every fold and both pools: build it once.
    feature_cols = numerical_cols + categorical_cols
    fold_rmses = []

    # Iterate the precomputed folds directly; no fold counter is needed.
    for train_index, val_index in split_list:
        train_rows = df_train.iloc[train_index]
        val_rows = df_train.iloc[val_index]
        y_train, y_val = train_rows[target], val_rows[target]

        # Wrap both partitions in CatBoost pools with the categorical columns declared.
        train_pool = cb.Pool(
            data=train_rows[feature_cols],
            label=y_train,
            cat_features=categorical_cols
        )
        val_pool = cb.Pool(
            data=val_rows[feature_cols],
            label=y_val,
            cat_features=categorical_cols
        )

        # A new model per fold so no state leaks between folds.
        model = cb.CatBoostRegressor(**params)
        model.fit(
            train_pool,
            verbose=False
        )

        # Clip predictions to [-1, 1] (presumably the valid range of the
        # target; confirm) before scoring.
        y_pred = np.clip(model.predict(val_pool), -1, 1)

        # Fold RMSE between clipped predictions and the validation target.
        rmse = np.sqrt(np.mean((y_pred - y_val) ** 2))
        fold_rmses.append(rmse)

    return np.mean(fold_rmses)

In [7]:
# CatBoost settings held constant across all trials; merged into every sampled
# configuration and into the final best-parameter dict.
fixed_params = dict(
    objective="RMSE",
    verbose=False,
    random_seed=2112,
    learning_rate=0.1,
    iterations=1000,
    bootstrap_type='Bayesian',
    sampling_frequency='PerTreeLevel',
)

def objective(trial):
    """Optuna objective: sample one CatBoost configuration and score it.

    Draws the tunable hyperparameters from ``trial``, overlays the
    module-level ``fixed_params`` (which win on any key clash), and returns
    the value produced by ``train_and_score`` for that configuration.
    """
    # 'subsample' is deliberately not tuned (it was commented out of the
    # search space; presumably because it does not apply to the Bayesian
    # bootstrap used in fixed_params; confirm against the CatBoost docs).
    # NOTE(review): CatBoost documents min_data_in_leaf as effective only
    # with the Depthwise/Lossguide grow policies; no grow_policy is set here,
    # so this dimension may be a no-op. Verify before trusting its importance.
    sampled = dict(
        depth=trial.suggest_int('depth', 6, 12),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1e-2, 1e1, log=True),
        random_strength=trial.suggest_float('random_strength', 1e-2, 1e1, log=True),
        bagging_temperature=trial.suggest_float('bagging_temperature', 0.0, 1.5),
        rsm=trial.suggest_float('rsm', 0.4, 1.0),
        min_data_in_leaf=trial.suggest_int('min_data_in_leaf', 50, 500),
        max_ctr_complexity=trial.suggest_int('max_ctr_complexity', 1, 15),
    )

    # Sampled values first, fixed settings last, so the fixed ones take precedence.
    return train_and_score({**sampled, **fixed_params})

In [8]:
# Set to True to actually run (or resume) the search; with False the cell only
# opens the stored study so the analysis cells below can read it.
do_optimize = False
timeout = 3600 * 48  # wall-clock budget for study.optimize, in seconds (48 h)

# The study name keeps its ".db" suffix on purpose: it is the key under which
# the existing trials are stored, so renaming it would start an empty study.
study = optuna.create_study(
    study_name="catboost_bayesian.db",
    direction='minimize',
    storage='sqlite:///catboost_bayesian.db',
    load_if_exists=True,
    sampler=optuna.samplers.TPESampler(
        n_startup_trials=25,
        n_ei_candidates=50,
        # Seed the sampler as well: CatBoost (random_seed) and the CV splitter
        # (random_state) are already seeded, and an unseeded sampler was the
        # remaining source of run-to-run variation in the proposed trials.
        seed=2113,
    ),
)

if do_optimize:
    # Sequential trials (n_jobs=1); stops at 5000 trials or when the timeout
    # elapses, whichever comes first.
    study.optimize(
        objective,
        n_trials=5000,
        timeout=timeout,
        n_jobs=1,
        gc_after_trial=True,
    )

[I 2024-11-11 08:41:39,928] Using an existing study with name 'catboost_bayesian.db' instead of creating a new one.


In [9]:
# Twenty best trials, lowest objective value first.
(
    study.trials_dataframe()
    .sort_values("value")
    .head(20)
)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_bagging_temperature,params_depth,params_l2_leaf_reg,params_max_ctr_complexity,params_min_data_in_leaf,params_random_strength,params_rsm,state
441,441,0.427265,2024-11-11 00:21:23.242418,2024-11-11 00:27:17.249201,0 days 00:05:54.006783,0.159672,10,0.935929,10,102,0.166933,0.548125,COMPLETE
519,519,0.427317,2024-11-11 07:54:41.529511,2024-11-11 08:00:21.500174,0 days 00:05:39.970663,0.111046,10,0.736915,11,408,0.012407,0.497634,COMPLETE
329,329,0.427353,2024-11-10 10:18:16.081060,2024-11-10 10:25:40.532949,0 days 00:07:24.451889,0.310146,11,1.203798,10,237,0.023285,0.486168,COMPLETE
235,235,0.427376,2024-11-09 22:18:11.190335,2024-11-09 22:26:37.433834,0 days 00:08:26.243499,0.329326,11,0.04387,5,66,0.02881,0.693598,COMPLETE
326,326,0.42745,2024-11-10 09:55:51.287439,2024-11-10 10:03:19.260979,0 days 00:07:27.973540,0.371382,11,1.278825,9,381,0.021534,0.496029,COMPLETE
211,211,0.427597,2024-11-09 19:11:37.052777,2024-11-09 19:18:45.802924,0 days 00:07:08.750147,0.4273,11,0.719573,9,473,0.016077,0.430568,COMPLETE
313,313,0.427615,2024-11-10 08:15:23.235815,2024-11-10 08:23:01.369309,0 days 00:07:38.133494,0.394432,11,0.956269,9,135,0.016719,0.532703,COMPLETE
404,404,0.427691,2024-11-10 20:50:30.231170,2024-11-10 20:56:10.590821,0 days 00:05:40.359651,0.421119,10,1.197374,11,322,0.116033,0.494596,COMPLETE
303,303,0.427762,2024-11-10 07:05:59.262854,2024-11-10 07:13:32.125944,0 days 00:07:32.863090,0.37022,11,1.131179,10,113,0.047943,0.521334,COMPLETE
358,358,0.427788,2024-11-10 16:26:57.768848,2024-11-10 16:33:11.129164,0 days 00:06:13.360316,0.2622,10,0.940507,12,384,0.124107,0.620315,COMPLETE


In [10]:
# Optimization history of the study: objective value across trials.
plot_optimization_history(study)

In [11]:
# Estimated importance of each tuned hyperparameter for the objective value.
plot_param_importances(study)

In [12]:
# Slice plot: objective value against each hyperparameter, one panel per parameter.
plot_slice(study)


In [13]:
# Empirical distribution function of the objective values reached by the study.
plot_edf(study)


In [14]:
# Parallel-coordinate view of the sampled hyperparameters and their objective values.
plot_parallel_coordinate(study)


In [15]:
# Full configuration of the best trial: the fixed settings overlaid with the
# tuned values stored in the study.
best_params = {**fixed_params, **study.best_params}
best_params

{'objective': 'RMSE',
 'verbose': False,
 'random_seed': 2112,
 'learning_rate': 0.1,
 'iterations': 1000,
 'bootstrap_type': 'Bayesian',
 'sampling_frequency': 'PerTreeLevel',
 'depth': 10,
 'l2_leaf_reg': 0.9359292512255313,
 'random_strength': 0.16693257025108216,
 'bagging_temperature': 0.15967219011745193,
 'rsm': 0.548125395769786,
 'min_data_in_leaf': 102,
 'max_ctr_complexity': 10}

***