In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import json

import catboost as cb
from sklearn.model_selection import StratifiedGroupKFold
# record the library version in the notebook output
print("CatBoost version:", cb.__version__)

import optuna
# record the library version in the notebook output
print("Optuna version:", optuna.__version__)

# local modules: ../src is appended to sys.path so that `preproc` can be imported;
# the path is relative, so it resolves against the current working directory
import sys
sys.path.append("../src")
from preproc import process_train_data

CatBoost version: 1.2.7
Optuna version: 4.0.0


  from .autonotebook import tqdm as notebook_tqdm


***
### load and preprocess data

In [2]:
# data directories (relative paths, resolved against the working directory)
path_raw = Path("../data/raw")
path_processed = Path("../data/processed")
path_results = Path("../data/results")

# read the raw train / test tables from path_raw
df_train, df_test = (
    pd.read_csv(path_raw / csv_name) for csv_name in ("train.csv", "test.csv")
)

# last expression of the cell: render the raw training frame
df_train

Unnamed: 0,Id,GameRulesetName,agent1,agent2,Properties,Format,Time,Discrete,Realtime,Turns,...,DoLudeme,Trigger,PlayoutsPerSecond,MovesPerSecond,EnglishRules,LudRules,num_wins_agent1,num_draws_agent1,num_losses_agent1,utility_agent1
0,0,00Y,MCTS-ProgressiveHistory-0.1-MAST-false,MCTS-ProgressiveHistory-0.6-Random200-false,1,1,1,1,0,1,...,0,1,298.07,18877.17,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",4,0,11,-0.466667
1,1,00Y,MCTS-ProgressiveHistory-0.1-MAST-false,MCTS-UCB1GRAVE-0.6-NST-true,1,1,1,1,0,1,...,0,1,298.07,18877.17,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",5,0,10,-0.333333
2,2,00Y,MCTS-ProgressiveHistory-0.1-MAST-true,MCTS-UCB1-0.1-NST-false,1,1,1,1,0,1,...,0,1,298.07,18877.17,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",7,0,8,-0.066667
3,3,00Y,MCTS-ProgressiveHistory-0.1-MAST-true,MCTS-UCB1-0.6-NST-false,1,1,1,1,0,1,...,0,1,298.07,18877.17,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",5,0,10,-0.333333
4,4,00Y,MCTS-ProgressiveHistory-0.1-MAST-true,MCTS-UCB1GRAVE-1.41421356237-NST-false,1,1,1,1,0,1,...,0,1,298.07,18877.17,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",5,0,10,-0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233229,233229,Zuz_Mel_7x7,MCTS-UCB1Tuned-1.41421356237-NST-false,MCTS-ProgressiveHistory-1.41421356237-Random20...,1,1,1,1,0,1,...,0,0,157.52,157174.58,7x7 board. 24 pieces per player. Pieces begin ...,"(game ""Zuz Mel (7x7)"" (players 2) (equipment {...",2,0,13,-0.733333
233230,233230,Zuz_Mel_7x7,MCTS-UCB1Tuned-1.41421356237-Random200-false,MCTS-UCB1-0.6-MAST-false,1,1,1,1,0,1,...,0,0,157.52,157174.58,7x7 board. 24 pieces per player. Pieces begin ...,"(game ""Zuz Mel (7x7)"" (players 2) (equipment {...",9,1,5,0.266667
233231,233231,Zuz_Mel_7x7,MCTS-UCB1Tuned-1.41421356237-Random200-false,MCTS-UCB1GRAVE-1.41421356237-NST-false,1,1,1,1,0,1,...,0,0,157.52,157174.58,7x7 board. 24 pieces per player. Pieces begin ...,"(game ""Zuz Mel (7x7)"" (players 2) (equipment {...",11,3,1,0.666667
233232,233232,Zuz_Mel_7x7,MCTS-UCB1Tuned-1.41421356237-Random200-false,MCTS-UCB1GRAVE-1.41421356237-NST-true,1,1,1,1,0,1,...,0,0,157.52,157174.58,7x7 board. 24 pieces per player. Pieces begin ...,"(game ""Zuz Mel (7x7)"" (players 2) (equipment {...",24,2,4,0.666667


In [3]:
# Options forwarded to process_train_data: no scaling, no position features,
# no text features.
preproc_options = dict(
    scale=False,
    include_position_features=False,
    include_text_features=False,
)

# NOTE: df_train is rebound to the processed frame here, so re-running this
# cell without re-running the load cell feeds already-processed data back in.
df_train, numerical_cols, categorical_cols, encoder, scaler = process_train_data(
    df_train, **preproc_options
)

# Report how many columns of each kind were returned
print("Numerical Columns:", len(numerical_cols))
print("Categorical Columns:", len(categorical_cols))

number of all nan cols:  18
number of constant cols:  198
Numerical Columns: 588
Categorical Columns: 10


***
### optimization



In [4]:
# Cross-validation configuration
num_folds = 3                      # number of CV folds
group_col = "GameRulesetName"      # rows sharing this value stay in the same fold
y_col = "utility_agent1_rank"      # column used to stratify the folds

# Shuffled, seeded splitter; the (train_idx, val_idx) pairs are materialised
# once into split_list so they can be reused as-is by later cells.
gkf = StratifiedGroupKFold(n_splits=num_folds, shuffle=True, random_state=2113)
split_list = list(
    gkf.split(
        X=df_train,
        y=df_train[y_col],
        groups=df_train[group_col],
    )
)



In [5]:
def train_and_score(numerical_cols, categorical_cols):
    """Cross-validate a CatBoost regressor on a subset of features.

    For every (train, validation) index pair in the notebook-level
    ``split_list`` a fresh ``CatBoostRegressor`` is fitted on the matching
    rows of the notebook-level ``df_train`` and evaluated on the held-out rows.

    Parameters
    ----------
    numerical_cols : list of str
        Numerical feature columns to use.
    categorical_cols : list of str
        Categorical feature columns to use; passed to CatBoost as
        ``cat_features``.

    Returns
    -------
    float
        Mean over folds of the RMSE between the predictions (clipped to
        [-1, 1]) and the ``utility_agent1`` column.
    """
    target = 'utility_agent1'
    feature_cols = numerical_cols + categorical_cols

    # Fixed model hyper-parameters: only the feature subset varies between calls
    model_params = dict(
        objective="RMSE",
        learning_rate=0.1,
        depth=10,
        iterations=1000,
        eval_metric='RMSE',
        verbose=0,
        random_seed=2112,
        l2_leaf_reg=1.,
        random_strength=0.2,
        min_data_in_leaf=50,
        rsm=0.8,
    )

    def _make_pool(frame):
        # Wrap the selected features and the target of `frame` in a CatBoost Pool
        return cb.Pool(
            data=frame[feature_cols],
            label=frame[target],
            cat_features=categorical_cols
        )

    fold_rmses = []
    for train_index, val_index in split_list:
        fold_train = df_train.iloc[train_index]
        fold_val = df_train.iloc[val_index]

        regressor = cb.CatBoostRegressor(**model_params)
        regressor.fit(_make_pool(fold_train), verbose=False)

        # Predict the held-out rows and clip the predictions to [-1, 1]
        y_pred = np.clip(regressor.predict(_make_pool(fold_val)), -1, 1)

        # RMSE of the clipped predictions on this fold
        y_val = fold_val[target]
        fold_rmses.append(np.sqrt(np.mean((y_pred - y_val) ** 2)))

    return np.mean(fold_rmses)

In [6]:
def objective(trial):
    """Optuna objective: choose a feature subset and return its CV score.

    One integer parameter ``use_<column>`` in {0, 1} is suggested for every
    entry of the notebook-level ``numerical_cols`` and ``categorical_cols``;
    the columns whose flag is 1 are passed to ``train_and_score``.

    Returns
    -------
    float
        The value returned by ``train_and_score`` for the chosen columns, or
        ``1.`` when no column at all was chosen.
    """
    def _choose(columns):
        # Ask the trial for a 0/1 flag per column (every column is queried,
        # in order) and keep the columns flagged with 1.
        kept = []
        for name in columns:
            if trial.suggest_int(f'use_{name}', 0, 1) == 1:
                kept.append(name)
        return kept

    chosen_numerical = _choose(numerical_cols)
    chosen_categorical = _choose(categorical_cols)

    # Nothing selected: skip training and return the fixed fallback score
    if not chosen_numerical and not chosen_categorical:
        return 1.

    return train_and_score(chosen_numerical, chosen_categorical)

In [7]:
# Set to True to actually run trials; with False this cell only (re)attaches
# to the study stored in optuna_catb.db so the cells below can read its results.
do_optimize = False
# Wall-clock cap in seconds (72 h). It is passed to every study.optimize() call
# below, so each phase stops at whichever comes first: its n_trials or this cap.
timeout = 3600 * 72

# Start with QMC for good coverage
study = optuna.create_study(
    study_name="optuna_catb",
    direction='minimize',
    storage='sqlite:///optuna_catb.db',
    load_if_exists=True,
    sampler=optuna.samplers.QMCSampler()
)

if do_optimize:
    # NOTE(review): n_trials counts the trials run by this call, so re-running
    # the cell against an existing study adds another 200 QMC trials.
    study.optimize(objective, n_trials=200, timeout=timeout)

    # Switch to TPE for refined search (same study name and storage, so the
    # trials recorded so far stay in the study)
    study = optuna.create_study(
        study_name="optuna_catb",
        direction='minimize', 
        storage='sqlite:///optuna_catb.db',
        load_if_exists=True,
        sampler=optuna.samplers.TPESampler(
            n_startup_trials=1,    # Minimal random warm-up before TPE takes over
            n_ei_candidates=100,   # Consider more candidates
            multivariate=True,     # Enable multivariate sampling
            constant_liar=True     # Help with parallel optimization
        )
    )
    study.optimize(objective, n_trials=10_000, timeout=timeout)

  sampler=optuna.samplers.QMCSampler()
[I 2024-11-16 01:18:08,073] Using an existing study with name 'optuna_catb' instead of creating a new one.


In [8]:
# All recorded trials as a DataFrame; show the 20 with the lowest objective value
trials_df = study.trials_dataframe()
trials_df.sort_values(by="value").head(20)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_use_Absolute,params_use_AbsoluteDirections,params_use_AddDecision,params_use_AddDecisionFrequency,params_use_AddEffect,...,params_use_agent1_exploration_const,params_use_agent1_playout,params_use_agent1_score_bounds,params_use_agent1_selection,params_use_agent2,params_use_agent2_exploration_const,params_use_agent2_playout,params_use_agent2_score_bounds,params_use_agent2_selection,state
824,824,0.423893,2024-11-15 22:09:42.846021,2024-11-15 22:13:57.407634,0 days 00:04:14.561613,1,0,1,1,0,...,1,1,0,1,1,1,1,1,1,COMPLETE
701,701,0.42474,2024-11-15 13:39:31.067476,2024-11-15 13:43:45.795040,0 days 00:04:14.727564,0,0,1,1,0,...,1,1,0,1,0,1,1,1,1,COMPLETE
834,834,0.424985,2024-11-15 22:50:00.265788,2024-11-15 22:53:59.907351,0 days 00:03:59.641563,1,0,1,1,0,...,0,1,0,1,0,1,1,1,1,COMPLETE
857,857,0.425411,2024-11-16 00:24:07.231223,2024-11-16 00:28:25.406332,0 days 00:04:18.175109,1,0,1,1,1,...,1,1,0,1,1,1,1,0,1,COMPLETE
702,702,0.42567,2024-11-15 13:43:45.841516,2024-11-15 13:48:02.752284,0 days 00:04:16.910768,0,0,1,1,0,...,1,1,0,1,0,1,1,1,1,COMPLETE
808,808,0.425756,2024-11-15 21:04:51.740040,2024-11-15 21:09:06.353264,0 days 00:04:14.613224,1,0,1,1,0,...,1,1,0,1,0,1,1,1,1,COMPLETE
766,766,0.425851,2024-11-15 18:11:00.706903,2024-11-15 18:15:17.139083,0 days 00:04:16.432180,0,1,0,1,0,...,1,1,0,1,0,1,1,1,1,COMPLETE
829,829,0.425909,2024-11-15 22:29:54.496178,2024-11-15 22:34:13.391231,0 days 00:04:18.895053,1,0,1,1,0,...,1,1,0,1,1,1,1,0,1,COMPLETE
811,811,0.426007,2024-11-15 21:17:23.435368,2024-11-15 21:21:39.163743,0 days 00:04:15.728375,1,0,1,1,0,...,1,1,0,1,0,1,1,1,1,COMPLETE
707,707,0.426084,2024-11-15 14:04:26.783475,2024-11-15 14:08:45.241691,0 days 00:04:18.458216,0,0,1,1,0,...,1,1,0,1,0,1,1,1,1,COMPLETE


In [9]:
# Parameters of the best trial: one 'use_<column>' flag per feature
best_params = study.best_trial.params


def _is_selected(col):
    # A column counts as selected when its flag is 1; a missing flag counts as 0
    return best_params.get(f'use_{col}', 0) == 1


# Split the selection by feature kind, then concatenate (numerical first)
selected_numerical = list(filter(_is_selected, numerical_cols))
selected_categorical = list(filter(_is_selected, categorical_cols))
selected_features = selected_numerical + selected_categorical

print("Selected features:")
print("len(numerical): ", len(selected_numerical), "/", len(numerical_cols))
print("len(categorical): ", len(selected_categorical), "/", len(categorical_cols))
print("len(selected_features): ", len(selected_features), "/", len(numerical_cols) + len(categorical_cols))


# Persist the selection as JSON in the current working directory
selected_features_dict = {
    "numerical": selected_numerical,
    "categorical": selected_categorical
}
with open('select_optuna_catb.json', 'w') as f:
    f.write(json.dumps(selected_features_dict, indent=2))

print("Selected features have been saved to 'select_optuna_catb.json'")

Selected features:
len(numerical):  297 / 588
len(categorical):  8 / 10
len(selected_features):  305 / 598
Selected features have been saved to 'select_optuna_catb.json'


***