In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import json

import lightgbm as lgb
from sklearn.model_selection import GroupKFold

print("Lightgbm version:", lgb.__version__)

import optuna
from optuna.visualization import (
    plot_edf
    , plot_optimization_history
    , plot_parallel_coordinate
    , plot_param_importances
    , plot_slice
)

print("Optuna version:", optuna.__version__)

# local modules
import sys
sys.path.append("../src")
from preproc import process_train_data

Lightgbm version: 4.5.0


  from .autonotebook import tqdm as notebook_tqdm


Optuna version: 4.0.0


***
### Load and preprocess data

In [2]:
# Project data directories (relative to the notebook location)
path_raw = Path("../data/raw")
path_processed = Path("../data/processed")
path_results = Path("../data/results")

# Read the raw train / test tables from the raw-data directory
df_train, df_test = (
    pd.read_csv(path_raw / csv_name) for csv_name in ("train.csv", "test.csv")
)

# Show the training frame as the cell output
df_train

Unnamed: 0,Id,GameRulesetName,agent1,agent2,Properties,Format,Time,Discrete,Realtime,Turns,...,DoLudeme,Trigger,PlayoutsPerSecond,MovesPerSecond,EnglishRules,LudRules,num_wins_agent1,num_draws_agent1,num_losses_agent1,utility_agent1
0,0,00Y,MCTS-ProgressiveHistory-0.1-MAST-false,MCTS-ProgressiveHistory-0.6-Random200-false,1,1,1,1,0,1,...,0,1,298.07,18877.17,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",4,0,11,-0.466667
1,1,00Y,MCTS-ProgressiveHistory-0.1-MAST-false,MCTS-UCB1GRAVE-0.6-NST-true,1,1,1,1,0,1,...,0,1,298.07,18877.17,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",5,0,10,-0.333333
2,2,00Y,MCTS-ProgressiveHistory-0.1-MAST-true,MCTS-UCB1-0.1-NST-false,1,1,1,1,0,1,...,0,1,298.07,18877.17,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",7,0,8,-0.066667
3,3,00Y,MCTS-ProgressiveHistory-0.1-MAST-true,MCTS-UCB1-0.6-NST-false,1,1,1,1,0,1,...,0,1,298.07,18877.17,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",5,0,10,-0.333333
4,4,00Y,MCTS-ProgressiveHistory-0.1-MAST-true,MCTS-UCB1GRAVE-1.41421356237-NST-false,1,1,1,1,0,1,...,0,1,298.07,18877.17,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",5,0,10,-0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233229,233229,Zuz_Mel_7x7,MCTS-UCB1Tuned-1.41421356237-NST-false,MCTS-ProgressiveHistory-1.41421356237-Random20...,1,1,1,1,0,1,...,0,0,157.52,157174.58,7x7 board. 24 pieces per player. Pieces begin ...,"(game ""Zuz Mel (7x7)"" (players 2) (equipment {...",2,0,13,-0.733333
233230,233230,Zuz_Mel_7x7,MCTS-UCB1Tuned-1.41421356237-Random200-false,MCTS-UCB1-0.6-MAST-false,1,1,1,1,0,1,...,0,0,157.52,157174.58,7x7 board. 24 pieces per player. Pieces begin ...,"(game ""Zuz Mel (7x7)"" (players 2) (equipment {...",9,1,5,0.266667
233231,233231,Zuz_Mel_7x7,MCTS-UCB1Tuned-1.41421356237-Random200-false,MCTS-UCB1GRAVE-1.41421356237-NST-false,1,1,1,1,0,1,...,0,0,157.52,157174.58,7x7 board. 24 pieces per player. Pieces begin ...,"(game ""Zuz Mel (7x7)"" (players 2) (equipment {...",11,3,1,0.666667
233232,233232,Zuz_Mel_7x7,MCTS-UCB1Tuned-1.41421356237-Random200-false,MCTS-UCB1GRAVE-1.41421356237-NST-true,1,1,1,1,0,1,...,0,0,157.52,157174.58,7x7 board. 24 pieces per player. Pieces begin ...,"(game ""Zuz Mel (7x7)"" (players 2) (equipment {...",24,2,4,0.666667


In [3]:
# Read the JSON file holding the feature-selection result
with open('../feat_selection/select_optuna_lgbm.json', 'r') as f:
    feature_selection = json.load(f)

# Pull out the two feature lists; a missing key falls back to an empty list
numerical_cols, categorical_cols = (
    feature_selection.get(kind, []) for kind in ('numerical', 'categorical')
)

# Report how many features of each kind were selected
print("Numerical features:", len(numerical_cols))
print("Categorical features:", len(categorical_cols))

Numerical features: 281
Categorical features: 6


In [4]:
# Run the project's preprocessing helper on the training frame with
# scale=False, passing in the previously selected feature lists.
# NOTE(review): process_train_data lives in ../src/preproc.py; judging by
# the names it returns the processed frame, the (possibly updated) column
# lists and the fitted encoder / scaler -- confirm against that module.
preproc_result = process_train_data(
    df_train,
    numerical_cols=numerical_cols,
    categorical_cols=categorical_cols,
    scale=False,
)
df_train, numerical_cols, categorical_cols, encoder, scaler = preproc_result

# Report the size of the column lists handed back by the helper
print("Numerical Columns:", len(numerical_cols))
print("Categorical Columns:", len(categorical_cols))

number of all nan cols:  0
number of constant cols:  0
Numerical Columns: 281
Categorical Columns: 6


***
### Hyperparameter optimization



In [5]:
# Cross-validation settings: fold count and the column whose values
# define the groups that must stay together in one fold
num_folds = 5
groups_col = 'GameRulesetName'

# Shuffle the rows with a fixed seed and renumber the index from zero
df_train = df_train.sample(frac=1, random_state=2113).reset_index(drop=True)

# Materialise the (train_index, val_index) pairs once so every trial
# is evaluated on exactly the same folds
gkf = GroupKFold(n_splits=num_folds)
split_list = [*gkf.split(df_train, groups=df_train[groups_col])]

In [6]:
def train_and_score(params):
    """Cross-validate one LightGBM parameter set and return its mean fold RMSE.

    Parameters
    ----------
    params : dict
        LightGBM training parameters. Must contain ``'num_iterations'``;
        that entry is removed from a copy of the dict and passed to
        ``lgb.train`` as ``num_boost_round``. The caller's dict is not
        modified.

    Returns
    -------
    float
        Mean over the folds in ``split_list`` of the RMSE between the
        predictions (clipped to [-1, 1]) and the ``utility_agent1`` target.

    Raises
    ------
    KeyError
        If ``params`` has no ``'num_iterations'`` entry.

    Notes
    -----
    Reads the notebook-level names ``df_train``, ``split_list``,
    ``numerical_cols`` and ``categorical_cols``.
    """
    # Work on a copy so popping the round count does not touch the input
    lgb_params = params.copy()
    n_rounds = lgb_params.pop('num_iterations')

    target = 'utility_agent1'
    feature_cols = numerical_cols + categorical_cols
    fold_rmse = []

    for train_index, val_index in split_list:
        fold_train = df_train.iloc[train_index]
        fold_val = df_train.iloc[val_index]

        # Training set for this fold
        train_data = lgb.Dataset(
            data=fold_train[feature_cols],
            label=fold_train[target],
            categorical_feature=categorical_cols,
            free_raw_data=True,
        )

        # Fit for the requested number of boosting rounds
        model = lgb.train(lgb_params, train_data, num_boost_round=n_rounds)

        # Predict the held-out fold and clip to the [-1, 1] interval
        y_pred = np.clip(model.predict(fold_val[feature_cols]), -1, 1)

        # Root mean squared error of the clipped predictions on this fold
        residual = y_pred - fold_val[target]
        fold_rmse.append(np.sqrt(np.mean(residual ** 2)))

    return np.mean(fold_rmse)


In [7]:
# LightGBM settings that stay constant across all trials
fixed_params = dict(
    objective="regression",
    metric='rmse',
    boosting_type='gbdt',
    learning_rate=0.05,
    bagging_freq=1,
    verbose=-1,
    seed=2112,
)

def objective(trial):
    """Optuna objective: sample a LightGBM configuration and score it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the tunable hyper-parameters.

    Returns
    -------
    float
        Value returned by ``train_and_score`` for the sampled parameters
        merged with ``fixed_params``.
    """
    # Draw the tunable hyper-parameters one after another
    sampled = {}
    sampled['num_leaves'] = trial.suggest_int('num_leaves', 31, 255)
    sampled['lambda_l1'] = trial.suggest_float('lambda_l1', 1e-8, 100.0, log=True)
    sampled['lambda_l2'] = trial.suggest_float('lambda_l2', 1e-8, 100.0, log=True)
    sampled['feature_fraction'] = trial.suggest_float('feature_fraction', 0.4, 1.0)
    sampled['bagging_fraction'] = trial.suggest_float('bagging_fraction', 0.6, 1.0)
    sampled['num_iterations'] = trial.suggest_int('num_iterations', 1000, 5000)
    sampled['min_data_in_leaf'] = trial.suggest_int('min_data_in_leaf', 100, 1000)
    sampled['min_data_per_group'] = trial.suggest_int('min_data_per_group', 100, 1000)
    sampled['max_cat_threshold'] = trial.suggest_int('max_cat_threshold', 32, 512)
    sampled['cat_l2'] = trial.suggest_float('cat_l2', 1e-8, 100.0, log=True)

    # Fixed settings are merged last, so they win on any key collision
    return train_and_score({**sampled, **fixed_params})

In [8]:
# Switch for the search itself: when False the study is only opened
# from its SQLite storage and no new trials are run
do_optimize = False
timeout = 48 * 3600  # passed to study.optimize as its timeout

# Open the study stored in the SQLite file, or create it if it is absent
study = optuna.create_study(
    storage='sqlite:///lightgbm.db',
    study_name="lightgbm.db",
    load_if_exists=True,
    direction='minimize',
)

if do_optimize:
    study.optimize(
        objective,
        timeout=timeout,
        n_trials=5000,
        gc_after_trial=True,
        n_jobs=1,
    )

[I 2024-11-01 15:13:04,246] Using an existing study with name 'lightgbm.db' instead of creating a new one.


In [9]:
# Twenty trials with the lowest objective value, best first
(
    study.trials_dataframe()
    .sort_values(by="value", ascending=True)
    .head(n=20)
)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_bagging_fraction,params_cat_l2,params_feature_fraction,params_lambda_l1,params_lambda_l2,params_max_cat_threshold,params_min_data_in_leaf,params_min_data_per_group,params_num_iterations,params_num_leaves,state
124,124,0.423844,2024-11-01 13:08:47.233570,2024-11-01 13:13:27.343697,0 days 00:04:40.110127,0.934703,7.462861,0.685297,6.201879e-06,6.872642e-06,118,264,975,3799,48,COMPLETE
123,123,0.424213,2024-11-01 13:03:33.751357,2024-11-01 13:08:47.189104,0 days 00:05:13.437747,0.937987,13.85693,0.733177,2.776588e-05,1.266775e-06,106,264,995,3766,56,COMPLETE
53,53,0.424406,2024-11-01 03:02:46.182428,2024-11-01 03:10:04.578199,0 days 00:07:18.395771,0.936027,1.387731e-07,0.727842,8.347988e-05,1.888569e-07,362,286,914,2332,146,COMPLETE
138,138,0.424493,2024-11-01 14:27:13.532986,2024-11-01 14:33:20.423936,0 days 00:06:06.890950,0.918853,5.933026,0.691617,2.010683e-07,8.822117e-06,122,344,973,3684,71,COMPLETE
74,74,0.424549,2024-11-01 05:45:28.355721,2024-11-01 05:58:16.083546,0 days 00:12:47.727825,0.996513,4.073827e-07,0.780628,0.000114435,6.710521e-07,271,248,446,4445,139,COMPLETE
135,135,0.424562,2024-11-01 14:08:52.176004,2024-11-01 14:15:34.458264,0 days 00:06:42.282260,0.940228,0.2908894,0.694536,3.600553e-07,2.449112e-06,126,382,978,3530,84,COMPLETE
51,51,0.424636,2024-11-01 02:47:56.478142,2024-11-01 02:55:34.820048,0 days 00:07:38.341906,0.934347,7.079988e-08,0.769929,0.0002667461,5.837647e-06,219,273,922,2398,148,COMPLETE
44,44,0.424866,2024-11-01 02:09:35.105494,2024-11-01 02:14:10.225718,0 days 00:04:35.120224,0.999468,9.666357e-07,0.76608,0.001980661,6.139017e-06,253,245,893,1402,150,COMPLETE
15,15,0.424904,2024-10-31 21:40:27.850409,2024-10-31 21:49:54.717441,0 days 00:09:26.867032,0.941103,2.956852e-07,0.754993,7.482252e-05,1.306976e-06,52,312,171,4306,105,COMPLETE
111,111,0.42503,2024-11-01 11:26:10.417750,2024-11-01 11:39:17.200865,0 days 00:13:06.783115,0.930807,2.136565e-06,0.807191,0.006082093,1.336967e-08,71,300,915,3750,167,COMPLETE


In [10]:
# Optuna's optimization-history figure for the study loaded above
plot_optimization_history(study)

In [11]:
# Optuna's hyper-parameter importance figure for the study
plot_param_importances(study)

In [12]:
# Optuna's slice figure (one panel per tuned parameter) for the study
plot_slice(study)


In [13]:
# Empirical distribution function of the study's objective values
plot_edf(study)


In [14]:
# Optuna's parallel-coordinate figure of the sampled parameters for the study
plot_parallel_coordinate(study)


In [15]:
# Full parameter set of the best trial: the constant settings first,
# then the tuned values from the study layered on top
best_params = {**fixed_params, **study.best_params}
best_params

{'objective': 'regression',
 'metric': 'rmse',
 'boosting_type': 'gbdt',
 'learning_rate': 0.05,
 'bagging_freq': 1,
 'verbose': -1,
 'seed': 2112,
 'num_leaves': 48,
 'lambda_l1': 6.201879103615396e-06,
 'lambda_l2': 6.872642347262283e-06,
 'feature_fraction': 0.6852973167416573,
 'bagging_fraction': 0.9347031181064782,
 'num_iterations': 3799,
 'min_data_in_leaf': 264,
 'min_data_per_group': 975,
 'max_cat_threshold': 118,
 'cat_l2': 7.46286126946506}

***