In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import json
import pickle

import lightgbm as lgb
print("Lightgbm version:", lgb.__version__)

import optuna
print("Optuna version:", optuna.__version__)

# local modules
import sys
sys.path.append("../src")
from preproc import process_train_data

***
### load and preprocess data

In [None]:
# Directory layout, relative to the notebook's working directory
path_raw, path_processed, path_results = (
    Path("../data/raw"),
    Path("../data/processed"),
    Path("../data/results"),
)

# Raw train/test tables (the two reads are independent of each other)
df_test = pd.read_csv(path_raw / "test.csv")
df_train = pd.read_csv(path_raw / "train.csv")

# Show the raw training frame as the cell output
df_train

In [None]:
# Options forwarded to process_train_data. The optional numerical_cols /
# categorical_cols arguments are intentionally not passed here.
preproc_options = {
    "scale": False,
    "include_position_features": False,
    "include_text_features": False,
}

# NOTE: this rebinds df_train to the processed frame, so re-running this cell
# without first re-running the load cell feeds already-processed data back in.
(
    df_train,
    numerical_cols,
    categorical_cols,
    encoder,
    scaler,
) = process_train_data(df_train, **preproc_options)

# Report how many columns of each kind were returned
print("Numerical Columns:", len(numerical_cols))
print("Categorical Columns:", len(categorical_cols))

***
### optimization



In [4]:
# Load the precomputed cross-validation splits; they are consumed downstream
# as a sequence of (train_index, val_index) pairs.
# NOTE(security): unpickling can execute arbitrary code - only load split
# files that come from a trusted source.
with open('../data/splits/cv2_Game.pkl', 'rb') as splits_file:
    # The context manager closes the handle, which the bare open() call never did
    split_list = pickle.load(splits_file)

In [5]:
def train_and_score(numerical_cols, categorical_cols):
    """Cross-validate a fixed-parameter LightGBM regressor on a feature subset.

    For every (train, validation) index pair in the module-level ``split_list``
    a model is fit on rows of the module-level ``df_train`` and scored on the
    held-out rows. Predictions are clipped to [-1, 1] before the RMSE is taken.

    Args:
        numerical_cols: list of numerical feature column names to use.
        categorical_cols: list of column names handed to LightGBM as
            categorical features.

    Returns:
        Mean of the per-fold validation RMSEs.
    """
    target = 'utility_agent1'
    features = numerical_cols + categorical_cols

    # Fixed hyper-parameters: only the feature subset varies between calls
    lgb_params = {
        'objective': "regression",
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'max_depth': 10,
        'learning_rate': 0.1,
        'reg_alpha': 1e-1,
        'reg_lambda': 1e-1,
        'feature_fraction': 0.8,
        'verbose': -1,
        'seed': 2112,
    }

    def _as_dataset(frame):
        # Wrap one fold's rows as a LightGBM Dataset restricted to `features`
        return lgb.Dataset(
            data=frame[features],
            label=frame[target],
            categorical_feature=categorical_cols,
            free_raw_data=True,
        )

    fold_rmses = []
    for train_index, val_index in split_list:
        fold_train = df_train.iloc[train_index]
        fold_val = df_train.iloc[val_index]

        # No early-stopping callback is passed, so all 1000 rounds are trained
        booster = lgb.train(
            lgb_params,
            _as_dataset(fold_train),
            num_boost_round=1000,
            valid_sets=[_as_dataset(fold_val)],
        )

        # Clip predictions to [-1, 1] before scoring
        clipped = np.clip(booster.predict(fold_val[features]), -1, 1)

        # RMSE on the held-out fold (the target is used as-is here)
        squared_error = (clipped - fold_val[target]) ** 2
        fold_rmses.append(np.sqrt(np.mean(squared_error)))

    return np.mean(fold_rmses)


In [6]:
def objective(trial):
    """Optuna objective: choose a feature subset and return its CV score.

    One binary ``use_<col>`` integer parameter is suggested for every column in
    the module-level ``numerical_cols`` and then ``categorical_cols``; a value
    of 1 keeps the column.

    Args:
        trial: object exposing ``suggest_int(name, low, high)`` (an Optuna trial).

    Returns:
        The value returned by ``train_and_score`` for the chosen columns, or
        ``1.`` when no column was chosen at all.
    """

    def _chosen(columns):
        # Query the trial for every column, in order, keeping those flagged 1
        kept = []
        for col in columns:
            if trial.suggest_int(f'use_{col}', 0, 1) == 1:
                kept.append(col)
        return kept

    selected_numerical = _chosen(numerical_cols)
    selected_categorical = _chosen(categorical_cols)

    # Nothing to train on with an empty subset: report a fixed score instead
    if not selected_numerical and not selected_categorical:
        return 1.

    return train_and_score(selected_numerical, selected_categorical)

In [None]:
# Set do_optimize to False to only attach to the stored study without
# running new trials (the cells below then read the existing results).
do_optimize = True
# Wall-clock budget in seconds (72 h). It is applied to each optimize() call
# below, so every phase stops at n_trials or at this limit, whichever is first.
timeout = 3600 * 72

# Start with QMC for good coverage
study = optuna.create_study(
    study_name="optuna_lgbm",
    direction='minimize',
    storage='sqlite:///optuna_lgbm.db',
    load_if_exists=True,
    sampler=optuna.samplers.QMCSampler()
)

if do_optimize:
    # Previously `timeout` was defined but never passed, so it had no effect
    study.optimize(objective, n_trials=1000, timeout=timeout)

    # Switch to TPE for refined search; re-opening the same study name and
    # storage lets TPE build on the trials recorded by the QMC phase.
    study = optuna.create_study(
        study_name="optuna_lgbm",
        direction='minimize',
        storage='sqlite:///optuna_lgbm.db',
        load_if_exists=True,
        sampler=optuna.samplers.TPESampler(
            n_startup_trials=1,    # Minimal random warm-up: the stored QMC trials already provide history
            n_ei_candidates=100,   # Consider more candidates
            multivariate=True,     # Enable multivariate sampling
            constant_liar=True     # Help with parallel optimization
        )
    )
    study.optimize(objective, n_trials=2000, timeout=timeout)

In [None]:
# Top 20 trials, lowest objective value first
(
    study.trials_dataframe()
    .sort_values(by="value")
    .head(20)
)

In [None]:
# Feature-inclusion flags (use_<col> -> 0/1) recorded by the best trial
best_params = study.best_trial.params


def _is_selected(col):
    """Return True when the best trial set this column's use_ flag to 1."""
    return best_params.get(f'use_{col}', 0) == 1


# Keep the original column order within each group, numerical columns first
selected_numerical = list(filter(_is_selected, numerical_cols))
selected_categorical = list(filter(_is_selected, categorical_cols))
selected_features = selected_numerical + selected_categorical

# Summary of how many columns survived the selection
total_cols = len(numerical_cols) + len(categorical_cols)
print("Selected features:")
print("len(numerical): ", len(selected_numerical), "/", len(numerical_cols))
print("len(categorical): ", len(selected_categorical), "/", len(categorical_cols))
print("len(selected_features): ", len(selected_features), "/", total_cols)

# Persist the selection as JSON in the current working directory
selected_features_dict = {
    "numerical": selected_numerical,
    "categorical": selected_categorical
}
serialized = json.dumps(selected_features_dict, indent=2)
with open('select_optuna_lgbm.json', 'w') as f:
    f.write(serialized)

print("Selected features have been saved to 'select_optuna_lgbm.json'")

***