# Testing Optuna

| Date | User | Change Type | Remarks |  
| ---- | ---- | ----------- | ------- |
| 08/07/2025   | Martin | Create  | Notebook created to test optuna for hyperparameter search | 
| 09/07/2025   | Martin | Update  | Used Optuna to improve LightGBM and XGBoost models | 

# Content

* [Vanilla Optuna](#vanilla-optuna)
* [Optuna XGBoost](#optuna-xgboost)

In [1]:
%load_ext watermark

# Vanilla Optuna

In [1]:
import optuna
import lightgbm as lgb
import polars as pl
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr

In [2]:
# Load data
f = pl.read_csv("data/clean/pca_170.csv")
f.head()

pca_1,pca_2,pca_3,pca_4,pca_5,pca_6,pca_7,pca_8,pca_9,pca_10,pca_11,pca_12,pca_13,pca_14,pca_15,pca_16,pca_17,pca_18,pca_19,pca_20,pca_21,pca_22,pca_23,pca_24,pca_25,pca_26,pca_27,pca_28,pca_29,pca_30,pca_31,pca_32,pca_33,pca_34,pca_35,pca_36,pca_37,…,pca_136,pca_137,pca_138,pca_139,pca_140,pca_141,pca_142,pca_143,pca_144,pca_145,pca_146,pca_147,pca_148,pca_149,pca_150,pca_151,pca_152,pca_153,pca_154,pca_155,pca_156,pca_157,pca_158,pca_159,pca_160,pca_161,pca_162,pca_163,pca_164,pca_165,pca_166,pca_167,pca_168,pca_169,pca_170,timestamp,label
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64
-8.704309,5.45448,0.089441,-7.54084,4.768329,-5.736881,-0.231744,-1.788746,2.788266,-1.190636,3.148074,3.06826,6.837452,0.155804,-0.885875,-2.395842,3.13128,-4.266488,0.943026,0.60447,0.816785,-0.961067,-3.641479,-0.078829,-4.117839,4.6193,0.883802,-0.229885,0.600305,3.098696,-2.115549,1.951743,-0.103804,1.010359,1.52759,3.009872,0.625632,…,0.813653,1.265367,0.342502,1.045652,0.620349,-0.660641,0.201047,-0.643091,-0.105072,0.560754,-0.075368,1.723466,-0.617254,-1.380721,0.182322,-0.246119,-0.077405,0.11701,-1.024182,0.292968,-0.208723,-0.677946,0.70196,0.449714,-0.244457,-1.132325,0.164973,0.106318,0.376251,-0.74303,-0.336223,-1.084156,0.665739,-0.162322,0.858892,"""2023-03-01T00:00:00.000000000""",0.562539
-7.994975,5.527279,-0.093258,-4.78567,4.097361,-6.208308,0.004089,-0.355855,3.798541,0.215877,2.648574,3.351467,6.158131,0.254169,-3.347221,-2.473193,3.618805,-3.161974,3.224999,2.054594,-2.296999,0.672556,-1.778871,-1.52334,1.743752,4.333566,-0.902184,0.029043,0.229918,2.181485,0.296175,1.67046,-0.227987,1.403915,2.228776,2.469899,-0.592644,…,-0.004203,0.837101,-0.419395,1.042492,-0.153447,-0.506816,0.006238,-0.800141,0.721092,-0.757491,0.083177,1.365879,1.094788,0.487046,0.05352,-0.871274,0.149816,0.448931,0.054529,-0.014539,-0.008121,0.415441,-0.12583,0.734486,0.668983,-0.910026,0.093044,0.553648,1.33117,-0.631829,0.189037,-0.679587,-0.257418,0.150572,0.55328,"""2023-03-01T00:01:00.000000000""",0.533686
-8.26279,5.052642,-0.215007,-4.022985,3.920973,-5.534213,-0.192796,1.0202,3.404864,1.268972,2.083842,3.41862,5.318265,-0.448951,-3.372344,-2.696721,3.405499,-1.909767,3.035536,2.726676,-2.895715,0.675317,-2.672765,-1.462927,4.68873,3.224458,-1.63386,-0.420518,0.737464,1.673416,1.288468,1.626087,-0.205033,1.21317,2.613173,1.760396,-0.513233,…,0.739755,0.360366,0.232943,0.331362,0.253692,0.097951,-0.07237,-1.249928,-0.115931,-0.91212,-0.924596,2.581976,1.620534,0.208768,0.879783,-0.511462,0.159419,-0.455293,-0.267378,-0.10749,0.446583,0.192696,0.848871,0.237633,0.574078,-0.768855,0.269726,-0.37779,0.514883,-0.677345,-0.371701,-0.568106,0.161394,0.129997,0.53952,"""2023-03-01T00:02:00.000000000""",0.546505
-6.583707,7.511057,-1.011146,-5.043764,4.93578,-5.846506,0.945893,-1.987165,5.037395,-0.61052,3.062758,3.185566,6.651355,0.209246,-4.044023,-2.209972,2.891386,-3.855405,3.003505,0.468633,-1.329728,0.101761,-1.862309,-2.455378,2.926416,4.623982,-2.465403,-0.558694,1.26826,1.792994,1.559252,2.209627,-0.713931,1.75982,1.519249,3.20797,-0.787381,…,0.43262,0.456949,0.522051,1.032057,-0.9966,-1.544666,-0.467818,-0.923329,0.817991,-1.338488,-0.29068,1.344751,1.463578,-0.179989,0.981197,-0.598796,0.044524,0.330884,0.217893,-0.140025,-0.897465,0.557288,0.562866,0.19119,0.245437,-0.888977,0.450078,0.407653,0.481479,0.334516,-0.449422,-0.513091,0.035653,0.093354,0.440441,"""2023-03-01T00:03:00.000000000""",0.357703
-6.496762,7.126301,-0.582182,-5.279062,4.686467,-6.359052,0.802733,-0.713604,4.994975,0.220712,3.183836,2.947917,5.852149,-0.284797,-3.676434,-2.022073,2.813884,-2.542805,3.164651,2.014087,-1.716045,0.661284,-1.617872,-1.461303,2.399019,3.573622,-2.939042,-0.593127,1.061588,1.461763,1.08091,1.564789,-0.689344,1.887824,1.781988,2.113466,-1.470044,…,0.082776,0.211711,0.28985,1.328555,-0.475901,-1.322235,-0.234092,-0.81018,0.251531,-0.984722,-0.227,0.931093,0.977268,0.032714,0.881645,-1.053385,0.202635,0.38637,0.169513,-0.381335,-0.752441,0.279883,0.347725,0.548521,0.39888,-0.798777,0.355318,0.517521,0.384691,0.281629,-0.30855,-0.363938,-0.269014,0.295612,0.304394,"""2023-03-01T00:04:00.000000000""",0.362452


In [3]:
# Process data
df = f.drop('timestamp')
y = df['label'].to_numpy()
X = df.drop('label').to_numpy()

In [4]:
# Define Optuna hyperparameter search
def objective(trial):
  num_test = int(X.shape[0] * 0.2)
  train_X, train_y = X[:-num_test], y[:-num_test]
  valid_X, valid_y = X[-num_test:], y[-num_test:]

  dtrain = lgb.Dataset(train_X, label=train_y)

  params = {
    "objective": "regression",
    "metric": trial.suggest_categorical("metric", ['rmse', 'l1', 'l2']),
    "verbosity": -1,
    "boosting_type": "gbdt",
    "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
    "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
    "num_leaves": trial.suggest_int("num_leaves", 2, 256),
    "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
    "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
    "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
    "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
  }

  gbm = lgb.train(params, dtrain)
  preds = gbm.predict(valid_X)
  correlation = pearsonr(preds, valid_y).statistic
  return correlation

In [5]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2025-07-09 23:09:00,524] A new study created in memory with name: no-name-67e450d6-ea44-47ed-a1ee-855488ec1813
[I 2025-07-09 23:09:09,683] Trial 0 finished with value: 0.08413283199459322 and parameters: {'metric': 'l2', 'lambda_l1': 0.0002477034873880323, 'lambda_l2': 0.00018793525532632586, 'num_leaves': 197, 'feature_fraction': 0.7370702857559748, 'bagging_fraction': 0.5121288299353159, 'bagging_freq': 4, 'min_child_samples': 45}. Best is trial 0 with value: 0.08413283199459322.
[I 2025-07-09 23:09:14,652] Trial 1 finished with value: 0.0840769760896016 and parameters: {'metric': 'l1', 'lambda_l1': 0.0008557575405080566, 'lambda_l2': 2.6470771249965974, 'num_leaves': 172, 'feature_fraction': 0.5665313727728563, 'bagging_fraction': 0.7901199249805895, 'bagging_freq': 1, 'min_child_samples': 11}. Best is trial 0 with value: 0.08413283199459322.
[I 2025-07-09 23:09:20,600] Trial 2 finished with value: 0.07408543977002335 and parameters: {'metric': 'l2', 'lambda_l1': 1.22740626455299

Number of finished trials: 100
Best trial:
  Value: 0.10583593225687359
  Params: 
    metric: rmse
    lambda_l1: 7.828002649331311e-06
    lambda_l2: 2.497724573187604e-07
    num_leaves: 75
    feature_fraction: 0.45411258929581977
    bagging_fraction: 0.575905283009444
    bagging_freq: 4
    min_child_samples: 13


In [8]:
# Evaluation with optimal params
# train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.2)

# # Other splitting method
# num_test = int(X.shape[0] * 0.2)
# train_X, train_y = X[:-num_test], y[:-num_test]
# valid_X, valid_y = X[-num_test:], y[-num_test:]

dtrain = lgb.Dataset(X, label=y)

params = {
  "objective": "regression",
  "metric": "rmse",
  "verbosity": -1,
  "boosting_type": "gbdt",
  "lambda_l1": 7.828002649331311e-06,
  "lambda_l2": 2.497724573187604e-07,
  "num_leaves": 75,
  "feature_fraction": 0.45411258929581977,
  "bagging_fraction": 0.575905283009444,
  "bagging_freq": 4,
  "min_child_samples": 13,
}

gbm = lgb.train(params, dtrain)
# preds = gbm.predict(valid_X)
# correlation = pearsonr(preds, valid_y).statistic

In [7]:
correlation

0.10583593225687357

In [9]:
test = pl.read_csv('data/clean/test_pca_170.csv')

In [10]:
preds = gbm.predict(test)



In [13]:
submission = pl.DataFrame({
  'ID': range(1, len(preds)+1),
  'prediction': preds
})
submission.write_csv('results/optuna.csv')

---

# Optuna XGBoost

In [25]:
import xgboost as xgb

In [39]:
def objective(trial):
  num_test = int(X.shape[0] * 0.2)
  train_X, train_y = X[:-num_test], y[:-num_test]
  valid_X, valid_y = X[-num_test:], y[-num_test:]

  dtrain = xgb.DMatrix(train_X, label=train_y)
  dvalid = xgb.DMatrix(valid_X, label=valid_y)

  params = {
    "tree_method": "gpu_hist",
    "booster": "dart",
    "sampling_method": "gradient_based",
    "n_estimators": trial.suggest_int('n_estimators', 16, 256),
    "lambda": trial.suggest_uniform('lambda', 0.01, 1),
    'alpha': trial.suggest_uniform('alpha', 0.01, 1),
    'eta': trial.suggest_categorical('eta', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
    'gamma': trial.suggest_categorical('gamma', [18, 19, 20, 21, 22, 23, 24, 25]),
    'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
    'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
    'colsample_bynode': trial.suggest_categorical('colsample_bynode', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
    'n_estimators': trial.suggest_int('n_estimators', 400, 1000),
    'min_child_weight': trial.suggest_int('min_child_weight', 8, 600),  
    'max_depth': trial.suggest_categorical('max_depth', [3, 4, 5, 6, 7]),  
    'subsample': trial.suggest_categorical('subsample', [0.5,0.6,0.7,0.8,1.0]),
    'random_state': 42
  }

  model = xgb.train(params, dtrain)
  preds = model.predict(dvalid)
  correlation = pearsonr(preds, valid_y).statistic
  return correlation

In [40]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2025-07-09 22:20:20,644] A new study created in memory with name: no-name-330aadf2-47cc-41d8-9151-e3601c4b5000
  "lambda": trial.suggest_uniform('lambda', 0.01, 1),
  'alpha': trial.suggest_uniform('alpha', 0.01, 1),

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "n_estimators" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  return func(**kwargs)
[I 2025-07-09 22:20:21,764] Trial 0 finished with value: 0.02293350737754938 and parameters: {'n_estimators': 212, 'lambda': 0.8652467178859906, 'alpha': 0.8106355167640886, 'eta': 0.8, 'gamma': 21, 'learning_rate': 0.014, 'colsample_bytree': 0.6, 'colsample_bynode': 1.0, 'min_child_weight': 207, 'max_depth': 7, 'subsample': 0.7}. Best is trial 0 with value: 0.02293350737754938.

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "n_estimators" } are not used.

  b

Number of finished trials: 100
Best trial:
  Value: 0.06956173346966613
  Params: 
    n_estimators: 172
    lambda: 0.5833481728159042
    alpha: 0.8599063035373478
    eta: 0.7
    gamma: 20
    learning_rate: 0.008
    colsample_bytree: 0.8
    colsample_bynode: 0.4
    min_child_weight: 220
    max_depth: 6
    subsample: 0.7


In [2]:
%watermark

Last updated: 2025-07-08T21:35:04.500769+08:00

Python implementation: CPython
Python version       : 3.11.9
IPython version      : 9.4.0

Compiler    : Clang 13.0.0 (clang-1300.0.29.30)
OS          : Darwin
Release     : 24.5.0
Machine     : arm64
Processor   : arm
CPU cores   : 10
Architecture: 64bit

