In [31]:
import optuna
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.special import expit
from lightgbm import LGBMRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split, cross_validate
from perpetual import PerpetualBooster

In [32]:
# Let pandas display up to 1000 rows before it truncates a printed frame/series.
pd.set_option('display.max_rows', 1000)

In [33]:
# Report the version of the interpreter this kernel is actually running.
# A `!python --version` shell call resolves `python` through PATH, which may be
# a different interpreter than the one executing the notebook.
import platform

python_version = platform.python_version()
print(f"Python {python_version}")

Python 3.9.20


In [34]:
from importlib.metadata import version

# Record the installed version of every library the benchmark depends on,
# one "<distribution>: <version>" line per package.
for package in ("numpy", "optuna", "lightgbm", "scikit-learn", "perpetual"):
    print(f"{package}: {version(package)}")

numpy: 1.26.4
optuna: 4.0.0
lightgbm: 4.5.0
scikit-learn: 1.3.2
perpetual: 0.4.9


In [35]:
# Load seaborn's "diamonds" example dataset; `price` is used as the target below.
df = sns.load_dataset("diamonds")

In [36]:
# Column dtypes of the raw frame (numeric measurements plus three categoricals).
df.dtypes

carat       float64
cut        category
color      category
clarity    category
depth       float64
table       float64
price         int64
x           float64
y           float64
z           float64
dtype: object

In [37]:
# Target: the diamond price.
y = df["price"]
# Features: everything except the target and the dataset's own "y" column.
# NOTE(review): the dropped "y" is a float64 feature column of the diamonds data
# (see df.dtypes), not the target variable defined above — confirm that excluding
# it from the features is intended.
X = df.drop(columns=["price", "y"])

In [38]:
# Feature matrix dimensions as (rows, columns).
X.shape

(53940, 8)

In [39]:
# Dtypes of the remaining feature columns.
X.dtypes

carat       float64
cut        category
color      category
clarity    category
depth       float64
table       float64
x           float64
z           float64
dtype: object

In [40]:
# Number of distinct values in each feature column.
X.nunique()

carat      273
cut          5
color        7
clarity      8
depth      184
table      127
x          554
z          375
dtype: int64

In [41]:
# First rows of the feature matrix.
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,z
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,2.43
1,0.21,Premium,E,SI1,59.8,61.0,3.89,2.31
2,0.23,Good,E,VS1,56.9,65.0,4.05,2.31
3,0.29,Premium,I,VS2,62.4,58.0,4.2,2.63
4,0.31,Good,J,SI2,63.3,58.0,4.34,2.75


In [42]:
cols = ['cut', 'color', 'clarity']
# Make sure the categorical features carry pandas' category dtype.
# (They already do according to X.dtypes, so this only makes the requirement explicit.)
X = X.astype({name: 'category' for name in cols})
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,z
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,2.43
1,0.21,Premium,E,SI1,59.8,61.0,3.89,2.31
2,0.23,Good,E,VS1,56.9,65.0,4.05,2.31
3,0.29,Premium,I,VS2,62.4,58.0,4.2,2.63
4,0.31,Good,J,SI2,63.3,58.0,4.34,2.75


In [43]:
# Shared run configuration.
seed = 42  # used for the train/test split, the Optuna sampler and LightGBM's 'seed'
n_estimators = 100  # 'n_estimators' passed to the LightGBM model in objective_function
# Number of Optuna trials passed to study.optimize.
# NOTE(review): with a single trial only one sampled configuration is evaluated,
# so the "best" trial is simply that first draw.
n_trials = 1

In [44]:
# Task-specific settings for this notebook.
scoring = "neg_root_mean_squared_error"  # scorer name given to cross_validate; its scores are negated below
metric_function = root_mean_squared_error  # metric applied to predictions in the evaluation cells
metric_name = "rmse"  # label in printed results; the value "log_loss" switches evaluation to predict_proba
LGBMBooster = LGBMRegressor  # estimator class instantiated inside objective_function
objective = "SquaredLoss"  # objective passed to PerpetualBooster

In [45]:
# Hold out 20% of the rows as a test set; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
)

for label, frame in (("X_train", X_train), ("X_test", X_test)):
    print(f"{label}.shape: {frame.shape}")

X_train.shape: (43152, 8)
X_test.shape: (10788, 8)


In [46]:
# First rows of the training features (row labels are the original index values).
X_train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,z
26546,2.01,Good,F,SI2,58.1,64.0,8.23,4.77
9159,1.01,Very Good,E,SI2,60.0,60.0,6.57,3.92
14131,1.1,Premium,H,VS2,62.5,58.0,6.59,4.1
15757,1.5,Good,E,SI2,61.5,65.0,7.21,4.42
24632,1.52,Very Good,G,VS1,62.1,57.0,7.27,4.53


In [47]:
# Module-level hand-off between the objective and the study callback:
# `cv_results` holds the cross-validation output of the most recent trial
# (objective_function writes it), `best_cv_results` the one kept for the best trial.
best_cv_results = None
cv_results = None

def save_best_cv_results(study, trial):
    """Optuna callback that keeps the CV results of the study's best trial.

    Args:
        study: the study being optimized; only ``study.best_trial.number`` is read.
        trial: the trial that just finished; only ``trial.number`` is read.

    Side effects:
        Rebinds the global ``best_cv_results`` to the current global ``cv_results``
        when ``trial`` is the study's best trial; otherwise does nothing.
    """
    global best_cv_results
    finished_trial_is_best = trial.number == study.best_trial.number
    if not finished_trial_is_best:
        return
    best_cv_results = cv_results

In [48]:
def objective_function(trial):
    """Optuna objective: mean cross-validated error of one LightGBM configuration.

    Draws a hyperparameter set from ``trial``, cross-validates the resulting
    ``LGBMBooster`` on the training split with 5 folds, and publishes the full
    ``cross_validate`` output through the global ``cv_results`` so the study
    callback can keep the fitted estimators of the best trial.

    Args:
        trial: the Optuna trial used to sample hyperparameter values.

    Returns:
        The negated mean of the fold test scores. ``scoring`` produces negated
        errors, so the returned value is a positive error to be minimized.
    """
    global cv_results

    # Settings that stay constant across trials.
    fixed_params = {
        'seed': seed,
        'verbosity': -1,
        'n_estimators': n_estimators,
    }
    # Tuned settings. The suggest_* calls are kept in this exact order so the
    # seeded sampler produces the same draws.
    tuned_params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.5, log=True),
        'min_split_gain': trial.suggest_float('min_split_gain', 1e-6, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-6, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-6, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
        'subsample': trial.suggest_float('subsample', 0.2, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 10),
        'max_depth': trial.suggest_int('max_depth', 3, 33),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
    }
    model = LGBMBooster(**fixed_params, **tuned_params)

    # Keep train scores and fitted estimators: later cells report both.
    cv_results = cross_validate(
        model,
        X_train,
        y_train,
        cv=5,
        scoring=scoring,
        return_train_score=True,
        return_estimator=True,
    )
    mean_test_score = np.mean(cv_results['test_score'])
    return -1 * mean_test_score

In [49]:
# Seed the TPE sampler so the suggested hyperparameters are repeatable.
sampler = optuna.samplers.TPESampler(seed=seed)
# objective_function returns a positive error, so the study minimizes it.
study = optuna.create_study(direction='minimize', sampler=sampler)

[I 2024-10-08 18:52:38,249] A new study created in memory with name: no-name-e0817de5-0a6e-479e-b19f-175b92e23e3c


In [50]:
%%time
# Run the search; after every trial the callback keeps the CV results of the best trial.
study.optimize(objective_function, n_trials=n_trials, callbacks=[save_best_cv_results])

[I 2024-10-08 18:52:49,076] Trial 0 finished with value: 2000.9810667524173 and parameters: {'learning_rate': 0.010253509690168494, 'min_split_gain': 0.5061576888752307, 'reg_alpha': 0.02465832945854911, 'reg_lambda': 0.003907967156822882, 'colsample_bytree': 0.3248149123539492, 'subsample': 0.32479561626896214, 'subsample_freq': 1, 'max_depth': 29, 'num_leaves': 616, 'min_child_samples': 71}. Best is trial 0 with value: 2000.9810667524173.


CPU times: total: 13.6 s
Wall time: 10.8 s


In [51]:
# Summarize the study: trial count, then number, value and parameters of the best trial.
best = study.best_trial

print(f"Number of finished trials: {len(study.trials)}")
print("Best trial:")
print(f"  Number: {best.number}")
print(f"  Value: {best.value}")
print("  Params: ")
for name, setting in best.params.items():
    print(f"    {name}: {setting}")

Number of finished trials: 1
Best trial:
  Number: 0
  Value: 2000.9810667524173
  Params: 
    learning_rate: 0.010253509690168494
    min_split_gain: 0.5061576888752307
    reg_alpha: 0.02465832945854911
    reg_lambda: 0.003907967156822882
    colsample_bytree: 0.3248149123539492
    subsample: 0.32479561626896214
    subsample_freq: 1
    max_depth: 29
    num_leaves: 616
    min_child_samples: 71


In [52]:
# Per-fold and mean CV errors of the best trial. The scorer returns negated
# errors, so the sign is flipped back before printing.
for split in ("train", "test"):
    fold_errors = -1 * best_cv_results[f"{split}_score"]
    print(f"CV {split} scores: {fold_errors}")
    print(f"CV {split} scores average : {round(np.mean(fold_errors), 6)}")

CV train scores: [2001.35132941 1985.85304801 1991.68952646 1994.88802646 1995.95054575]
CV train scores average : 1993.946495
CV test scores: [1946.69602094 2027.02009529 2008.70797558 2009.34186745 2013.1393745 ]
CV test scores average : 2000.981067


In [53]:
# Fitted estimators of the best trial, one per cross-validation fold.
models = best_cv_results["estimator"]

In [54]:
# Score each fold model on the complete training split.
for fold, fold_model in enumerate(models):
    if metric_name == "log_loss":
        y_pred = fold_model.predict_proba(X_train)
    else:
        y_pred = fold_model.predict(X_train)
    fold_score = round(metric_function(y_train, y_pred), 6)
    print(f"Model {fold}, train {metric_name}: {fold_score}")

Model 0, train rmse: 1990.539574
Model 1, train rmse: 1994.155022
Model 2, train rmse: 1995.104672
Model 3, train rmse: 1997.787026
Model 4, train rmse: 1999.399974


In [55]:
# Score each fold model on the held-out test split.
for fold, fold_model in enumerate(models):
    if metric_name == "log_loss":
        y_pred = fold_model.predict_proba(X_test)
    else:
        y_pred = fold_model.predict(X_test)
    fold_score = round(metric_function(y_test, y_pred), 6)
    print(f"Model {fold}, test {metric_name}: {fold_score}")

Model 0, test rmse: 1982.119395
Model 1, test rmse: 1984.355572
Model 2, test rmse: 1985.41774
Model 3, test rmse: 1988.498437
Model 4, test rmse: 1989.999609


In [56]:
# Average the fold models' predictions on the training split and score the average.
use_proba = metric_name == "log_loss"
fold_predictions = [
    m.predict_proba(X_train) if use_proba else m.predict(X_train)
    for m in models
]
y_pred = np.mean(fold_predictions, axis=0)
print(f"Train {metric_name}: {round(metric_function(y_train, y_pred), 6)}")

Train rmse: 1995.166118


In [57]:
# Average the fold models' predictions on the test split and score the average.
use_proba = metric_name == "log_loss"
fold_predictions = [
    m.predict_proba(X_test) if use_proba else m.predict(X_test)
    for m in models
]
y_pred = np.mean(fold_predictions, axis=0)
print(f"Test {metric_name}: {round(metric_function(y_test, y_pred), 6)}")

Test rmse: 1985.84433


In [58]:
# Fit a PerpetualBooster on the training split with the configured objective.
# NOTE(review): the recorded output of this cell ends with
# "PanicException: called `Option::unwrap()` on a `None` value", i.e. `fit` did not
# complete in the saved run, so the cells below operate on a model that was never
# fitted. The cause is not visible here — TODO investigate (possibly tied to the
# pinned perpetual 0.4.9 or to the categorical columns; unconfirmed).
model = PerpetualBooster(objective=objective)
model.fit(X_train, y_train, budget=1.0)

Categorical features: [1, 2, 3]
Mapping of categories: {'cut': ['nan', 'Fair', 'Good', 'Ideal', 'Premium', 'Very Good'], 'color': ['nan', 'D', 'E', 'F', 'G', 'H', 'I', 'J'], 'clarity': ['nan', 'I1', 'IF', 'SI1', 'SI2', 'VS1', 'VS2', 'VVS1', 'VVS2']}


PanicException: called `Option::unwrap()` on a `None` value

In [None]:
# Evaluate the PerpetualBooster on the held-out test split.
raw_pred = model.predict(X_test)
if objective == "SquaredLoss":
    # Regression: predictions are already on the target scale. Passing them
    # through a sigmoid and rounding would collapse every predicted price to
    # 0 or 1 and make the reported error meaningless.
    y_pred = raw_pred
elif metric_name == "log_loss":
    # The raw output is mapped to a probability with the sigmoid.
    y_pred = expit(raw_pred)
else:
    # Label-based metric: threshold the probability at 0.5.
    y_pred = np.round(expit(raw_pred))
print(f"Test {metric_name}: {round(metric_function(y_test, y_pred), 6)}")

In [33]:
# Export the PerpetualBooster's trees as a table for inspection.
# NOTE(review): requires `model` to have been fitted successfully beforehand.
df_trees = model.trees_to_dataframe()

In [None]:
# Preview the first 10 rows of the exported tree table.
df_trees.head(10)

In [59]:
# For comparison: tree table of the LightGBM model at index 4 (the last of the
# five fold models), limited to its first 100 rows.
models[4].booster_.trees_to_dataframe().head(100)

Unnamed: 0,tree_index,node_depth,node_index,left_child,right_child,parent_index,split_feature,split_gain,threshold,decision_type,missing_direction,missing_type,value,weight,count
0,0,1,0-S0,0-S2,0-S1,,x,105611000000.0,6.315,<=,left,,3933.27,0,11214
1,0,2,0-S2,0-S6,0-S5,0-S0,x,6684040000.0,5.515,<=,left,,3910.19,7332,7332
2,0,3,0-S6,0-S19,0-S25,0-S2,x,892303000.0,4.985,<=,left,,3903.84,5158,5158
3,0,4,0-S19,0-S36,0-S49,0-S6,x,53830100.0,4.625,<=,left,,3901.08,3641,3641
4,0,5,0-S36,0-S71,0-S56,0-S19,x,11867700.0,4.275,<=,left,,3900.21,2446,2446
5,0,6,0-S71,0-S100,0-S99,0-S36,x,902828.0,4.145,<=,left,,3898.74,469,469
6,0,7,0-S100,0-L0,0-L101,0-S71,x,38865.2,3.995,<=,left,,3898.26,221,221
7,0,8,0-L0,,,0-S100,,,,,,,3898.027548,96,96
8,0,8,0-L101,,,0-S100,,,,,,,3898.432303,125,125
9,0,7,0-S99,0-L72,0-L100,0-S71,x,39383.8,4.245,<=,left,,3899.18,248,248
