In [1]:
import numpy as np
from sklearn import datasets
import joplen as jp
from enums import *
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyRegressor
from itertools import product
from ax import optimize
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from data_tools import WhiteWineQuality
from pathlib import Path
from chem_data import *
from copy import copy, deepcopy
import yaml
import time
from pprint import pprint

# Directory for cached run artifacts; created eagerly so later cells can write to it.
CACHE_DIR = Path("cache", "runs")
CACHE_DIR.mkdir(exist_ok=True, parents=True)

# Root path handed to every dataset class constructor.
DS_PATH = Path(".")


In [2]:
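# These module aliases are required by the `match` statement in optimize_model:
# a dotted name such as `lgbm.LGBMRegressor` is a value pattern (compared with ==),
# whereas a bare name such as `LGBMRegressor` would be a capture pattern that matches anything.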
import lightgbm as lgbm
import sklearn.ensemble as ske
import xgboost as xgb


In [3]:
def ttv_split(x, y, seed, indent=False):
    """Split ``x``/``y`` into train, validation and test subsets.

    The data is first split 80/20 into (train+val)/test, then the 80% part is
    split 75/25 into train/val, giving roughly 60/20/20 overall.

    Parameters
    ----------
    x, y : array-like
        Features and targets; ``x`` must expose ``.shape`` with two dimensions.
    seed : int
        Seed for the shuffling performed by both splits.
    indent : bool, optional
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    tuple
        ``(x_train, x_val, x_test, y_train, y_val, y_test, res)`` where ``res``
        is a dict with the per-subset ``"fraction"`` and ``"size"`` lists
        (train, val, test order) and the ``"features"`` count.
    """
    # Use a private generator instead of np.random.seed(seed): seeding the
    # global RNG silently changes the random stream of every other cell.
    # RandomState(seed) is seeded the same way as the global legacy generator,
    # so the two consecutive splits should draw the same permutations as before.
    rng = np.random.RandomState(seed)

    # Split data into train validation and test sets
    x_train, x_test, y_train, y_test = train_test_split(
        x,
        y,
        test_size=0.2,
        shuffle=True,
        random_state=rng,
    )
    x_train, x_val, y_train, y_val = train_test_split(
        x_train,
        y_train,
        test_size=0.25,
        shuffle=True,
        random_state=rng,
    )

    subsets = [x_train, x_val, x_test]
    res = {
        "fraction": [float(part.shape[0] / x.shape[0]) for part in subsets],
        "size": [int(part.shape[0]) for part in subsets],
        "features": int(x_train.shape[1]),
    }

    pprint(res)

    return x_train, x_val, x_test, y_train, y_val, y_test, res


In [4]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    The ``squared=False`` argument of ``mean_squared_error`` is deprecated in
    recent scikit-learn releases and later removed, so the root is taken here
    instead. For multi-output targets the root is taken per output and the
    results are averaged uniformly, matching what ``squared=False`` computed.
    """
    per_output_mse = mean_squared_error(y_true, y_pred, multioutput="raw_values")
    return np.sqrt(per_output_mse).mean()


In [5]:
def dummy_regressor(x_train, x_val, x_test, y_train, y_val, y_test, indent=False):
    """Fit a mean-predicting baseline on the training set and score it on the test set.

    ``x_val``, ``y_val`` and ``indent`` are accepted but not used. The result
    dict (``"model_name"`` and test ``"rmse"``) is pretty-printed and returned.
    """
    baseline = DummyRegressor(strategy="mean")
    baseline.fit(x_train, y_train)

    test_pred = baseline.predict(x_test)
    test_rmse = float(rmse(y_test, test_pred))

    res = dict(model_name=type(baseline).__name__, rmse=test_rmse)
    pprint(res)
    return res


In [6]:
# Ax search space for ExtraTreesRegressor: four tuned ranges plus fixed settings.
et_params = [
    dict(name="max_leaf_nodes", type="range", bounds=[2, 32], value_type="int"),
    dict(
        name="n_estimators",
        type="range",
        bounds=[10, 1000],
        value_type="int",
        log_scale=True,
    ),
    dict(name="min_samples_leaf", type="range", bounds=[1, 32], value_type="int"),
    dict(
        name="min_impurity_decrease",
        type="range",
        bounds=[0.0, 1.0],
        value_type="float",
    ),
    # Fixed (non-searched) settings.
    dict(name="random_state", type="fixed", value=0, value_type="int"),
    dict(name="criterion", type="fixed", value="squared_error", value_type="str"),
    dict(name="max_features", type="fixed", value=1, value_type="int"),
]

# Ax search space for LGBMRegressor: six tuned ranges plus fixed settings.
lgbm_params = [
    dict(name="num_leaves", type="range", bounds=[2, 32], value_type="int"),
    dict(
        name="n_estimators",
        type="range",
        bounds=[10, 1000],
        value_type="int",
        log_scale=True,
    ),
    # NOTE(review): spans four orders of magnitude on a linear scale, unlike the
    # log-scaled ranges below - confirm this is intended.
    dict(name="learning_rate", type="range", bounds=[1e-5, 1e-1], value_type="float"),
    dict(name="min_split_gain", type="range", bounds=[0.0, 1.0], value_type="float"),
    dict(
        name="reg_alpha",
        type="range",
        bounds=[1e-5, 1e1],
        value_type="float",
        log_scale=True,
    ),
    dict(
        name="reg_lambda",
        type="range",
        bounds=[1e-5, 1e1],
        value_type="float",
        log_scale=True,
    ),
    # Fixed (non-searched) settings.
    dict(name="random_state", type="fixed", value=0, value_type="int"),
    dict(name="verbose", type="fixed", value=-1, value_type="int"),
]

# Ax search space for XGBRegressor: tuned ranges/choices plus fixed settings.
xgb_params = [
    dict(name="max_leaves", type="range", bounds=[2, 32], value_type="int"),
    dict(
        name="n_estimators",
        type="range",
        bounds=[10, 1000],
        value_type="int",
        log_scale=True,
    ),
    # NOTE(review): spans four orders of magnitude on a linear scale, unlike the
    # log-scaled `gamma` range below - confirm this is intended.
    dict(name="learning_rate", type="range", bounds=[1e-5, 1e-1], value_type="float"),
    dict(
        name="grow_policy",
        type="choice",
        values=["depthwise", "lossguide"],
        value_type="str",
    ),
    dict(name="booster", type="choice", values=["gbtree", "dart"], value_type="str"),
    dict(
        name="gamma",
        type="range",
        bounds=[1e-5, 1e1],
        value_type="float",
        log_scale=True,
    ),
    # Fixed (non-searched) settings.
    dict(name="random_state", type="fixed", value=0, value_type="int"),
    dict(name="tree_method", type="fixed", value="approx", value_type="str"),
]
# Ax search space for JOPLEn. Object-valued settings (partitioner, cell model,
# norm type) are stored as strings and turned back into objects in train_joplen.
joplen_params = [
    dict(name="n_cells", type="range", bounds=[2, 32], value_type="int"),
    dict(
        name="n_partitions",
        type="range",
        bounds=[10, 1000],
        value_type="int",
        log_scale=True,
    ),
    dict(name="lam", type="range", bounds=[0.0, 2.0], value_type="float"),
    dict(
        name="mu",
        type="range",
        bounds=[1e-5, 1e-1],
        value_type="float",
        log_scale=True,
    ),
    dict(
        name="alpha",
        type="range",
        bounds=[1e-5, 1e1],
        value_type="float",
        log_scale=True,
    ),
    # Fixed (non-searched) settings.
    dict(name="random_state", type="fixed", value=0, value_type="int"),
    dict(name="partitioner", type="fixed", value="jp.VPartition", value_type="str"),
    dict(name="cell_model", type="fixed", value="CellModel.linear", value_type="str"),
    dict(name="max_iters", type="fixed", value=10000, value_type="int"),
    dict(name="norm_type", type="fixed", value="NormType.L21", value_type="str"),
    dict(name="verbose", type="fixed", value=False, value_type="bool"),
]


In [7]:
# training functions


def timer_decorator(func):
    """Wrap ``func`` so that each call also reports its timing.

    The wrapped callable returns ``(result, start_time, end_time, elapsed_time)``
    where ``result`` is whatever ``func`` returned, ``start_time``/``end_time``
    are wall-clock timestamps from ``time.time()`` and ``elapsed_time`` is the
    call duration in seconds.
    """
    # Local import keeps this cell self-contained.
    from functools import wraps

    @wraps(func)  # keep the wrapped function's __name__/__doc__ for logs and debugging
    def wrapper(*args, **kwargs):
        start_time = time.time()
        # Measure the duration with a monotonic clock: time.time() can jump when
        # the system clock is adjusted, which would corrupt the elapsed value.
        tick = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed_time = time.perf_counter() - tick
        end_time = time.time()

        return (result, start_time, end_time, elapsed_time)

    return wrapper


@timer_decorator
def train_er(
    params,
    x_train,
    y_train,
    x_val,
    y_val,
    x_test=None,
    y_test=None,
):
    """Fit an ExtraTreesRegressor with ``params`` and score it by RMSE.

    Targets are flattened before fitting and scoring. Returns
    ``(val_error, model)``, or ``(val_error, test_error, model)`` when both
    ``x_test`` and ``y_test`` are given. Because of ``timer_decorator`` the
    caller receives that value as the first item of a timing tuple.
    """
    model = ExtraTreesRegressor(**params)
    model.fit(x_train, y_train.flatten())

    val_truth = y_val.flatten()
    val_error = float(rmse(val_truth, model.predict(x_val)))

    # Without a complete test set only the validation score is reported.
    if x_test is None or y_test is None:
        return val_error, model

    test_truth = y_test.flatten()
    test_error = float(rmse(test_truth, model.predict(x_test)))
    return val_error, test_error, model


@timer_decorator
def train_lgbm(params, x_train, y_train, x_val, y_val, x_test=None, y_test=None):
    """Fit an LGBMRegressor with ``params`` and score it by RMSE.

    Targets are flattened before fitting and scoring. Returns
    ``(val_error, model)``, or ``(val_error, test_error, model)`` when both
    ``x_test`` and ``y_test`` are given. Because of ``timer_decorator`` the
    caller receives that value as the first item of a timing tuple.
    """
    model = LGBMRegressor(**params)
    # TODO: Need to re-enable validation set, i.e. pass
    #   eval_set=[(x_val, y_val.flatten())], verbose=-1, callbacks=[]
    model.fit(x_train, y_train.flatten())

    val_truth = y_val.flatten()
    val_error = float(rmse(val_truth, model.predict(x_val)))

    # Without a complete test set only the validation score is reported.
    if x_test is None or y_test is None:
        return val_error, model

    test_truth = y_test.flatten()
    test_error = float(rmse(test_truth, model.predict(x_test)))
    return val_error, test_error, model


@timer_decorator
def train_xgboost(params, x_train, y_train, x_val, y_val, x_test=None, y_test=None):
    """Fit an XGBRegressor with ``params`` and score it by RMSE.

    Targets are flattened before fitting and scoring. Returns
    ``(val_error, model)``, or ``(val_error, test_error, model)`` when both
    ``x_test`` and ``y_test`` are given. Because of ``timer_decorator`` the
    caller receives that value as the first item of a timing tuple.
    """
    model = XGBRegressor(**params)
    # TODO: Need to re-enable validation set, i.e. pass
    #   eval_set=[(x_val, y_val.flatten())]
    model.fit(x_train, y_train.flatten())

    val_truth = y_val.flatten()
    val_error = float(rmse(val_truth, model.predict(x_val)))

    # Without a complete test set only the validation score is reported.
    if x_test is None or y_test is None:
        return val_error, model

    test_truth = y_test.flatten()
    test_error = float(rmse(test_truth, model.predict(x_test)))
    return val_error, test_error, model


@timer_decorator
def train_joplen(
    params,
    x_train,
    y_train,
    x_val,
    y_val,
    x_test=None,
    y_test=None,
):
    """Fit a JOPLEn model with ``params`` and score it by RMSE.

    ``params`` supplies both constructor settings and fit settings; the
    ``partitioner``, ``cell_model`` and ``norm_type`` entries are strings that
    are evaluated into objects. The validation set is also handed to ``fit``.
    Returns ``(val_error, model)``, or ``(val_error, test_error, model)`` when
    both ``x_test`` and ``y_test`` are given. Because of ``timer_decorator`` the
    caller receives that value as the first item of a timing tuple.
    """
    # NOTE(review): eval() executes whatever string is in `params`. That is fine
    # for the fixed values declared in this notebook, but must not be fed
    # parameters from an untrusted source (e.g. a hand-edited experiment file).
    ctor_kwargs = {
        "partitioner": eval(params["partitioner"]),
        "n_cells": params["n_cells"],
        "n_partitions": params["n_partitions"],
        "random_state": params["random_state"],
        "cell_model": eval(params["cell_model"]),
    }
    model = jp.JOPLEn(**ctor_kwargs)

    fit_kwargs = {
        "val_x": x_val,
        "val_y": y_val,
        "max_iters": params["max_iters"],
        "norm_type": eval(params["norm_type"]),
        "verbose": params["verbose"],
        "mu": params["mu"],
        "lam": params["lam"],
        "alpha": params["alpha"],
    }
    model.fit(x_train, y_train, **fit_kwargs)

    val_error = float(rmse(y_val, model.predict(x_val)))

    # Without a complete test set only the validation score is reported.
    if x_test is None or y_test is None:
        return val_error, model

    test_error = float(rmse(y_test, model.predict(x_test)))
    return val_error, test_error, model


In [8]:
from ax.service.ax_client import AxClient, ObjectiveProperties


def optimize_model(model_class, ds_class, n_trials, minimize, loss_type):
    ds = ds_class(DS_PATH)
    print("\n\n" + ds.name)
    x, y = ds.get_data()

    x_train, x_val, x_test, y_train, y_val, y_test, ttv_info = ttv_split(
        x, y, 0, indent=True
    )

    dummy_info = dummy_regressor(
        x_train, x_val, x_test, y_train, y_val, y_test, indent=True
    )

    match model_class:
        case jp.JOPLEn:
            params = joplen_params
            train_fn = train_joplen
        case lgbm.LGBMRegressor:
            params = lgbm_params
            train_fn = train_lgbm
        case ske.ExtraTreesRegressor:
            params = et_params
            train_fn = train_er
        case xgb.XGBRegressor:
            params = xgb_params
            train_fn = train_xgboost
        case _:
            raise ValueError("Model not supported.")

    exp_name = model_class.__name__ + "_" + ds_class.__name__
    exp_path = Path("ax_runs") / f"{exp_name}.json"

    ax_client = AxClient(random_seed=0)

    ax_client.create_experiment(
        name=exp_name,
        parameters=params,
        objectives={loss_type: ObjectiveProperties(minimize=minimize)},
        overwrite_existing_experiment=True,
    )

    for _ in range(n_trials):
        round_params, trial_index = ax_client.get_next_trial()
        try:
            val_error, _ = train_fn(
                round_params, x_train=x_train, y_train=y_train, x_val=x_val, y_val=y_val
            )[0]
            ax_client.complete_trial(trial_index=trial_index, raw_data=float(val_error))
        except ValueError as e:
            print(e)
            ax_client.abandon_trial(
                trial_index=trial_index,
                reason=str(e),
            )

    best_parameters, values = ax_client.get_best_parameters()

    (val_error, test_error, _), _, _, train_time = train_fn(
        best_parameters,
        x_train=x_train,
        y_train=y_train,
        x_val=x_val,
        y_val=y_val,
        x_test=x_test,
        y_test=y_test,
    )

    exp_path.parent.mkdir(parents=True, exist_ok=True)

    ax_client.save_to_json_file(
        filepath=exp_path,
    )

    return {
        "model_name": model_class.__name__,
        "val_score": val_error,
        "test_score": test_error,
        "train_time": train_time,
        "params": best_parameters,
        "dummy_loss": dummy_info["rmse"],
    }


In [None]:
# Quick 10-trial smoke run: JOPLEn on the NPLogP dataset.
res = optimize_model(
    model_class=jp.JOPLEn,
    ds_class=NPLogP,
    n_trials=10,
    minimize=True,
    loss_type="rmse",
)
res


In [None]:
# Quick 10-trial smoke run: ExtraTreesRegressor on the NPLogP dataset.
res = optimize_model(
    model_class=ske.ExtraTreesRegressor,
    ds_class=NPLogP,
    n_trials=10,
    minimize=True,
    loss_type="rmse",
)
res


In [45]:
# Quick 10-trial smoke run: XGBRegressor on the NPLogP dataset.
res = optimize_model(
    model_class=XGBRegressor,
    ds_class=NPLogP,
    n_trials=10,
    minimize=True,
    loss_type="rmse",
)
res


[INFO 10-10 17:46:07] ax.service.ax_client: Starting optimization with verbose logging. To disable logging, set the `verbose_logging` argument to `False`. Note that float values in the logs are rounded to 6 decimal points.
  warn(
  warn(
  warn(
  warn(
[INFO 10-10 17:46:07] ax.service.utils.instantiation: Created search space: SearchSpace(parameters=[RangeParameter(name='max_leaves', parameter_type=INT, range=[2, 32]), RangeParameter(name='n_estimators', parameter_type=INT, range=[10, 1000], log_scale=True), RangeParameter(name='learning_rate', parameter_type=FLOAT, range=[1e-05, 0.1]), ChoiceParameter(name='grow_policy', parameter_type=STRING, values=['depthwise', 'lossguide'], is_ordered=False, sort_values=False), ChoiceParameter(name='booster', parameter_type=STRING, values=['gbtree', 'dart'], is_ordered=False, sort_values=False), RangeParameter(name='gamma', parameter_type=FLOAT, range=[1e-05, 10.0], log_scale=True), FixedParameter(name='random_state', parameter_type=INT, value=0

NP LogP
{'features': 1704,
 'fraction': [0.5918367346938775, 0.20408163265306123, 0.20408163265306123],
 'size': [87, 30, 30]}
{'model_name': 'DummyRegressor', 'rmse': 1.7831159637044283}


[INFO 10-10 17:46:23] ax.service.ax_client: Completed trial 0 with data: {'rmse': (0.991298, None)}.
[INFO 10-10 17:46:23] ax.service.ax_client: Generated new trial 1 with parameters {'max_leaves': 21, 'n_estimators': 80, 'learning_rate': 0.08649, 'gamma': 0.083746, 'grow_policy': 'lossguide', 'booster': 'dart', 'random_state': 0, 'tree_method': 'approx'}.
[INFO 10-10 17:46:35] ax.service.ax_client: Completed trial 1 with data: {'rmse': (1.006066, None)}.
[INFO 10-10 17:46:35] ax.service.ax_client: Generated new trial 2 with parameters {'max_leaves': 27, 'n_estimators': 574, 'learning_rate': 0.013206, 'gamma': 3.077612, 'grow_policy': 'depthwise', 'booster': 'dart', 'random_state': 0, 'tree_method': 'approx'}.


KeyboardInterrupt: 

In [23]:
# Full benchmark: tune every enabled model type on every dataset (100 Ax trials each)
# and collect the result dict returned by optimize_model, keyed by dataset class name.
# NOTE(review): this rebinds `datasets`, shadowing the `from sklearn import datasets`
# import made in the first cell.
datasets = [
    NPLogP,
    NPZetaP,
    ProtSol,
    MolLogP,
    MolHenry,
    MolBoil,
    MolMelt,
]

# NOTE(review): `joplen_res` and `er_res` are commented out here, yet later cells
# read them. They only exist if an earlier version of this cell ran in the same
# kernel, so a fresh Restart & Run All will hit a NameError in those cells.
# joplen_res = {}
# er_res = {}
xgb_res = {}

# `model_types` and `dicts` are paired by position: results for model_types[i]
# are written into dicts[i].
model_types = [
    # jp.JOPLEn,
    # ske.ExtraTreesRegressor,
    xgb.XGBRegressor,
]
dicts = [
    # joplen_res,
    # er_res,
    xgb_res,
]

for ds in datasets:
    k = ds.__name__
    for model_type, d in zip(model_types, dicts):
        d[k] = optimize_model(
            model_class=model_type,
            ds_class=ds,
            n_trials=100,
            minimize=True,
            loss_type="rmse",
        )


[INFO 10-11 06:23:20] ax.service.ax_client: Starting optimization with verbose logging. To disable logging, set the `verbose_logging` argument to `False`. Note that float values in the logs are rounded to 6 decimal points.
  warn(
  warn(
  warn(
  warn(
[INFO 10-11 06:23:20] ax.service.utils.instantiation: Created search space: SearchSpace(parameters=[RangeParameter(name='max_leaves', parameter_type=INT, range=[2, 32]), RangeParameter(name='n_estimators', parameter_type=INT, range=[10, 1000], log_scale=True), RangeParameter(name='learning_rate', parameter_type=FLOAT, range=[1e-05, 0.1]), ChoiceParameter(name='grow_policy', parameter_type=STRING, values=['depthwise', 'lossguide'], is_ordered=False, sort_values=False), ChoiceParameter(name='booster', parameter_type=STRING, values=['gbtree', 'dart'], is_ordered=False, sort_values=False), RangeParameter(name='gamma', parameter_type=FLOAT, range=[1e-05, 10.0], log_scale=True), FixedParameter(name='random_state', parameter_type=INT, value=0



NP LogP
{'features': 1704,
 'fraction': [0.5918367346938775, 0.20408163265306123, 0.20408163265306123],
 'size': [87, 30, 30]}
{'model_name': 'DummyRegressor', 'rmse': 1.7831159637044283}


[INFO 10-11 06:25:00] ax.service.ax_client: Completed trial 0 with data: {'rmse': (0.991298, None)}.
[INFO 10-11 06:25:00] ax.service.ax_client: Generated new trial 1 with parameters {'max_leaves': 21, 'n_estimators': 80, 'learning_rate': 0.08649, 'gamma': 0.083746, 'grow_policy': 'lossguide', 'booster': 'dart', 'random_state': 0, 'tree_method': 'approx'}.
[INFO 10-11 06:26:26] ax.service.ax_client: Completed trial 1 with data: {'rmse': (1.006066, None)}.
[INFO 10-11 06:26:26] ax.service.ax_client: Generated new trial 2 with parameters {'max_leaves': 27, 'n_estimators': 574, 'learning_rate': 0.013206, 'gamma': 3.077612, 'grow_policy': 'depthwise', 'booster': 'dart', 'random_state': 0, 'tree_method': 'approx'}.
[INFO 10-11 06:33:42] ax.service.ax_client: Completed trial 2 with data: {'rmse': (1.074275, None)}.
[INFO 10-11 06:33:42] ax.service.ax_client: Generated new trial 3 with parameters {'max_leaves': 3, 'n_estimators': 10, 'learning_rate': 0.050871, 'gamma': 0.001162, 'grow_policy'

KeyboardInterrupt: 

In [20]:
# Baseline (mean-predictor) test RMSE per dataset, as recorded in the JOPLEn runs.
{ds_name: run["dummy_loss"] for ds_name, run in joplen_res.items()}


{'NPLogP': 1.7831159637044283,
 'NPZetaP': 34.064111323553945,
 'ProtSol': 32.954534297678684,
 'MolLogP': 1.841579159435372,
 'MolHenry': 6.633168933233905,
 'MolBoil': 83.22852562904664,
 'MolMelt': 95.47062435805638}

In [16]:
# (validation RMSE, test RMSE) per dataset for the tuned JOPLEn models.
{
    ds_name: (run["val_score"], run["test_score"])
    for ds_name, run in joplen_res.items()
}


{'NPLogP': (0.9692163508285933, 0.8475834631371792),
 'NPZetaP': (16.0458870062652, 23.03845015049339),
 'ProtSol': (30.360103381391813, 31.555057577931034),
 'MolLogP': (0.9351424854032594, 0.9479114131618289),
 'MolHenry': (3.540368616878593, 3.7528611248193915),
 'MolBoil': (62.961585482623335, 64.66564616385752),
 'MolMelt': (59.98263321353336, 66.2503041016325)}

In [17]:
# (validation RMSE, test RMSE) per dataset for the tuned ExtraTreesRegressor models.
{
    ds_name: (run["val_score"], run["test_score"])
    for ds_name, run in er_res.items()
}


{'NPLogP': (1.4865816261209421, 1.43066223189318),
 'NPZetaP': (20.430389546191787, 29.987465365315803),
 'ProtSol': (32.83985954195217, 32.599932984909216),
 'MolLogP': (1.7330388086119701, 1.736951246817223),
 'MolHenry': (5.8139450056906155, 5.706304226866421),
 'MolBoil': (78.39561956654113, 76.61976017902927),
 'MolMelt': (89.44327967046853, 84.64096679642299)}