In [19]:
import uuid
from pathlib import Path
import requests
import pandas as pd
import joblib
import numpy as np
import torch
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models import GatedAdditiveTreeEnsembleConfig
from pytorch_tabular import TabularModel

In [13]:
config = {
        "dataset": "Ailerons",
        "n_trials": 100,
        "batch_size": 512,
        "max_epochs": 100,
        "early_stopping_patience": 10,
        "optimizer": "AdamW",
        "strategy": "tpe",
        "pruning": False,
    }

In [14]:
DATA_DIR = Path("data")

In [15]:
data_path = DATA_DIR / config["dataset"]
d_config_files = list(data_path.glob("*config*"))
d_config = np.load(d_config_files[0], allow_pickle=True).item()
task = "classification" if d_config["regression"] == 0 else "regression"
n_folds = len(d_config_files)

In [17]:
n_features = np.load(
    data_path / "x_test_fold_0.npy", allow_pickle=True
).shape[1]
cat_col_names = None
num_col_names = [f"feature_{i}" for i in range(n_features)]

In [30]:
data_config = DataConfig(
    target=["target"],
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names or [],
    normalize_continuous_features=True,
)
trainer_config = TrainerConfig(
    auto_lr_find=False,  # Runs the LRFinder to automatically derive a learning rate
    batch_size=config["batch_size"],
    max_epochs=config["max_epochs"],
    early_stopping="valid_loss",
    early_stopping_mode="min",  # Set the mode as min because for val_loss, lower is better
    early_stopping_patience=config[
        "early_stopping_patience"
    ],  # No. of epochs of degradation training will wait before terminating
    checkpoints="valid_loss",
    load_best=True,  # After training, load the best checkpoint
    # progress_bar="none",  # Turning off Progress bar
    # trainer_kwargs=dict(enable_model_summary=False),  # Turning off model summary
)
optimizer_config = OptimizerConfig(
    optimizer=config["optimizer"], optimizer_params={"weight_decay": 1e-7}
)
model_config = GatedAdditiveTreeEnsembleConfig(
    task=task,
    learning_rate=0.001,
    metrics=["r2_score"] if task == "regression" else None,
    metrics_prob_input=[False] if task == "regression" else None,
    gflu_stages=17,
    gflu_dropout=0.17,
    gflu_feature_init_sparsity=0.21,
    learnable_sparsity=True,
)

In [37]:



metrics = []
for fold in range(n_folds):
    x_train = np.load(
        DATA_DIR/"Ailerons" / f"x_train_fold_{fold}.npy", allow_pickle=True
    )
    y_train = np.load(
        DATA_DIR/"Ailerons" / f"y_train_fold_{fold}.npy", allow_pickle=True
    ).reshape(-1, 1)
    x_val = np.load(
        DATA_DIR/"Ailerons" / f"x_val_fold_{fold}.npy", allow_pickle=True
    )
    y_val = np.load(
        DATA_DIR/"Ailerons" / f"y_val_fold_{fold}.npy", allow_pickle=True
    ).reshape(-1, 1)
    x_test = np.load(
        DATA_DIR/"Ailerons" / f"x_train_fold_{fold}.npy", allow_pickle=True
    )
    y_test = np.load(
        DATA_DIR/"Ailerons" / f"y_train_fold_{fold}.npy", allow_pickle=True
    ).reshape(-1, 1)
    # combine x and y into a dataframe
    train = pd.DataFrame(
        np.concatenate([x_train, y_train], axis=1),
        columns=[f"feature_{i}" for i in range(x_train.shape[1])]
        + ["target"],
    )
    val = pd.DataFrame(
        np.concatenate([x_val, y_val], axis=1),
        columns=[f"feature_{i}" for i in range(x_val.shape[1])] + ["target"],
    )
    test = pd.DataFrame(
        np.concatenate([x_test, y_test], axis=1),
        columns=[f"feature_{i}" for i in range(x_test.shape[1])] + ["target"],
    )
    # Initialize the tabular model
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )
    try:
        # Fit the model
        tabular_model.fit(
            train=train,
            validation=val,
            target_transform=(lambda x: x*1000, lambda x: x/1000)
            # callbacks=[
            #     PyTorchLightningPruningCallback(trial, monitor="valid_loss")
            # ],
        )
        result = tabular_model.evaluate(test, verbose=False)
        metrics.append(result[0]["test_r2_score"])
    except RuntimeError as e:
        print(e)
        # gc.collect()
        torch.cuda.empty_cache()
        metrics.append(0)
    break

2023-06-11 13:34:47,963 - {pytorch_tabular.tabular_model:105} - INFO - Experiment Tracking is turned off
Global seed set to 42
2023-06-11 13:34:47,983 - {pytorch_tabular.tabular_model:473} - INFO - Preparing the DataLoaders
2023-06-11 13:34:47,984 - {pytorch_tabular.tabular_datamodule:290} - INFO - Setting up the datamodule for regression task
2023-06-11 13:34:48,002 - {pytorch_tabular.tabular_model:521} - INFO - Preparing the Model: GatedAdditiveTreeEnsembleModel
2023-06-11 13:34:48,045 - {pytorch_tabular.models.gate.gate_model:282} - INFO - Data Aware Initialization of T0
2023-06-11 13:34:48,067 - {pytorch_tabular.tabular_model:268} - INFO - Preparing the Trainer
Auto select gpus: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
2023-06-11 13:34:48,098 - {pytorch_tabular.tabular_model:582} - INFO - Training Started
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is 

Output()

2023-06-11 13:39:15,537 - {pytorch_tabular.tabular_model:584} - INFO - Training the model completed
2023-06-11 13:39:15,538 - {pytorch_tabular.tabular_model:1258} - INFO - Loading the best model
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Output()

In [38]:
result[0]

{'test_loss': 0.02455640397965908, 'test_r2_score': 0.8540627956390381}

In [36]:
(train['target']*1000).mean()

-0.87528306