In [1]:
from sklearn.model_selection import train_test_split

from pytorch_tabular.utils import load_covertype_dataset

In [2]:
# load_covertype_dataset() returns the data plus the categorical / continuous column-name
# lists and the target column name; all four are used to build the DataConfig further down.
data, cat_col_names, num_col_names, target_col = load_covertype_dataset()

# Importing the Library

In [3]:
from pytorch_tabular import TabularModel, model_sweep
from pytorch_tabular.models import (
    CategoryEmbeddingModelConfig,
    DANetConfig,
    GANDALFConfig,
    FTTransformerConfig,
    TabNetModelConfig
)
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig


In [4]:
# Hold out a test set for the sweep; random_state=42 pins the shuffle so the split is repeatable.
# No test_size is passed, so scikit-learn's default split proportion applies.
train, test = train_test_split(data, random_state=42)


## Model Sweep

Define the data config, trainer config, and optimizer config and do a sweep of multiple models.

In [5]:
# --- Data: which columns feed the models and which column is predicted ---
# `target` should always be a list. Multi-targets are only supported for regression;
# Multi-Task Classification is not implemented.
data_config = DataConfig(
    target=[target_col],
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)

# --- Optimizer: constructed with no arguments, i.e. the library defaults ---
optimizer_config = OptimizerConfig()

# --- Trainer: shared by every model in the sweep ---
trainer_config = TrainerConfig(
    # Hardware / run size
    accelerator="cpu",
    batch_size=1024,
    max_epochs=25,
    fast_dev_run=False,
    auto_lr_find=True,
    # Early stopping: watch valid_loss (lower is better, hence "min") and give up
    # after 5 epochs of degradation.
    early_stopping="valid_loss",
    early_stopping_mode="min",
    early_stopping_patience=5,
    # Checkpointing: keep the best checkpoint by valid_loss and load it after training.
    checkpoints="valid_loss",
    load_best=True,
    # Keep the notebook output quiet: no progress bar, no model summary.
    progress_bar="none",
    trainer_kwargs={"enable_model_summary": False},
)

# --- Head: layers="" means no additional layer, just a mapping layer to output_dim ---
# The config is handed to the model config as a plain dict because OmegaConf
# doesn't accept objects.
head_config = vars(
    LinearHeadConfig(layers="", dropout=0.1, initialization="kaiming")
)


## Model Sweep API

The model sweep enables you to quickly sweep through different models and configurations. It takes in a list of model configs or one of the presets defined in ``pytorch_tabular.MODEL_SWEEP_PRESETS`` and trains them on the data. It then ranks the models based on the metric provided and returns the best model.

These are the major args:
- ``task``: The type of prediction task. Either 'classification' or 'regression'
- ``train``: The training data
- ``test``: The test data on which performance is evaluated
- all the config objects can be passed as either the object or the path to the yaml file.
- ``model_list``: The list of models to compare. This can be one of the presets defined in ``pytorch_tabular.MODEL_SWEEP_PRESETS`` or a list of ``ModelConfig`` objects.
- ``metrics``: the list of metrics you need to track during training. The metrics should be one of the functional metrics implemented in ``torchmetrics``. By default, it is accuracy if classification and mean_squared_error for regression
- ``metrics_prob_input``: Is a mandatory parameter for classification metrics defined in the config. This defines whether the input to the metric function is the probability or the class. Length should be same as the number of metrics. Defaults to None.
- ``metrics_params``: The parameters to be passed to the metrics function. 
- ``rank_metric``: The metric to use for ranking the models. The first element of the tuple is the metric name and the second element is the direction. Defaults to ('loss', "lower_is_better").
- ``return_best_model``: If True, will return the best model. Defaults to True.

In [6]:
from pytorch_tabular import MODEL_SWEEP_PRESETS
# Show the preset names; one of these can be passed as a string to model_sweep's `model_list`.
MODEL_SWEEP_PRESETS.keys()

dict_keys(['lite', 'full', 'high_memory'])

In [8]:
# Train every model in the "lite" preset with the shared configs and rank them by
# accuracy on `test`. Returns the per-model results table and the best model.
sweep_df, best_model = model_sweep(
    task="classification",  # One of "classification", "regression"
    # Data splits
    train=train,
    test=test,
    # Configs shared by all models in the sweep
    data_config=data_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
    # Which models to run, and the arguments applied to each of them
    model_list="lite",
    common_model_args={"head": "LinearHead", "head_config": head_config},
    # Metrics: the three lists below are aligned by position
    # (accuracy -> class input, no params; f1_score -> probability input, weighted average)
    metrics=["accuracy", "f1_score"],
    metrics_prob_input=[False, True],
    metrics_params=[{}, {"average": "weighted"}],
    # Ranking and reporting
    rank_metric=("accuracy", "higher_is_better"),
    progress_bar=True,
    verbose=False,
)

Output()

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

In [11]:
# Results table without the bulky columns; the best value in each metric column
# (highest accuracy / f1, lowest loss) is shaded light green.
(
    sweep_df
    .drop(columns=["params", "time_taken", "epochs"])
    .style
    .highlight_max(subset=["test_accuracy", "test_f1_score"], color="lightgreen")
    .highlight_min(subset=["test_loss"], color="lightgreen")
)

Unnamed: 0,model,# Params,test_loss,test_accuracy,test_f1_score,time_taken_per_epoch
2,GANDALFModel,43 T,0.194132,0.923134,0.922943,9.188667
3,TabNetModel,50 T,0.269097,0.891575,0.891314,14.920577
0,CategoryEmbeddingModel,51 T,0.273254,0.889448,0.888622,7.662767
1,DANetModel,78 T,0.328478,0.865359,0.862838,55.02135


We chose the `lite` preset, a set of four models that have a comparable number of parameters and train relatively quickly with lower memory requirements.

We can see that GANDALF performs the best in terms of accuracy, loss and f1 score. We can also see that its training time is comparable to that of a regular MLP. A natural next step would be to tune the model a bit more and find the best parameters.