In [1]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import pandas as pd

from pytorch_tabular.utils import make_mixed_dataset, print_metrics
%load_ext autoreload
%autoreload 2

In [2]:
# Load a synthetic dataset with both categorical and continuous feature columns
data, cat_col_names, num_col_names = make_mixed_dataset(
    task="classification",
    n_samples=10000,
    n_features=8,
    n_categories=4,
    weights=[0.8],
    random_state=42,
)

# Create a new, second target: start from the row-wise sum of the categorical columns
data["second_target"] = 0
for c in cat_col_names:
    data["second_target"] += data[c]

# Correlate it to the 1st target (adds 1 on rows where target is 'class_0')
data["second_target"] += (data["target"] == "class_0")

# Create random discrete noise to make the task non-trivial.
# A seeded generator is used so that the second target -- and therefore every
# metric reported further down -- is reproducible on Restart & Run All.
rng = np.random.default_rng(42)
random_noise = rng.normal(0, 0.8, data.shape[0]).astype(int)  # astype(int) truncates toward zero

# Fold the noisy sum into three classes
data["second_target"] = (data["second_target"] + random_noise).mod(3)

# Now that the dataset is complete, split into train / validation / test
train, test = train_test_split(data, random_state=42)
train, val = train_test_split(train, random_state=42)

In [3]:
# Class balance of the 1st target, as fractions of all rows
data["target"].value_counts(normalize=True)

target
class_0    0.7968
class_1    0.2032
Name: proportion, dtype: float64

In [4]:
# Class balance of the 2nd target, as fractions of all rows
data["second_target"].value_counts(normalize=True)

second_target
2.0    0.3364
1.0    0.3354
0.0    0.3282
Name: proportion, dtype: float64

## Importing the Library

In [5]:
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig

In [6]:
# Collects one single-row DataFrame (columns 'f1' and 'auroc') per evaluated experiment;
# the rows are concatenated into a comparison table in the Results section.
# NOTE: re-running a training cell without re-running this one appends duplicate rows.
results = []

## Define the Configs


In [7]:
# Trainer: LR finder, plus early stopping and checkpointing keyed on the validation loss
trainer_config = TrainerConfig(
    auto_lr_find=True,  # run the LRFinder to derive a learning rate automatically
    batch_size=1024,
    max_epochs=100,
    early_stopping="valid_loss",  # quantity watched by early stopping
    early_stopping_mode="min",  # for a loss, lower is better
    early_stopping_patience=5,  # epochs of degradation tolerated before terminating
    checkpoints="valid_loss",  # keep the best checkpoint according to valid_loss
    load_best=True,  # reload the best checkpoint once training ends
    # accelerator="cpu",
)

# Optimizer: library defaults
optimizer_config = OptimizerConfig()

# Head settings are handed to the model config as a plain dict
# (OmegaConf doesn't accept objects)
head_config = vars(
    LinearHeadConfig(
        layers="",  # no extra layers in the head, just the mapping to output_dim
        dropout=0.1,
        initialization="kaiming",
    )
)

# Backbone: category-embedding MLP with a linear head
model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="1024-512-512",  # number of nodes in each layer
    activation="LeakyReLU",  # activation between layers
    head="LinearHead",
    head_config=head_config,
    learning_rate=1e-3,  # starting value; auto_lr_find above may replace it
    metrics=["f1_score", "accuracy", "auroc"],
    # One flag per metric, in order: f1_score and auroc get probability scores, accuracy doesn't
    metrics_prob_input=[True, False, True],
)

Data Config For Each Target

In [8]:
# Both single-target configs share the same feature columns and differ only in the target
shared_columns = dict(continuous_cols=num_col_names, categorical_cols=cat_col_names)

# target should always be a list, even for a single target
data_config_first_target = DataConfig(target=["target"], **shared_columns)
data_config_second_target = DataConfig(target=["second_target"], **shared_columns)

## Training the Single-Target Model 

In [9]:
# Single-target model for the 1st target
tabular_model = TabularModel(
    data_config=data_config_first_target,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)
tabular_model.fit(train=train, validation=val)

# Take the first entry returned by evaluate() and cast every metric to a plain float
result = tabular_model.evaluate(test)
result = {name: float(value) for name, value in result[0].items()}

# Keep f1 / AUROC as a one-row frame labelled with the experiment name
result = pd.DataFrame(
    [{"f1": result["test_f1_score"], "auroc": result["test_auroc"]}],
    index=["1st Target (single mode)"],
)
results.append(result)

Seed set to 42


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


/Users/yony/Code/pytorch-tabular-public/.venv/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /Users/yony/Code/pytorch-tabular-public/docs/tutorials/saved_models exists and is not empty.
/Users/yony/Code/pytorch-tabular-public/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
/Users/yony/Code/pytorch-tabular-public/.venv/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (6) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.
/Users/yony/Code/pytorch-tabular-public/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_co

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

LR finder stopped early after 92 steps due to diverging loss.
Learning rate set to 0.025118864315095822
Restoring states from the checkpoint path at /Users/yony/Code/pytorch-tabular-public/docs/tutorials/.lr_find_744b05b3-1568-4cf0-97ac-eaefc0c795f4.ckpt
Restored all states from the checkpoint at /Users/yony/Code/pytorch-tabular-public/docs/tutorials/.lr_find_744b05b3-1568-4cf0-97ac-eaefc0c795f4.ckpt


Output()

Output()

/Users/yony/Code/pytorch-tabular-public/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


### Now train separately on the 2nd target

In [10]:
# Single-target model for the 2nd target (same model/optimizer/trainer configs as before)
tabular_model = TabularModel(
    data_config=data_config_second_target,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)
tabular_model.fit(train=train, validation=val)

# Take the first entry returned by evaluate() and cast every metric to a plain float
result = tabular_model.evaluate(test)
result = {name: float(value) for name, value in result[0].items()}

# Keep f1 / AUROC as a one-row frame labelled with the experiment name
result = pd.DataFrame(
    [{"f1": result["test_f1_score"], "auroc": result["test_auroc"]}],
    index=["2nd Target (single mode)"],
)
results.append(result)

Seed set to 42


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


/Users/yony/Code/pytorch-tabular-public/.venv/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /Users/yony/Code/pytorch-tabular-public/docs/tutorials/saved_models exists and is not empty.
/Users/yony/Code/pytorch-tabular-public/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
/Users/yony/Code/pytorch-tabular-public/.venv/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (6) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.
/Users/yony/Code/pytorch-tabular-public/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_co

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

LR finder stopped early after 91 steps due to diverging loss.
Learning rate set to 0.0002511886431509582
Restoring states from the checkpoint path at /Users/yony/Code/pytorch-tabular-public/docs/tutorials/.lr_find_cd3e2f01-7f2a-4103-ac17-9b4b4f8aeed1.ckpt
Restored all states from the checkpoint at /Users/yony/Code/pytorch-tabular-public/docs/tutorials/.lr_find_cd3e2f01-7f2a-4103-ac17-9b4b4f8aeed1.ckpt


Output()

Output()

/Users/yony/Code/pytorch-tabular-public/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


## Multi-Target Training

Instead of training one model for the first target, and another model for the second target, we can train a single model that will make a prediction for both targets.

This is usually beneficial in reducing training time, but may also lead to better results since the model may have a better representation (embedding) by learning from multiple targets.

To perform multi-target training, we only need to modify the `target` field in the `data_config` so that it contains a list of multiple targets.

Results are reported both as the sum of each metric (f1, AU-ROC, etc.) across all targets and per target, with the suffix '_n' (starting at n=0). For example, the f1 score for the 1st target is test_f1_score_0 and for the 2nd target it is test_f1_score_1.

In [11]:
# Listing both columns under `target` (always a list) switches on multi-target training
data_config_multi = DataConfig(
    target=["target", "second_target"],
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)

tabular_model = TabularModel(
    data_config=data_config_multi,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)
tabular_model.fit(train=train, validation=val)

# Take the first entry returned by evaluate() and cast every metric to a plain float
result = tabular_model.evaluate(test)
result = {name: float(value) for name, value in result[0].items()}

# Per-target metrics carry a numeric suffix: _0 for the 1st target, _1 for the 2nd
per_target = [
    ("1st Target (multi-target mode)", "test_f1_score_0", "test_auroc_0"),
    ("2nd Target (multi-target mode)", "test_f1_score_1", "test_auroc_1"),
]
result1, result2 = (
    pd.DataFrame([{"f1": result[f1_key], "auroc": result[auroc_key]}], index=[label])
    for label, f1_key, auroc_key in per_target
)
results.extend([result1, result2])


Seed set to 42


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


/Users/yony/Code/pytorch-tabular-public/.venv/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /Users/yony/Code/pytorch-tabular-public/docs/tutorials/saved_models exists and is not empty.
/Users/yony/Code/pytorch-tabular-public/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
/Users/yony/Code/pytorch-tabular-public/.venv/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (6) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.
/Users/yony/Code/pytorch-tabular-public/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_co

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

LR finder stopped early after 94 steps due to diverging loss.
Learning rate set to 4.786300923226385e-05
Restoring states from the checkpoint path at /Users/yony/Code/pytorch-tabular-public/docs/tutorials/.lr_find_90994562-d7c8-47ac-974c-b9c85a34479e.ckpt
Restored all states from the checkpoint at /Users/yony/Code/pytorch-tabular-public/docs/tutorials/.lr_find_90994562-d7c8-47ac-974c-b9c85a34479e.ckpt


Output()

`Trainer.fit` stopped: `max_epochs=100` reached.


Output()

/Users/yony/Code/pytorch-tabular-public/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


## Results

In [12]:
# Stack the per-experiment rows into one table and highlight the best value in each column
res_df = pd.concat(results)
res_df.style.highlight_max(axis=0, color="lightgreen")

Unnamed: 0,f1,auroc
1st Target (single mode),0.9484,0.97175
2nd Target (single mode),0.6808,0.813438
1st Target (multi-target mode),0.9464,0.969867
2nd Target (multi-target mode),0.6184,0.775763


In this run, we see that the multi-target model performed on par with the single-target model on the 1st target, but slightly worse on the 2nd target. This may vary for this artificial dataset depending on random number generation, and in general a multi-target model may not outperform its single-target variants. Additional tuning may be needed for multi-target training, e.g. a larger embedding size.

## Deep Learning Model

We can also test whether a deeper model benefits from the shared embedding. In this case, we test Gandalf with multi-target classification, similar to our experiment above.

Note: Without an accelerator (GPU), training this model on a CPU will take considerable time.

In [13]:
from pytorch_tabular.models import GANDALFConfig

In [14]:
# Multi-target data config: both target columns, same feature columns as before
data_config_2nd = DataConfig(
    target=["target", "second_target"],
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)

# Same trainer settings as the earlier experiments, except for max_epochs=50
trainer_gl_config = TrainerConfig(
    auto_lr_find=True,  # run the LRFinder to derive a learning rate automatically
    batch_size=1024,
    max_epochs=50,
    early_stopping="valid_loss",  # quantity watched by early stopping
    early_stopping_mode="min",  # for a loss, lower is better
    early_stopping_patience=5,  # epochs of degradation tolerated before terminating
    checkpoints="valid_loss",  # keep the best checkpoint according to valid_loss
    load_best=True,  # reload the best checkpoint once training ends
    # accelerator="cpu",
)

model_config_gandalf = GANDALFConfig(
    task="classification",
    metrics=["f1_score", "accuracy", "auroc"],
    # One flag per metric, in order: f1_score and auroc get probability scores, accuracy doesn't
    metrics_prob_input=[True, False, True],
    gflu_stages=6,
    gflu_dropout=0.2,
)

tabular_model = TabularModel(
    data_config=data_config_2nd,
    model_config=model_config_gandalf,
    optimizer_config=optimizer_config,
    trainer_config=trainer_gl_config,
)
tabular_model.fit(train=train, validation=val)

# Take the first entry returned by evaluate() and cast every metric to a plain float
result = tabular_model.evaluate(test)
result = {name: float(value) for name, value in result[0].items()}

# Per-target metrics carry a numeric suffix: _0 for the 1st target, _1 for the 2nd
per_target = [
    ("1st Target (Gandalf, multi-target mode)", "test_f1_score_0", "test_auroc_0"),
    ("2nd Target (Gandalf, multi-target mode)", "test_f1_score_1", "test_auroc_1"),
]
result1, result2 = (
    pd.DataFrame([{"f1": result[f1_key], "auroc": result[auroc_key]}], index=[label])
    for label, f1_key, auroc_key in per_target
)
results.extend([result1, result2])

# Display the full comparison table, including the two new rows
pd.concat(results)

Seed set to 42


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


/Users/yony/Code/pytorch-tabular-public/.venv/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /Users/yony/Code/pytorch-tabular-public/docs/tutorials/saved_models exists and is not empty.
/Users/yony/Code/pytorch-tabular-public/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
/Users/yony/Code/pytorch-tabular-public/.venv/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (6) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.
/Users/yony/Code/pytorch-tabular-public/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_co

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=100` reached.
Learning rate set to 0.10964781961431852
Restoring states from the checkpoint path at /Users/yony/Code/pytorch-tabular-public/docs/tutorials/.lr_find_92ae2770-372c-495e-9410-7ff7723d419e.ckpt
Restored all states from the checkpoint at /Users/yony/Code/pytorch-tabular-public/docs/tutorials/.lr_find_92ae2770-372c-495e-9410-7ff7723d419e.ckpt


Output()

Output()

/Users/yony/Code/pytorch-tabular-public/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


Unnamed: 0,f1,auroc
1st Target (single mode),0.9484,0.97175
2nd Target (single mode),0.6808,0.813438
1st Target (multi-target mode),0.9464,0.969867
2nd Target (multi-target mode),0.6184,0.775763
"1st Target (Gandalf, multi-target mode)",0.912,0.951056
"2nd Target (Gandalf, multi-target mode)",0.2988,0.500382


## Results Summary

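Here we see that the deeper GANDALF model in multi-target mode performed worse than either variant of the Category Embedding Model. On the 2nd target its AUROC of about 0.50 is at chance level, i.e. in this run the model did not learn that target at all. As before, additional hyperparameter tuning may be required for a fair comparison.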