In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss

# Download the Data

[Source](https://archive.ics.uci.edu/ml/datasets/bank+marketing)

In [2]:
# Seed NumPy's global RNG up front.
np.random.seed(42)

# Fetch features and label as pandas objects, then merge them into one frame.
features, labels = fetch_openml(
    "Bank_marketing_data_set_UCI",
    version=1,
    as_frame=True,
    return_X_y=True,
)
data = features.join(labels)
del features, labels  # later cells only work with the combined frame
data.head()

  warn(


In [4]:
# Columns handled as categorical features by every model in this notebook.
cat_cols = [
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "day",
    "month",
    "campaign",
    "previous",
    "poutcome",
]

# Columns handled as continuous features.
num_cols = ["age", "balance", "duration", "pdays"]

# Label column, kept as a list because DataConfig is given it as a list.
target = ["y"]

In [24]:
# Hold out 20% of the rows for testing, stratified on the label column.
train, test = train_test_split(
    data,
    test_size=0.2,
    stratify=data["y"],
    random_state=42,
)

# LightGBM

In [27]:
from lightgbm import LGBMClassifier
from sklearn.preprocessing import OrdinalEncoder

# LightGBM needs categorical columns as numeric codes, so each one is
# ordinal-encoded. Work on copies: the PyTorch Tabular cells use the
# original `train` / `test` frames.
train_enc, test_enc = train.copy(), test.copy()

# Unseen categories and missing values are both encoded as NaN.
encoder_options = dict(
    handle_unknown="use_encoded_value",
    unknown_value=np.nan,
    encoded_missing_value=np.nan,
)

for col in cat_cols:
    # One encoder per column, fitted on the training split only.
    enc = OrdinalEncoder(**encoder_options)
    train_codes = enc.fit_transform(train_enc[col].values.reshape(-1, 1))
    test_codes = enc.transform(test_enc[col].values.reshape(-1, 1))
    train_enc[col] = train_codes
    test_enc[col] = test_codes

In [37]:
# Split features from the label once so fit/predict/predict_proba all see
# exactly the same feature frames.
label_col = target[0]
X_train_enc = train_enc.drop(columns=label_col)
X_test_enc = test_enc.drop(columns=label_col)

clf = LGBMClassifier(random_state=42)
# Pass the label as a 1-D Series. Indexing with the list `target` yields a
# one-column DataFrame, which triggers scikit-learn's
# "column-vector y was passed" warning (column_or_1d).
clf.fit(X_train_enc, train_enc[label_col], categorical_feature=cat_cols)

test_pred = clf.predict(X_test_enc)
test_pred_proba = clf.predict_proba(X_test_enc)

# Score against the labels of the held-out split.
y_test = test[label_col].values
acc = accuracy_score(y_test, test_pred)
loss = log_loss(y_test, test_pred_proba)
print(f"Acc: {acc} | LogLoss: {loss}")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Acc: 0.9083268826716797 | LogLoss: 0.19783125832611875


# PyTorch Tabular

In [13]:
from pytorch_tabular import TabularModel
from pytorch_tabular.models import (
    CategoryEmbeddingModelConfig, 
    FTTransformerConfig, 
    TabNetModelConfig, 
    GatedAdditiveTreeEnsembleConfig, 
    TabTransformerConfig, 
    AutoIntConfig
)
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig

## Common Configs    

These are common configs that can be reused by every model below. Since the datamodule is very quick, we can just stick with the high-level API.

In [7]:
# Column roles, shared by every model below.
data_config = DataConfig(
    target=target,  # target should always be a list
    categorical_cols=cat_cols,
    continuous_cols=num_cols,
)

# Training settings. Early stopping and checkpointing both monitor valid_loss.
# `auto_lr_find=True` (runs the LRFinder to derive a learning rate automatically)
# is intentionally left out here.
trainer_config = TrainerConfig(
    max_epochs=500,
    batch_size=256,
    early_stopping="valid_loss",  # metric monitored for early stopping
    early_stopping_mode="min",  # lower valid_loss is better
    early_stopping_patience=5,  # epochs of degradation tolerated before stopping
    checkpoints="valid_loss",  # keep the best checkpoint by valid_loss
    load_best=True,  # reload that best checkpoint once training ends
)

optimizer_config = OptimizerConfig()

# OmegaConf doesn't accept objects, so the head settings are handed to the
# model configs as a plain dict of the config's attributes.
head_config = vars(
    LinearHeadConfig(
        layers="",  # no additional layer in the head, just a mapping to output_dim
        dropout=0.1,
        initialization="kaiming",
    )
)

## CategoryEmbedding

In [71]:
# CategoryEmbedding model with the shared linear head.
model_config = CategoryEmbeddingModelConfig(
    task="classification",
    head="LinearHead",
    head_config=head_config,
    learning_rate=1e-3,
    layers="64-32",  # number of nodes in each layer
    activation="ReLU",  # activation between layers
)

# Combine the shared data/optimizer/trainer configs with this model's config.
tabular_model = TabularModel(
    model_config=model_config,
    data_config=data_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
)
tabular_model.fit(train=train)
# Evaluate on the held-out test split.
tabular_model.evaluate(test)


2023-01-17 16:14:44,087 - {pytorch_tabular.tabular_model:101} - INFO - Experiment Tracking is turned off
Global seed set to 42
2023-01-17 16:14:44,099 - {pytorch_tabular.tabular_model:463} - INFO - Preparing the DataLoaders
2023-01-17 16:14:44,105 - {pytorch_tabular.tabular_datamodule:286} - INFO - Setting up the datamodule for classification task
2023-01-17 16:14:44,288 - {pytorch_tabular.tabular_model:506} - INFO - Preparing the Model: CategoryEmbeddingModel
2023-01-17 16:14:44,309 - {pytorch_tabular.tabular_model:262} - INFO - Preparing the Trainer
Auto select gpus: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
2023-01-17 16:14:44,374 - {pytorch_tabular.tabular_model:556} - INFO - Auto LR Find Started
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(
  rank_zero_warn(


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=100` reached.
Learning rate set to 0.0022908676527677745
Restoring states from the checkpoint path at /home/manujosephv/pytorch_tabular/examples/.lr_find_014405ad-cd08-4b2b-a0ab-7f467495057b.ckpt
Restored all states from the checkpoint file at /home/manujosephv/pytorch_tabular/examples/.lr_find_014405ad-cd08-4b2b-a0ab-7f467495057b.ckpt
2023-01-17 16:14:45,929 - {pytorch_tabular.tabular_model:561} - INFO - Training Started
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

2023-01-17 16:15:01,553 - {pytorch_tabular.tabular_model:563} - INFO - Training the model completed
2023-01-17 16:15:01,553 - {pytorch_tabular.tabular_model:1174} - INFO - Loading the best model
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Output()

## GATE (Full)    

[GATE](https://arxiv.org/pdf/2207.08548.pdf) proposes two configurations: a Full (larger) model and a Lite (smaller) model.

In [9]:
# GATE with the library's default architecture settings and the shared linear head.
model_config = GatedAdditiveTreeEnsembleConfig(
    task="classification",
    head="LinearHead",
    head_config=head_config,
    learning_rate=1e-3,
)

# Combine the shared data/optimizer/trainer configs with this model's config.
tabular_model = TabularModel(
    model_config=model_config,
    data_config=data_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
)
tabular_model.fit(train=train)
# Evaluate on the held-out test split.
tabular_model.evaluate(test)

2023-01-17 16:22:56,348 - {pytorch_tabular.tabular_model:101} - INFO - Experiment Tracking is turned off
Global seed set to 42
2023-01-17 16:22:56,361 - {pytorch_tabular.tabular_model:463} - INFO - Preparing the DataLoaders
2023-01-17 16:22:56,367 - {pytorch_tabular.tabular_datamodule:286} - INFO - Setting up the datamodule for classification task
2023-01-17 16:22:56,556 - {pytorch_tabular.tabular_model:506} - INFO - Preparing the Model: GatedAdditiveTreeEnsembleModel
2023-01-17 16:22:56,670 - {pytorch_tabular.tabular_model:262} - INFO - Preparing the Trainer
Auto select gpus: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
2023-01-17 16:22:59,485 - {pytorch_tabular.tabular_model:561} - INFO - Training Started
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

2023-01-17 16:44:43,020 - {pytorch_tabular.tabular_model:563} - INFO - Training the model completed
2023-01-17 16:44:43,022 - {pytorch_tabular.tabular_model:1174} - INFO - Loading the best model
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Output()

## GATE (Lite)

In [11]:
# GATE with explicit architecture overrides for the Lite variant.
model_config = GatedAdditiveTreeEnsembleConfig(
    task="classification",
    head="LinearHead",
    head_config=head_config,
    learning_rate=1e-3,
    # Lite-specific architecture settings
    gflu_stages=4,
    num_trees=30,
    tree_depth=5,
    chain_trees=False,
)

# Combine the shared data/optimizer/trainer configs with this model's config.
tabular_model = TabularModel(
    model_config=model_config,
    data_config=data_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
)
tabular_model.fit(train=train)
# Evaluate on the held-out test split; the metrics are the cell's output.
tabular_model.evaluate(test)

2023-01-17 16:57:57,436 - {pytorch_tabular.tabular_model:101} - INFO - Experiment Tracking is turned off
Global seed set to 42
2023-01-17 16:57:57,448 - {pytorch_tabular.tabular_model:463} - INFO - Preparing the DataLoaders
2023-01-17 16:57:57,453 - {pytorch_tabular.tabular_datamodule:286} - INFO - Setting up the datamodule for classification task
2023-01-17 16:57:57,621 - {pytorch_tabular.tabular_model:506} - INFO - Preparing the Model: GatedAdditiveTreeEnsembleModel
2023-01-17 16:57:57,756 - {pytorch_tabular.tabular_model:262} - INFO - Preparing the Trainer
Auto select gpus: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
2023-01-17 16:57:57,800 - {pytorch_tabular.tabular_model:561} - INFO - Training Started
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

2023-01-17 17:28:18,273 - {pytorch_tabular.tabular_model:563} - INFO - Training the model completed
2023-01-17 17:28:18,274 - {pytorch_tabular.tabular_model:1174} - INFO - Loading the best model
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Output()

[{'test_loss': 0.21413247287273407, 'test_accuracy': 0.9045670628547668}]

## FT Transformer

[Paper](https://arxiv.org/abs/2106.11959)

In [8]:
# FT Transformer with the shared linear head; other settings are left at their defaults.
model_config = FTTransformerConfig(
    task="classification",
    head="LinearHead",
    head_config=head_config,
    learning_rate=1e-3,
)

# Combine the shared data/optimizer/trainer configs with this model's config.
tabular_model = TabularModel(
    model_config=model_config,
    data_config=data_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
)
tabular_model.fit(train=train)
# Evaluate on the held-out test split; the metrics are the cell's output.
tabular_model.evaluate(test)

2023-01-17 16:50:56,558 - {pytorch_tabular.tabular_model:101} - INFO - Experiment Tracking is turned off
Global seed set to 42
2023-01-17 16:50:56,577 - {pytorch_tabular.tabular_model:463} - INFO - Preparing the DataLoaders
2023-01-17 16:50:56,584 - {pytorch_tabular.tabular_datamodule:286} - INFO - Setting up the datamodule for classification task
2023-01-17 16:50:56,762 - {pytorch_tabular.tabular_model:506} - INFO - Preparing the Model: FTTransformerModel
2023-01-17 16:50:56,812 - {pytorch_tabular.tabular_model:262} - INFO - Preparing the Trainer
Auto select gpus: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
2023-01-17 16:51:01,381 - {pytorch_tabular.tabular_model:561} - INFO - Training Started
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

2023-01-17 16:52:40,978 - {pytorch_tabular.tabular_model:563} - INFO - Training the model completed
2023-01-17 16:52:40,979 - {pytorch_tabular.tabular_model:1174} - INFO - Loading the best model
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Output()

[{'test_loss': 0.20094546675682068, 'test_accuracy': 0.9103173613548279}]

## TabTransformer    

[Paper](https://arxiv.org/abs/2012.06678)

In [10]:
# TabTransformer with the shared linear head; other settings are left at their defaults.
model_config = TabTransformerConfig(
    task="classification",
    head="LinearHead",
    head_config=head_config,
    learning_rate=1e-3,
)

# Combine the shared data/optimizer/trainer configs with this model's config.
tabular_model = TabularModel(
    model_config=model_config,
    data_config=data_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
)
tabular_model.fit(train=train)
# Evaluate on the held-out test split; the metrics are the cell's output.
tabular_model.evaluate(test)

2023-01-17 16:55:22,801 - {pytorch_tabular.tabular_model:101} - INFO - Experiment Tracking is turned off
Global seed set to 42
2023-01-17 16:55:22,817 - {pytorch_tabular.tabular_model:463} - INFO - Preparing the DataLoaders
2023-01-17 16:55:22,825 - {pytorch_tabular.tabular_datamodule:286} - INFO - Setting up the datamodule for classification task
2023-01-17 16:55:22,991 - {pytorch_tabular.tabular_model:506} - INFO - Preparing the Model: TabTransformerModel
2023-01-17 16:55:23,014 - {pytorch_tabular.tabular_model:262} - INFO - Preparing the Trainer
Auto select gpus: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
2023-01-17 16:55:23,069 - {pytorch_tabular.tabular_model:561} - INFO - Training Started
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

2023-01-17 16:56:35,837 - {pytorch_tabular.tabular_model:563} - INFO - Training the model completed
2023-01-17 16:56:35,837 - {pytorch_tabular.tabular_model:1174} - INFO - Loading the best model
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Output()

[{'test_loss': 0.2282944619655609, 'test_accuracy': 0.905009388923645}]

## AutoInt    

[Paper](https://arxiv.org/abs/1810.11921)

In [14]:
# AutoInt with the shared linear head; other settings are left at their defaults.
model_config = AutoIntConfig(
    task="classification",
    head="LinearHead",
    head_config=head_config,
    learning_rate=1e-3,
)

# Combine the shared data/optimizer/trainer configs with this model's config.
tabular_model = TabularModel(
    model_config=model_config,
    data_config=data_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
)
tabular_model.fit(train=train)
# Evaluate on the held-out test split; the metrics are the cell's output.
tabular_model.evaluate(test)

2023-01-17 17:32:49,809 - {pytorch_tabular.tabular_model:101} - INFO - Experiment Tracking is turned off
Global seed set to 42
2023-01-17 17:32:49,825 - {pytorch_tabular.tabular_model:463} - INFO - Preparing the DataLoaders
2023-01-17 17:32:49,837 - {pytorch_tabular.tabular_datamodule:286} - INFO - Setting up the datamodule for classification task
2023-01-17 17:32:50,008 - {pytorch_tabular.tabular_model:506} - INFO - Preparing the Model: AutoIntModel
2023-01-17 17:32:50,032 - {pytorch_tabular.tabular_model:262} - INFO - Preparing the Trainer
Auto select gpus: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
2023-01-17 17:32:50,087 - {pytorch_tabular.tabular_model:561} - INFO - Training Started
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

2023-01-17 17:33:31,016 - {pytorch_tabular.tabular_model:563} - INFO - Training the model completed
2023-01-17 17:33:31,017 - {pytorch_tabular.tabular_model:1174} - INFO - Loading the best model
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Output()

[{'test_loss': 0.21694879233837128, 'test_accuracy': 0.9039035439491272}]

## TabNet    

[Paper](https://arxiv.org/abs/1908.07442)

In [15]:
# TabNet with the shared linear head; other settings are left at their defaults.
model_config = TabNetModelConfig(
    task="classification",
    head="LinearHead",
    head_config=head_config,
    learning_rate=1e-3,
)

# Combine the shared data/optimizer/trainer configs with this model's config.
tabular_model = TabularModel(
    model_config=model_config,
    data_config=data_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
)
tabular_model.fit(train=train)
# Evaluate on the held-out test split; the metrics are the cell's output.
tabular_model.evaluate(test)

2023-01-17 17:33:44,441 - {pytorch_tabular.tabular_model:101} - INFO - Experiment Tracking is turned off
Global seed set to 42
2023-01-17 17:33:44,453 - {pytorch_tabular.tabular_model:463} - INFO - Preparing the DataLoaders
2023-01-17 17:33:44,459 - {pytorch_tabular.tabular_datamodule:286} - INFO - Setting up the datamodule for classification task
2023-01-17 17:33:44,616 - {pytorch_tabular.tabular_model:506} - INFO - Preparing the Model: TabNetModel
2023-01-17 17:33:44,645 - {pytorch_tabular.tabular_model:262} - INFO - Preparing the Trainer
Auto select gpus: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
2023-01-17 17:33:44,689 - {pytorch_tabular.tabular_model:561} - INFO - Training Started
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

2023-01-17 17:37:46,206 - {pytorch_tabular.tabular_model:563} - INFO - Training the model completed
2023-01-17 17:37:46,206 - {pytorch_tabular.tabular_model:1174} - INFO - Loading the best model
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Output()

[{'test_loss': 0.28120988607406616, 'test_accuracy': 0.8845515847206116}]