In [3]:
import category_encoders as ce
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_covtype
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

from pytorch_tabular.utils import print_metrics, load_covertype_dataset

# %load_ext autoreload
# %autoreload 2

In [2]:
def load_classification_data():
    """Load the covertype data and split off a reproducible 20% holdout.

    The holdout rows are drawn with ``DataFrame.sample`` using a fixed
    ``random_state`` of 42; all remaining rows form the training frame.
    Both frames keep the row order of the loaded data.

    Returns:
        tuple: ``(train, test, cat_col_names, num_col_names, target)`` where
        the last three items are passed through unchanged from
        ``load_covertype_dataset``.
    """
    frame, cat_col_names, num_col_names, target = load_covertype_dataset()
    holdout_size = int(0.2 * len(frame))
    holdout_index = frame.sample(holdout_size, random_state=42).index
    # One boolean mask serves both splits: rows inside it are test, the rest train.
    in_holdout = frame.index.isin(holdout_index)
    return (frame[~in_holdout], frame[in_holdout], cat_col_names, num_col_names, target)

# Load Forest Cover Data

In [4]:
# The holdout (test) split comes from the loader; validation is carved out of the rest.
train_full, test, cat_col_names, num_col_names, target_col = load_classification_data()
# Keep the unsplit frame under its own name so `train` is not rebound in place.
train, val = train_test_split(train_full, random_state=42)

In [5]:
# One-hot encode the categorical columns. The encoder is fitted on the training
# split only; validation and holdout reuse the fitted encoder.
encoder = ce.OneHotEncoder(cols=cat_col_names)
train_transform = encoder.fit_transform(train)
val_transform, test_transform = (encoder.transform(split) for split in (val, test))

KeyboardInterrupt: 

In [None]:
1

## Baseline

Let's use the default LightGBM model as a baseline.

In [6]:
# One summary row (a dict) is appended here per encoding strategy.
results = []

# (metric callable, display name, extra keyword arguments) triples handed to print_metrics.
metrics = [
    (accuracy_score, "Accuracy", dict()),
    (f1_score, "F1", dict(average="weighted")),
]

In [7]:
# Baseline: a default LightGBM classifier trained on the one-hot encoded frames.
clf = lgb.LGBMClassifier(random_state=42, n_jobs=-1, verbose=-1)
clf.fit(
    train_transform.drop(columns=target_col),
    # Flatten the labels to 1-D before handing them to LightGBM.
    train_transform[target_col].values.ravel(),
)

# Validation split.
val_pred = clf.predict(val_transform.drop(columns=target_col))
val_metrics = print_metrics(
    metrics, val_transform[target_col], val_pred, "Validation", return_dict=True
)

# Holdout split. The label column is dropped via `target_col`, like the train and
# validation frames above, instead of a hard-coded column name.
test_pred = clf.predict(test_transform.drop(columns=target_col))
holdout_metrics = print_metrics(
    metrics, test_transform[target_col], test_pred, "Holdout", return_dict=True
)

Validation Accuracy: 0.8528953641472251 | Validation F1: 0.8522624083460651
Holdout Accuracy: 0.8517323281871224 | Holdout F1: 0.8509643250597374


In [8]:
# Summary row for the one-hot baseline, built from the metric dicts returned above.
onehot_row = {
    "Mode": "OneHot Encoding",
    "Validation Acc": val_metrics["Accuracy"],
    "Validation F1": val_metrics["F1"],
    "Holdout Acc": holdout_metrics["Accuracy"],
    "Holdout F1": holdout_metrics["F1"],
}
results.append(onehot_row)

## CategoryEmbedding Model

In [9]:
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.categorical_encoders import CategoricalEmbeddingTransformer
from pytorch_tabular.models.common.heads import LinearHeadConfig

In [19]:
# Data: declare column roles and how continuous features are pre-processed.
data_config = DataConfig(
    target=target_col,  # target should always be a list. Multi-targets are only supported for regression; multi-task classification is not implemented
    categorical_cols=cat_col_names,
    continuous_cols=num_col_names,
    normalize_continuous_features=True,
    continuous_feature_transform="quantile_normal",
)

# Optimizer: library defaults.
optimizer_config = OptimizerConfig()

# Training loop settings.
trainer_config = TrainerConfig(
    accelerator="auto",  # can be 'cpu', 'gpu', 'tpu', or 'ipu'
    devices=-1,  # -1 means use all available
    max_epochs=50,
    batch_size=1024,
    auto_lr_find=True,  # Runs the LRFinder to automatically derive a learning rate
)

# Output head: no additional layer in the head, just a mapping layer to output_dim.
linear_head = LinearHeadConfig(
    layers="",
    dropout=0.1,
    initialization="kaiming",
)
# Hand the head settings over as a plain dict (OmegaConf doesn't accept objects).
head_config = vars(linear_head)

model_config = CategoryEmbeddingModelConfig(
    task="classification",
    head="LinearHead",  # Linear head
    head_config=head_config,  # Linear head config
    layers="512-256-16",  # Number of nodes in each layer
    activation="LeakyReLU",  # Activation between layers
    initialization="kaiming",
    dropout=0.1,
    learning_rate=1e-3,
    # The three metric lists below are presumably aligned position by position - confirm.
    metrics=["accuracy", "f1_score"],
    # NOTE: F1 here is micro-averaged, while the sklearn `metrics` list used for the
    # LightGBM baseline asks for average="weighted".
    metrics_params=[{}, {"average": "micro"}],
    metrics_prob_input=[False, True],
)

tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    verbose=False,
)

In [20]:
# Train the CategoryEmbedding model on the training split only.
# NOTE(review): `val` is not passed here even though it was split off earlier;
# presumably the library handles validation data internally - confirm.
tabular_model.fit(train=train)

Seed set to 42
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory saved_models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.
/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `n

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Trainer was signaled to stop but the required `min_epochs=1` or `min_steps=None` has not been met. Training will continue...
`Trainer.fit` stopped: `max_steps=100` reached.
Learning rate set to 0.002754228703338169
Restoring states from the checkpoint path at /home/manujosephv/pytorch_tabular/docs/tutorials/.lr_find_3ed4fd48-3a47-4500-9411-561d71d08a10.ckpt
Restored all states from the checkpoint at /home/manujosephv/pytorch_tabular/docs/tutorials/.lr_find_3ed4fd48-3a47-4500-9411-561d71d08a10.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

<pytorch_lightning.trainer.trainer.Trainer at 0x7fdddcb39310>

In [21]:
# Evaluate the trained model on the holdout split and print the returned metrics.
result = tabular_model.evaluate(test)
print(result)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

/home/manujosephv/miniconda3/envs/lightning_upgrade/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


[{'test_loss': 0.18335498869419098, 'test_accuracy': 0.9229703545570374, 'test_f1_score': 0.9229703545570374}]


In [22]:
# Holdout predictions; the "prediction" column of this result is scored in the next cell.
pred_df = tabular_model.predict(test)

In [23]:
# Score the neural model with the same sklearn metrics as the LightGBM baseline.
# The label column is looked up through `target_col`, as in the baseline cells,
# instead of a hard-coded column name.
print_metrics(metrics, test[target_col], pred_df["prediction"], tag="Holdout")

Holdout Accuracy: 0.9229703447444967 | Holdout F1: 0.9227222032785959


## Extract the Learned Embedding

For the models that support it (CategoryEmbeddingModel and CategoryEmbeddingNODE), we can extract the learned embeddings into a scikit-learn-style Transformer. You can use this in your scikit-learn pipelines and workflows as a drop-in replacement.

In [24]:
# Wrap the trained model in the embedding transformer and apply it to the training frame.
transformer = CategoricalEmbeddingTransformer(tabular_model)
train_transform = transformer.fit_transform(train)

# Fit LightGBM on the embedding-encoded features. The label column is addressed
# through `target_col` and flattened to 1-D, mirroring the one-hot baseline fit,
# instead of a hard-coded column name.
clf = lgb.LGBMClassifier(random_state=42, verbose=-1)
clf.fit(
    train_transform.drop(columns=target_col),
    train_transform[target_col].values.ravel(),
)

  embedding.weight[self._categorical_encoder._mapping[col].loc[key], :]


Output()

In [25]:
# Validation: encode with the fitted transformer, then score the LightGBM predictions.
val_transform = transformer.transform(val)
val_features = val_transform.drop(columns=target_col)
val_pred = clf.predict(val_features)
val_metrics = print_metrics(
    metrics, val_transform[target_col], val_pred, tag="Validation", return_dict=True
)

# Holdout: same steps on the test split.
test_transform = transformer.transform(test)
test_features = test_transform.drop(columns=target_col)
test_pred = clf.predict(test_features)
holdout_metrics = print_metrics(
    metrics, test_transform[target_col], test_pred, tag="Holdout", return_dict=True
)

Output()

Output()

Validation Accuracy: 0.8464067192757502 | Validation F1: 0.8458404314554572


Holdout Accuracy: 0.8465603001669506 | Holdout F1: 0.8458808412616188


In [27]:
# Summary row for LightGBM trained on the learned embeddings.
embedding_row = {
    "Mode": "NeuralEmbedding",
    "Validation Acc": val_metrics["Accuracy"],
    "Validation F1": val_metrics["F1"],
    "Holdout Acc": holdout_metrics["Accuracy"],
    "Holdout F1": holdout_metrics["F1"],
}
results.append(embedding_row)

In [28]:
# Comparison table: one column per encoding mode, one row per metric.
# Indexing by "Mode" before transposing makes the mode names the column header.
res_df = pd.DataFrame(results).set_index("Mode").T.astype(float)
res_df

Mode,OneHot Encoding,NeuralEmbedding
Validation Acc,0.852895,0.846407
Validation F1,0.852262,0.84584
Holdout Acc,0.851732,0.84656
Holdout F1,0.850964,0.845881
