In [1]:
import os
from sklearn.datasets import fetch_covtype
import random
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import category_encoders as ce
# %load_ext autoreload
# %autoreload 2

# Utility Functions

In [2]:
def load_classification_data():
    dataset = fetch_covtype(data_home="data")
    data = np.hstack([dataset.data, dataset.target.reshape(-1, 1)])
    col_names = [f"feature_{i}" for i in range(data.shape[-1])]
    col_names[-1] = "target"
    data = pd.DataFrame(data, columns=col_names)
    data["feature_0_cat"] = pd.qcut(data["feature_0"], q=4)
    data["feature_0_cat"] = "feature_0_" + data.feature_0_cat.cat.codes.astype(str)
    test_idx = data.sample(int(0.2 * len(data)), random_state=42).index
    test = data[data.index.isin(test_idx)]
    train = data[~data.index.isin(test_idx)]
    return (train, test, ["target"])

def print_metrics(y_true, y_pred, tag):
    if isinstance(y_true, pd.DataFrame) or isinstance(y_true, pd.Series):
        y_true = y_true.values
    if isinstance(y_pred, pd.DataFrame) or isinstance(y_pred, pd.Series):
        y_pred = y_pred.values
    if y_true.ndim>1:
        y_true=y_true.ravel()
    if y_pred.ndim>1:
        y_pred=y_pred.ravel()
    val_acc = accuracy_score(y_true, y_pred)
    val_f1 = f1_score(y_true, y_pred, average="macro")
    print(f"{tag} Acc: {val_acc} | {tag} F1: {val_f1}")
    return val_acc, val_f1

# Load Forest Cover Data

In [3]:
train, test, target_col = load_classification_data()
train, val = train_test_split(train, random_state=42)

In [4]:
cat_col_names = ["feature_0_cat"]
num_col_names = [col for col in train.columns if col not in cat_col_names+target_col]

In [5]:
encoder = ce.OneHotEncoder(cols=cat_col_names)
train_transform = encoder.fit_transform(train)
val_transform = encoder.transform(val)
test_transform = encoder.transform(test)

  for cat_name, class_ in values.iteritems():


## Baseline

Let's use the default LightGBM model as a baseline.

In [6]:
results = []

In [7]:
clf = lgb.LGBMClassifier(random_state=42, n_jobs=-1)
clf.fit(train_transform.drop(columns=target_col), train_transform[target_col].values.ravel())
val_pred = clf.predict(val_transform.drop(columns=target_col))
val_acc, val_f1 = print_metrics(val_transform[target_col], val_pred, "Validation")
test_pred = clf.predict(test_transform.drop(columns='target'))
holdout_acc, holdout_f1 = print_metrics(test_transform[target_col], test_pred, "Holdout")

Validation Acc: 0.8528953641472251 | Validation F1: 0.825508819288814
Holdout Acc: 0.8517323281871224 | Holdout F1: 0.8175407970429683


In [8]:
results.append({
    "Mode": "OneHot Encoding",
    "Validation Acc": val_acc,
    "Validation F1": val_f1,
    "Holdout Acc": holdout_acc,
    "Holdout F1": holdout_f1,
})

## CategoryEmbedding Model

In [9]:
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig, NodeConfig, TabNetModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
from pytorch_tabular.categorical_encoders import CategoricalEmbeddingTransformer
from pytorch_tabular.models.common.heads import LinearHeadConfig

In [10]:
data_config = DataConfig(
    target=target_col, #target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
    continuous_feature_transform="quantile_normal",
    normalize_continuous_features=True
)
trainer_config = TrainerConfig(
    auto_lr_find=True, # Runs the LRFinder to automatically derive a learning rate
    batch_size=1024,
    max_epochs=50,
    accelerator="auto", # can be 'cpu','gpu', 'tpu', or 'ipu' 
    devices=-1 # -1 means use all available
)
optimizer_config = OptimizerConfig()

# DEPRECATED
# prediction head is defined separately now and head & head_config will be made
# mandatory in future releases
# model_config = CategoryEmbeddingModelConfig(
#     task="classification",
#     layers="4096-4096-512",  # Number of nodes in each layer
#     activation="LeakyReLU", # Activation between each layers
#     learning_rate = 1e-3,
#     metrics=["accuracy", "f1"],
#     metrics_params=[{},{"average":"micro"}]
# )
head_config = LinearHeadConfig(
    layers="", # No additional layer in head, just a mapping layer to output_dim
    dropout=0.1,
    initialization="kaiming"
).__dict__ # Convert to dict to pass to the model config (OmegaConf doesn't accept objects)

model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="4096-4096-512", # Number of nodes in each layer
    activation="LeakyReLU", # Activation between each layers
    dropout=0.1,
    initialization="kaiming",
    head = "LinearHead", #Linear Head
    head_config = head_config, # Linear Head Config
    learning_rate = 1e-3,
    metrics=["accuracy", "f1_score"],
    metrics_params=[{},{"average":"micro"}],
    metrics_prob_input=[False, True]
)
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

In [11]:
tabular_model.fit(train=train)

  rank_zero_deprecation(
Global seed set to 42
  X_encoded.loc[:, col] = X_encoded[col].fillna(NAN_CATEGORY).map(mapping["value"])
  X_encoded.loc[:, col] = X_encoded[col].fillna(NAN_CATEGORY).map(mapping["value"])
Auto select gpus: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(
  rank_zero_warn(


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=86` reached.
LR finder stopped early after 86 steps due to diverging loss.
Learning rate set to 7.585775750291836e-06
Restoring states from the checkpoint path at /home/manujosephv/pytorch_tabular/docs/.lr_find_6b326707-446a-4ed5-89d7-be598f17d4b2.ckpt
Restored all states from the checkpoint file at /home/manujosephv/pytorch_tabular/docs/.lr_find_6b326707-446a-4ed5-89d7-be598f17d4b2.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type                      | Params
---------------------------------------------------------------
0 | _backbone        | CategoryEmbeddingBackbone | 19.1 M
1 | _embedding_layer | Embedding1dLayer          | 123   
2 | head             | LinearHead                | 3.6 K 
3 | loss             | CrossEntropyLoss          | 0     
---------------------------------------------------------------
19.1 M    Trainable params
0         Non-trainable params
19.1 M    Total params
76.481    Total estimated model p

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=50` reached.
  rank_zero_deprecation(


<pytorch_lightning.trainer.trainer.Trainer at 0x7f83f94e6b60>

In [12]:
result = tabular_model.evaluate(test)
print(result)

  X_encoded.loc[:, col] = X_encoded[col].fillna(NAN_CATEGORY).map(mapping["value"])
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.3751767575740814, 'test_accuracy': 0.8502951860427856, 'test_f1_score': 0.8502951860427856}]


To get the prediction as a dataframe, we can use the `predict` method. This will add predictions to the same dataframe that was passed in. For classification problems, we get both the probabilities and the final prediction taking 0.5 as the threshold

In [13]:
pred_df = tabular_model.predict(test)
pred_df.head()

  X_encoded.loc[:, col] = X_encoded[col].fillna(NAN_CATEGORY).map(mapping["value"])


Generating Predictions...:   0%|          | 0/114 [00:00<?, ?it/s]

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,target,feature_0_cat,1.0_probability,2.0_probability,3.0_probability,4.0_probability,5.0_probability,6.0_probability,7.0_probability,prediction
0,2596.0,51.0,3.0,258.0,0.0,510.0,221.0,232.0,148.0,6279.0,...,5.0,feature_0_0,0.018046,0.607462,4.9e-05,2.1e-05,0.374148,0.000198,7.6e-05,2.0
2,2804.0,139.0,9.0,268.0,65.0,3180.0,234.0,238.0,135.0,6121.0,...,2.0,feature_0_0,0.083914,0.915731,2.5e-05,4.3e-05,0.000135,1.5e-05,0.000137,2.0
6,2606.0,45.0,7.0,270.0,5.0,633.0,222.0,225.0,138.0,6256.0,...,5.0,feature_0_0,0.022408,0.799302,3.8e-05,1.9e-05,0.178057,0.000135,4.1e-05,2.0
7,2605.0,49.0,4.0,234.0,7.0,573.0,222.0,230.0,144.0,6228.0,...,5.0,feature_0_0,0.018728,0.662217,5.7e-05,2.6e-05,0.318685,0.000212,7.6e-05,2.0
12,2742.0,134.0,22.0,150.0,69.0,3215.0,248.0,224.0,92.0,6091.0,...,2.0,feature_0_0,0.006496,0.824902,0.00026,0.00016,0.168011,4.9e-05,0.000122,2.0


In [14]:
print_metrics(test['target'], pred_df["prediction"], tag="Holdout")

Holdout Acc: 0.8502951756424157 | Holdout F1: 0.7648732188985632


(0.8502951756424157, 0.7648732188985632)

## Extract the Learned Embedding

For the models that support (CategoryEmbeddingModel and CategoryEmbeddingNODE), we can extract the learned embeddings into a sci-kit learn style Transformer. You can use this in your Sci-kit Learn pipelines and workflows as a drop in replacement.

In [15]:
transformer = CategoricalEmbeddingTransformer(tabular_model)
train_transform = transformer.fit_transform(train)
clf = lgb.LGBMClassifier(random_state=42)
clf.fit(train_transform.drop(columns='target'), train_transform['target'])

Encoding the data...:   0%|          | 0/1 [00:00<?, ?it/s]

In [16]:
val_transform = transformer.transform(val)
val_pred = clf.predict(val_transform.drop(columns=target_col))
val_acc, val_f1 = print_metrics(val_transform[target_col], val_pred, "Validation")
test_transform = transformer.transform(test)
test_pred = clf.predict(test_transform.drop(columns=target_col))
holdout_acc, holdout_f1 = print_metrics(test_transform[target_col], test_pred, "Holdout")

Encoding the data...:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Acc: 0.8513893789316971 | Validation F1: 0.81951870361744


Encoding the data...:   0%|          | 0/1 [00:00<?, ?it/s]

Holdout Acc: 0.8497444105953426 | Holdout F1: 0.8116797898686988


In [17]:
results.append({
    "Mode": "OneHot Encoding",
    "Validation Acc": val_acc,
    "Validation F1": val_f1,
    "Holdout Acc": holdout_acc,
    "Holdout F1": holdout_f1,
})

In [21]:
res_df = pd.DataFrame(results).T
res_df.columns = res_df.iloc[0]
res_df = res_df.iloc[1:].astype(float)
res_df

Mode,OneHot Encoding,OneHot Encoding.1
Validation Acc,0.852895,0.851389
Validation F1,0.825509,0.819519
Holdout Acc,0.851732,0.849744
Holdout F1,0.817541,0.81168
