In [1]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import random
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import accuracy_score, f1_score
import os
os.chdir("..")

In [2]:
def make_mixed_classification(n_samples, n_features, n_categories):
    X,y = make_classification(n_samples=n_samples, n_features=n_features, random_state=42)
    cat_cols = random.choices(list(range(X.shape[-1])),k=n_categories)
    num_cols = [i for i in range(X.shape[-1]) if i not in cat_cols]
    for col in cat_cols:
        X[:,col] = pd.qcut(X[:,col], q=4).codes.astype(int)
    col_names = [] 
    num_col_names=[]
    cat_col_names=[]
    for i in range(X.shape[-1]):
        if i in cat_cols:
            col_names.append(f"cat_col_{i}")
            cat_col_names.append(f"cat_col_{i}")
        if i in num_cols:
            col_names.append(f"num_col_{i}")
            num_col_names.append(f"num_col_{i}")
    X = pd.DataFrame(X, columns=col_names)
    y = pd.Series(y, name="target")
    data = X.join(y)
    return data, cat_col_names, num_col_names

In [3]:
data, cat_col_names, num_col_names = make_mixed_classification(n_samples=10000, n_features=20, n_categories=4)
train, test = train_test_split(data, random_state=42)
train, val = train_test_split(train, random_state=42)

## Baseline

In [4]:
clf = lgb.LGBMClassifier(random_state=42)
clf.fit(train.drop(columns='target'), train['target'], categorical_feature=cat_col_names)

LGBMClassifier(random_state=42)

In [5]:
val_pred = clf.predict(val.drop(columns='target'))
val_acc = accuracy_score(val['target'].values.ravel(), val_pred)
val_f1 = f1_score(val['target'].values.ravel(), val_pred)
print(f"Val Acc: {val_acc} | Val F1: {val_f1}")

Val Acc: 0.936 | Val F1: 0.9363057324840763


In [6]:
test_pred = clf.predict(test.drop(columns='target'))
test_acc = accuracy_score(test['target'].values.ravel(), test_pred)
test_f1 = f1_score(test['target'].values.ravel(), test_pred)
print(f"Test Acc: {test_acc} | Test F1: {test_f1}")

Test Acc: 0.9356 | Test F1: 0.936986301369863


In [7]:
from pytorch_tabular import TabularModel, CategoryEmbeddingModelConfig, NodeConfig, TabNetModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
from pytorch_tabular.category_encoders import CategoricalEmbeddingTransformer

## Category Embedding Model

In [8]:
data_config = DataConfig(
    target=['target'],
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
    continuous_feature_transform=None,#"yeo-johnson",
    normalize_continuous_features=True
)
trainer_config = TrainerConfig(
    auto_lr_find=False,
    batch_size=1024,
    max_epochs=1000,
    gpus=1,
    # track_grad_norm=2,
    gradient_clip_val=10,
)
# experiment_config = ExperimentConfig(project_name="Tabular_test", log_logits=True)
optimizer_config = OptimizerConfig()

model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="128-64-16", 
    activation="LeakyReLU",
    learning_rate = 1e-3
    # metrics=["auroc",""]
)
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

In [9]:
tabular_model.fit(train=train, test=test)

GPU available: True, used: False
GPU available: True, used: False
TPU available: False, using: 0 TPU cores
TPU available: False, using: 0 TPU cores
GPU available: True, used: True
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name                   | Type             | Params
------------------------------------------------------------
0 | embedding_layers       | ModuleList       | 45    
1 | normalizing_batch_norm | BatchNorm1d      | 34    
2 | linear_layers          | Sequential       | 12.8 K
3 | loss                   | CrossEntropyLoss | 0     

  | Name                   | Type             | Params
------------------------------------------------------------
0 | embedding_layers       | ModuleList       | 45    
1 | normalizing_batch_norm | BatchNorm1d      | 34    
2 | linear_layers          | Sequential       | 12.8 K
3 |

In [10]:
result = tabular_model.evaluate(test)
print(result)

Testing: 0it [00:00, ?it/s]--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_accuracy': tensor(0.8788, device='cuda:0'),
 'train_accuracy': tensor(0.6243, device='cuda:0'),
 'train_loss': tensor(0.5143, device='cuda:0'),
 'valid_accuracy': tensor(0.8916, device='cuda:0'),
 'valid_loss': tensor(0.3676, device='cuda:0')}
--------------------------------------------------------------------------------
Testing: 100%|██████████| 3/3 [00:00<00:00, 38.08it/s]
[{'train_loss': 0.5142579078674316, 'valid_loss': 0.3675748109817505, 'valid_accuracy': 0.8915555477142334, 'train_accuracy': 0.6242941617965698, 'test_accuracy': 0.8787999749183655}]


In [11]:
cat_embed_pred_df = tabular_model.predict(test)

### Use Category Embedding

In [12]:
transformer = CategoricalEmbeddingTransformer(tabular_model)
transf_train = transformer.fit_transform(train)
clf = lgb.LGBMClassifier(random_state=42)
clf.fit(transf_train.drop(columns='target'), transf_train['target'])

LGBMClassifier(random_state=42)

In [14]:
transf_val = transformer.transform(val)
val_pred = clf.predict(transf_val.drop(columns='target'))
val_acc = accuracy_score(transf_val['target'].values.ravel(), val_pred)
val_f1 = f1_score(transf_val['target'].values.ravel(), val_pred)
print(f"Val Acc: {val_acc} | Val F1: {val_f1}")

Val Acc: 0.9354666666666667 | Val F1: 0.9357408390865639


In [15]:
transf_test = transformer.transform(test)
test_pred = clf.predict(transf_test.drop(columns='target'))
test_acc = accuracy_score(transf_test['target'].values.ravel(), test_pred)
test_f1 = f1_score(transf_test['target'].values.ravel(), test_pred)
print(f"Test Acc: {test_acc} | Test F1: {test_f1}")

Test Acc: 0.9352 | Test F1: 0.9366692728694292


## NODE

In [16]:
data_config = DataConfig(
    target=['target'],
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
    continuous_feature_transform=None,#"yeo-johnson",
    normalize_continuous_features=True
)
trainer_config = TrainerConfig(
    auto_lr_find=True,
    batch_size=64,
    max_epochs=1000,
    gpus=1,
    # track_grad_norm=2,
    gradient_clip_val=10,
)
# experiment_config = ExperimentConfig(project_name="Tabular_test", log_logits=True)
optimizer_config = OptimizerConfig()

model_config = NodeConfig(
    task="classification",
    num_layers=2,
    num_trees=1024,
    learning_rate=1,
    embed_categorical=False,
    # metrics=["MeanSquaredLogError"],
    # target_range=(train['block_0'].min().item(), train['block_0'].max().item())
)
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

In [17]:
tabular_model.fit(train=train, test=test)

Multi-Target Regression: using the first target({self.config.target[0]}) to encode the categorical columns
GPU available: True, used: False
GPU available: True, used: False
TPU available: False, using: 0 TPU cores
TPU available: False, using: 0 TPU cores
GPU available: True, used: True
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type             | Params
-----------------------------------------------------
0 | dense_block     | DenseODSTBlock   | 13.1 M
1 | output_response | Lambda           | 0     
2 | loss            | CrossEntropyLoss | 0     

  | Name            | Type             | Params
-----------------------------------------------------
0 | dense_block     | DenseODSTBlock   | 13.1 M
1 | output_response | Lambda           | 0     
2 | loss            | CrossEntropyLoss | 0     
Finding best initial 

In [18]:
result = tabular_model.evaluate(test)
print(result)

Testing: 100%|██████████| 40/40 [00:06<00:00,  6.79it/s]--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_accuracy': tensor(0.8996, device='cuda:0'),
 'train_accuracy': tensor(0.8809, device='cuda:0'),
 'train_loss': tensor(0.2498, device='cuda:0'),
 'valid_accuracy': tensor(0.8951, device='cuda:0'),
 'valid_loss': tensor(0.3093, device='cuda:0')}
--------------------------------------------------------------------------------
Testing: 100%|██████████| 40/40 [00:06<00:00,  6.64it/s]
[{'train_loss': 0.24976079165935516, 'valid_loss': 0.30926480889320374, 'valid_accuracy': 0.895111083984375, 'train_accuracy': 0.8808538913726807, 'test_accuracy': 0.8996000289916992}]


In [20]:
node_pred_df = tabular_model.predict(test)

## NODE (Cat Embed)

In [21]:
data_config = DataConfig(
    target=['target'],
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
    continuous_feature_transform=None,#"yeo-johnson",
    normalize_continuous_features=True
)
trainer_config = TrainerConfig(
    auto_lr_find=True,
    batch_size=64,
    max_epochs=1000,
    gpus=1,
    # track_grad_norm=2,
    gradient_clip_val=10,
)
# experiment_config = ExperimentConfig(project_name="Tabular_test", log_logits=True)
optimizer_config = OptimizerConfig()

model_config = NodeConfig(
    task="classification",
    num_layers=2,
    num_trees=1024,
    learning_rate=1,
    embed_categorical=True,
    # metrics=["MeanSquaredLogError"],
    # target_range=(train['block_0'].min().item(), train['block_0'].max().item())
)
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

In [22]:
tabular_model.fit(train=train, test=test)

0:22<00:00,  4.38it/s][A
Finding best initial lr: 100%|██████████| 100/100 [00:23<00:00,  4.38it/s][ALearning rate set to 0.10964781961431852
Learning rate set to 0.10964781961431852
Epoch 4: 100%|██████████| 89/89 [01:35<00:00,  1.57s/it, loss=0.317, train_loss=0.25, valid_loss=0.309, valid_accuracy=0.895, train_accuracy=0.873]
Finding best initial lr: 100%|██████████| 100/100 [00:23<00:00,  4.28it/s]


  | Name              | Type             | Params
-------------------------------------------------------
0 | embedding_layers  | ModuleList       | 45    
1 | embedding_dropout | Dropout          | 0     
2 | dense_block       | DenseODSTBlock   | 13.2 M
3 | output_response   | Lambda           | 0     
4 | loss              | CrossEntropyLoss | 0     

  | Name              | Type             | Params
-------------------------------------------------------
0 | embedding_layers  | ModuleList       | 45    
1 | embedding_dropout | Dropout          | 0     
2 | dense_block       | Den

In [23]:
result = tabular_model.evaluate(test)
print(result)

Testing: 100%|██████████| 40/40 [00:06<00:00,  6.69it/s]--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_accuracy': tensor(0.8484, device='cuda:0'),
 'train_accuracy': tensor(0.7829, device='cuda:0'),
 'train_loss': tensor(0.3493, device='cuda:0'),
 'valid_accuracy': tensor(0.8587, device='cuda:0'),
 'valid_loss': tensor(0.3570, device='cuda:0')}
--------------------------------------------------------------------------------
Testing: 100%|██████████| 40/40 [00:06<00:00,  6.54it/s]
[{'train_loss': 0.3492753207683563, 'valid_loss': 0.3570120930671692, 'valid_accuracy': 0.8586666584014893, 'train_accuracy': 0.7828785181045532, 'test_accuracy': 0.8483999967575073}]


In [24]:
cat_embed_node_pred_df = tabular_model.predict(test)

### Use Category embedding

In [25]:
transformer = CategoricalEmbeddingTransformer(tabular_model)
transf_train = transformer.fit_transform(train)
clf = lgb.LGBMClassifier(random_state=42)
clf.fit(transf_train.drop(columns='target'), transf_train['target'])

LGBMClassifier(random_state=42)

In [26]:
transf_val = transformer.transform(val)
val_pred = clf.predict(transf_val.drop(columns='target'))
val_acc = accuracy_score(transf_val['target'].values.ravel(), val_pred)
val_f1 = f1_score(transf_val['target'].values.ravel(), val_pred)
print(f"Val Acc: {val_acc} | Val F1: {val_f1}")

Val Acc: 0.9322666666666667 | Val F1: 0.932410856838744


In [27]:
transf_test = transformer.transform(test)
test_pred = clf.predict(transf_test.drop(columns='target'))
test_acc = accuracy_score(transf_test['target'].values.ravel(), test_pred)
test_f1 = f1_score(transf_test['target'].values.ravel(), test_pred)
print(f"Test Acc: {test_acc} | Test F1: {test_f1}")

Test Acc: 0.9368 | Test F1: 0.9383294301327089


In [24]:
## TabNet

In [25]:
data_config = DataConfig(
    target=['target'],
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
    continuous_feature_transform=None,#"yeo-johnson",
    normalize_continuous_features=True
)
trainer_config = TrainerConfig(
    auto_lr_find=False,
    batch_size=1024,
    max_epochs=1000,
    gpus=1,
    # track_grad_norm=2,
    gradient_clip_val=10,
)
# experiment_config = ExperimentConfig(project_name="Tabular_test", log_logits=True)
optimizer_config = OptimizerConfig()

model_config = TabNetModelConfig(
    task="classification",
    n_d=5,
    n_a=5,
    n_steps=2,
    n_independent=2,
    n_shared=2,
    learning_rate=1e-3
)
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

7/7 [00:00<00:00, 19.07it/s, loss=0.613, train_loss=0.605, valid_loss=0.612, valid_accuracy=0.676, train_accuracy=0.672]
Epoch 20:  71%|███████▏  | 5/7 [00:00<00:00, 16.99it/s, loss=0.608, train_loss=0.634, valid_loss=0.612, valid_accuracy=0.676, train_accuracy=0.67]
Epoch 20: 100%|██████████| 7/7 [00:00<00:00, 19.12it/s, loss=0.608, train_loss=0.634, valid_loss=0.604, valid_accuracy=0.687, train_accuracy=0.67]
Epoch 21:  71%|███████▏  | 5/7 [00:00<00:00, 17.00it/s, loss=0.599, train_loss=0.582, valid_loss=0.604, valid_accuracy=0.687, train_accuracy=0.68]
Epoch 21: 100%|██████████| 7/7 [00:00<00:00, 19.29it/s, loss=0.599, train_loss=0.582, valid_loss=0.596, valid_accuracy=0.697, train_accuracy=0.68]
Epoch 22:  71%|███████▏  | 5/7 [00:00<00:00, 16.82it/s, loss=0.592, train_loss=0.587, valid_loss=0.596, valid_accuracy=0.697, train_accuracy=0.696]
Epoch 22: 100%|██████████| 7/7 [00:00<00:00, 18.72it/s, loss=0.592, train_loss=0.587, valid_loss=0.589, valid_accuracy=0.708, train_accuracy=0.

In [26]:
tabular_model.fit(train=train, test=test)

Testing: 100%|██████████| 3/3 [00:00<00:00, 27.85it/s]--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_accuracy': tensor(0.8600, device='cuda:0'),
 'train_accuracy': tensor(0.8683, device='cuda:0'),
 'train_loss': tensor(0.3278, device='cuda:0'),
 'valid_accuracy': tensor(0.8773, device='cuda:0'),
 'valid_loss': tensor(0.3365, device='cuda:0')}
--------------------------------------------------------------------------------
Testing: 100%|██████████| 3/3 [00:00<00:00, 23.87it/s]
[{'train_loss': 0.3278144598007202, 'valid_loss': 0.3364557921886444, 'valid_accuracy': 0.8773333430290222, 'train_accuracy': 0.8682800531387329, 'test_accuracy': 0.8600000143051147}]


In [27]:
result = tabular_model.evaluate(test)
print(result)