In [1]:
from pytorch_tabnet.tab_model import TabNetRegressor

import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np
np.random.seed(0)


import os
import wget
from pathlib import Path

# Download census-income dataset

In [2]:
# Where the raw data comes from and where the local copy is stored
# (a "data" folder under the current working directory).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
dataset_name = 'census-income'
out = Path(f"{os.getcwd()}/data/{dataset_name}.csv")

In [3]:
# Make sure the target folder exists, then fetch the file only when there is
# no local copy yet.
out.parent.mkdir(parents=True, exist_ok=True)
if not out.exists():
    print("Downloading file...")
    wget.download(url, out.as_posix())
else:
    print("File already exists.")

File already exists.


# Load data and split

In [4]:
# NOTE(review): the column names printed by the preprocessing cell
# (' State-gov', ' Bachelors', ...) and the target name ' <=50K' look like
# values of the first data record, i.e. read_csv is presumably consuming one
# row as the header. Confirm, and consider header=None with explicit names.
train = pd.read_csv(out)
target = ' <=50K'

# Tag every row as train/valid/test (80/10/10) unless the frame already
# carries a "Set" column.
if "Set" not in train.columns:
    train["Set"] = np.random.choice(
        ["train", "valid", "test"],
        p=[.8, .1, .1],
        size=(train.shape[0],),
    )

# Row labels belonging to each split.
train_indices = train.index[(train["Set"] == "train").values]
valid_indices = train.index[(train["Set"] == "valid").values]
test_indices = train.index[(train["Set"] == "test").values]

# Simple preprocessing

Label encode categorical features and fill empty cells.

In [5]:
# Label-encode every object-dtype column in place and remember, per column,
# how many distinct classes the encoder saw.
categorical_columns = []
categorical_dims = {}
for col in train.columns[train.dtypes == object]:
    print(col, train[col].nunique())
    l_enc = LabelEncoder()
    # Missing cells become their own category before encoding.
    train[col] = train[col].fillna("VV_likely")
    train[col] = l_enc.fit_transform(train[col].values)
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)

# Impute each float column with its OWN mean, computed on training rows only.
# The fill must target the single column: calling fillna on the whole frame
# would write the first float column's mean into every remaining NaN of every
# column, leaving nothing for later columns to fill with their own mean.
for col in train.columns[train.dtypes == 'float64']:
    train[col] = train[col].fillna(train.loc[train_indices, col].mean())

 State-gov 9
 Bachelors 16
 Never-married 7
 Adm-clerical 15
 Not-in-family 6
 White 5
 Male 2
 United-States 42
 <=50K 2
Set 3


# Define categorical features for categorical embeddings

In [6]:
# Columns that are neither model inputs nor the target.
unused_feat = ['Set']

# Model inputs: everything except the bookkeeping column(s) and the target.
features = [col for col in train.columns if col != target and col not in unused_feat]

# Positions (within `features`) of the label-encoded columns ...
cat_idxs = [pos for pos, name in enumerate(features) if name in categorical_columns]

# ... and the number of classes for each of those columns, in the same order.
cat_dims = [categorical_dims[features[pos]] for pos in cat_idxs]

# define your embedding sizes : here just a random choice
cat_emb_dim = [5, 4, 3, 6, 2, 2, 1, 10]

# Network parameters

In [12]:
# Display the number of encoder classes for each categorical feature,
# in feature order.
cat_dims

[9, 16, 7, 15, 6, 5, 2, 42]

In [7]:
# Build the regressor, passing the positions of the categorical inputs,
# their class counts, and the embedding size chosen for each of them.
clf = TabNetRegressor(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=cat_emb_dim,
)

Device used : cuda


# Training

### We will simulate 8 targets here to perform multi-regression without changing anything!

In [10]:
# NOTE(review): this cell carries execution count 10 but sits above the cell
# (count 8) that assigns y_train; on a fresh top-to-bottom run y_train is not
# yet defined at this point in the visible notebook. Consider moving this
# cell below the split cell.
y_train.shape

(26072, 8)

In [8]:
n_targets = 8


def _subset(indices):
    """Return (X, y) for the given rows.

    X holds the feature columns; y is the single target column repeated
    n_targets times, giving one column per simulated target.
    """
    X = train[features].values[indices]
    y = train[target].values[indices]
    return X, np.tile(y, (n_targets, 1)).T


X_train, y_train = _subset(train_indices)
X_valid, y_valid = _subset(valid_indices)
X_test, y_test = _subset(test_indices)

In [9]:
# Keep the run short (2 epochs) when a non-empty CI environment variable is
# set; otherwise allow up to 1000 epochs.
max_epochs = 2 if os.getenv("CI", False) else 1000

In [None]:
# Train on the training split and monitor the validation split; `patience`
# is the number of non-improving rounds tolerated before stopping.
clf.fit(
    X_train=X_train,
    y_train=y_train,
    X_valid=X_valid,
    y_valid=y_valid,
    max_epochs=max_epochs,
    patience=50,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False,
)

Will train until validation stopping metric hasn't improved in 50 rounds.
---------------------------------------
| EPOCH |  train  |   valid  | total time (s)
| 1     | -0.34480 |  -0.49846 |   7.4       
| 2     | -0.14695 |  -0.19283 |   10.7      
| 3     | -0.13052 |  -0.21267 |   14.2      
| 4     | -0.12491 |  -0.16150 |   17.6      
| 5     | -0.12119 |  -0.15077 |   21.0      
| 6     | -0.11902 |  -0.12248 |   24.4      
| 7     | -0.11552 |  -0.12136 |   27.9      
| 8     | -0.11358 |  -0.11512 |   31.4      
| 9     | -0.11161 |  -0.10789 |   34.7      
| 10    | -0.11047 |  -0.10758 |   38.3      
| 11    | -0.10948 |  -0.10497 |   41.7      
| 12    | -0.10831 |  -0.10300 |   45.1      
| 13    | -0.10748 |  -0.10234 |   48.6      
| 14    | -0.10769 |  -0.10131 |   51.9      
| 15    | -0.10556 |  -0.10097 |   55.3      
| 16    | -0.10568 |  -0.10244 |   59.0      
| 17    | -0.10428 |  -0.09981 |   62.5      
| 18    | -0.10341 |  -0.10108 |   65.9      
| 19    | -0

In [None]:
# No explicit clf.load_best_model() call here: it is deprecated because the
# best model is automatically loaded at the end of fit.
preds = clf.predict(X_test)
y_true = y_test

# Mean squared error of the predictions against the test targets.
test_score = mean_squared_error(y_true=y_true, y_pred=preds)

print(f"BEST VALID SCORE FOR {dataset_name} : {clf.best_cost}")
print(f"FINAL TEST SCORE FOR {dataset_name} : {test_score}")

# Global explainability : feat importance summing to 1

In [None]:
# Display the fitted model's global feature importances (described in this
# notebook as summing to 1).
clf.feature_importances_

# Local explainability and masks

In [None]:
# Compute local explanations for the test rows. `masks` is indexed by mask
# number (masks[0], masks[1], ...) when plotted further down.
explain_matrix, masks = clf.explain(X_test)

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# One panel per mask, side by side; each panel shows the first 50 rows of
# the corresponding mask as an image.
fig, axs = plt.subplots(1, 3, figsize=(20,20))

for i, ax in enumerate(axs):
    ax.imshow(masks[i][:50])
    ax.set_title(f"mask {i}")


# XGB : unfortunately this is still not possible with XGBoost

https://github.com/dmlc/xgboost/issues/2087