In [2]:
from pytorch_tabnet.tab_model import TabNetClassifier

import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

import pandas as pd
import numpy as np
np.random.seed(0)


import os
import wget
from pathlib import Path

from matplotlib import pyplot as plt
%matplotlib inline

# Download census-income dataset

In [3]:
# Remote location of the raw data and the local file it is cached under
# (<current working directory>/data/<dataset_name>.csv).
dataset_name = 'census-income'
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
out = Path(f"{os.getcwd()}/data/{dataset_name}.csv")

In [4]:
# Make sure the target directory exists, then download only when no local copy is present.
out.parent.mkdir(parents=True, exist_ok=True)
if not out.exists():
    print("Downloading file...")
    wget.download(url, out.as_posix())
else:
    print("File already exists.")

File already exists.


# Load data and split

In [5]:
# The raw file has no header row, so it is read with header=None and the
# columns are labelled 0..14 by position.
train = pd.read_csv(out, header=None)
# With header=None there is no column named ' <=50K'; the label lives in column 14.
target = 14
# Randomly tag every row as train / valid / test (80% / 10% / 10%),
# unless a "Set" column is already present.
if "Set" not in train.columns:
    train["Set"] = np.random.choice(["train", "valid", "test"], p =[.8, .1, .1], size=(train.shape[0],))

# Row labels belonging to each split; used later to slice the feature/target arrays.
train_indices = train.index[train["Set"] == "train"]
valid_indices = train.index[train["Set"] == "valid"]
test_indices = train.index[train["Set"] == "test"]

# Simple preprocessing

Label encode categorical features and fill empty cells.

In [6]:
# Inspect the dtype pandas inferred for each column right after loading.
train.dtypes

0       int64
1      object
2       int64
3      object
4       int64
5      object
6      object
7      object
8      object
9      object
10      int64
11      int64
12      int64
13     object
14     object
Set    object
dtype: object

In [7]:
# Number of rows in the loaded frame (all three splits together).
len(train)

32561

In [1]:
# NOTE(review): the recorded run of this cell (execution count 1) raised NameError
# because `train` was not defined at that point; it only works once the
# load cell has been executed.
train.columns

NameError: name 'train' is not defined

In [8]:
# Per-column distinct-value counts and dtypes, captured before any column is modified.
nunique = train.nunique()
types = train.dtypes

# Columns treated as categorical, and the number of encoded classes for each of them.
categorical_columns = []
categorical_dims =  {}
for col in train.columns:
    # A column counts as categorical when it holds strings OR has fewer than
    # 200 distinct values (so low-cardinality integer columns, and the "Set"
    # split column, are label-encoded as well).
    if types[col] == 'object' or nunique[col] < 200:
        print(col, nunique[col])
        l_enc = LabelEncoder()
        # Missing values become their own category before encoding.
        train[col] = train[col].fillna("VV_likely")
        train[col] = l_enc.fit_transform(train[col].values)
        categorical_columns.append(col)
        categorical_dims[col] = len(l_enc.classes_)
    else:
        # Numeric column: fill gaps in THIS column only, using the mean of the
        # training rows. (Calling fillna on the whole frame would write this
        # column's mean into the missing cells of every other column.)
        train[col] = train[col].fillna(train.loc[train_indices, col].mean())

0 73
1 9
3 16
4 16
5 7
6 15
7 6
8 5
9 2
10 119
11 92
12 94
13 42
14 2
Set 3


In [9]:
# Columns that the encoding loop label-encoded (object dtype or fewer than 200 distinct values).
categorical_columns

[0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 'Set']

In [10]:
# Re-check the dtypes after encoding: the recorded output shows only integer columns.
train.dtypes

0      int64
1      int32
2      int64
3      int32
4      int64
5      int32
6      int32
7      int32
8      int32
9      int32
10     int64
11     int64
12     int64
13     int32
14     int32
Set    int32
dtype: object

In [11]:
# NOTE(review): same inspection as two cells earlier; the recorded output is unchanged.
categorical_columns

[0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 'Set']

In [12]:
# Column 14 is used as the prediction target from here on
# (the encoding output above lists it with 2 distinct values).
target=14

# Define categorical features for categorical embeddings

In [13]:
# Bookkeeping column that is never passed to the model.
unused_feat = ['Set']

# Model inputs: every column that is neither bookkeeping nor the target.
features = [col for col in train.columns if col != target and col not in unused_feat]

# For every categorical input, remember its position inside `features` ...
categorical_positions = [(i, f) for i, f in enumerate(features) if f in categorical_columns]
cat_idxs = [i for i, _ in categorical_positions]

# ... and its number of encoded classes, in the same order.
cat_dims = [categorical_dims[f] for _, f in categorical_positions]


In [14]:
# Model input columns: every column except 'Set' and the target.
features

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]

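NB: Column 2 is the only feature that is not treated as categorical: it has at least 200 distinct values, so the preprocessing loop leaves it numeric and it is absent from `cat_idxs`.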

In [15]:
# Positions within `features` of the columns flagged as categorical.
cat_idxs

[0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]

In [16]:
# Number of encoded classes for each entry of `cat_idxs`, in the same order.
cat_dims

[73, 9, 16, 16, 7, 15, 6, 5, 2, 119, 92, 94, 42]

# Network parameters

In [2]:
# NOTE(review): the recorded output is True (execution count 2), yet the classifier
# cell that follows logged "Device used : cpu". The two outputs presumably come
# from different kernel sessions; re-run both to confirm.
torch.cuda.is_available()

True

In [17]:
# Classifier settings: categorical columns get 1-dimensional embeddings,
# Adam at lr=2e-2, and a StepLR schedule configured with step_size=50 / gamma=0.9.
tabnet_params = dict(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=1,
    optimizer_fn=torch.optim.Adam,
    optimizer_params={"lr": 2e-2},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params={"step_size": 50, "gamma": 0.9},  # how to use learning rate scheduler
    mask_type='entmax',  # "sparsemax"
)
clf = TabNetClassifier(**tabnet_params)

Device used : cpu


# Training

In [17]:
# Materialise the feature matrix and label vector once, then slice each split out of them.
feature_matrix = train[features].values
label_vector = train[target].values

X_train, y_train = feature_matrix[train_indices], label_vector[train_indices]
X_valid, y_valid = feature_matrix[valid_indices], label_vector[valid_indices]
X_test, y_test = feature_matrix[test_indices], label_vector[test_indices]

In [19]:
# Use a 2-epoch smoke run when the CI environment variable is set to a non-empty value,
# and the full 1000-epoch budget otherwise.
max_epochs = 2 if os.getenv("CI", False) else 1000

In [29]:
# Settings later handed to update_fit_params / construct_loaders.
# Objective and sample weighting.
loss_fn = torch.nn.functional.cross_entropy
weights = 1

# Early-stopping patience.
patience = 20

# Batching and data-loader options.
batch_size, virtual_batch_size = 1024, 128
num_workers = 0
drop_last = False

## fit

### Details

#### TabNetClassifier

In [31]:
from pytorch_tabnet.tab_network import TabNet
from pytorch_tabnet.tab_model import TabModel, TabNetClassifier

##### update_fit_params

In [32]:
# Hand the data splits and the fit-time settings to the classifier (all positional).
clf.update_fit_params(
    X_train, y_train,
    X_valid, y_valid,
    loss_fn, weights, max_epochs, patience,
    batch_size, virtual_batch_size, num_workers, drop_last,
)

In [36]:
# Weights value held by the classifier after update_fit_params; recorded output is 1.
clf.updated_weights

1

In [37]:
# Best-cost value held by the classifier; recorded output before any training is inf.
clf.best_cost

inf

In [38]:
# Output dimension reported by the classifier; recorded output is 2.
clf.output_dim

2

##### construct_loaders

In [39]:
# Build the training and validation loaders from the settings stored on the classifier.
trainloader, validloader = clf.construct_loaders(
    X_train, y_train,
    X_valid, y_valid,
    clf.updated_weights,
    clf.batch_size,
    clf.num_workers,
    clf.drop_last,
)

In [41]:
# Pull a single (features, labels) batch from the training loader for inspection.
x, y = next(iter(trainloader))

In [47]:
# Batch size that was requested; compare with the batch shape shown in the next cell.
batch_size

1024

In [49]:
# Shapes of the sampled batch; recorded output is (1024, 14) for x and (1024,) for y.
x.shape, y.shape

(torch.Size([1024, 14]), torch.Size([1024]))

##### init_network

In [None]:
# NOTE(review): this cell has no execution count, i.e. it was never run. TabNet is
# called without arguments; the constructor presumably requires at least the input
# and output dimensions. Confirm against pytorch_tabnet.tab_network before running.
network = TabNet()

In [None]:
# Train on the training split, monitoring the validation split.
fit_settings = dict(
    max_epochs=max_epochs,
    patience=20,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False,
)
clf.fit(
    X_train=X_train, y_train=y_train,
    X_valid=X_valid, y_valid=y_valid,
    **fit_settings
)

In [None]:
# plot losses
# Label both curves so the train and validation histories can be told apart.
plt.plot(clf.history['train']['loss'], label='train')
plt.plot(clf.history['valid']['loss'], label='valid')
plt.ylabel('loss')
plt.title('Loss history')
plt.legend();

In [None]:
# plot auc
# The stored metric values are negated before plotting; presumably the history
# keeps them with flipped sign. TODO confirm against the training code.
plt.plot([-x for x in clf.history['train']['metric']], label='train')
plt.plot([-x for x in clf.history['valid']['metric']], label='valid')
plt.ylabel('AUC')
plt.title('Metric history')
plt.legend();

In [None]:
# plot learning rates
# Learning-rate values recorded in the training history.
plt.plot(list(clf.history['train']['lr']))

## Predictions

In [None]:
# Score the test split: AUC is computed from column 1 of the predicted probabilities.
preds = clf.predict_proba(X_test)
test_auc = roc_auc_score(y_true=y_test, y_score=preds[:, 1])

# Report the classifier's best_cost next to the test AUC.
print(f"BEST VALID SCORE FOR {dataset_name} : {clf.best_cost}")
print(f"FINAL TEST SCORE FOR {dataset_name} : {test_auc}")

# Save and load Model

In [None]:
# Save the fitted TabNet model under the given path name; save_model returns the
# path that is passed to load_model in the next cell.
saving_path_name = "./tabnet_model_test_1"
saved_filepath = clf.save_model(saving_path_name)

In [None]:
# Create a fresh classifier with default parameters and restore the saved model into it.
loaded_clf = TabNetClassifier()
loaded_clf.load_model(saved_filepath)

In [None]:
# Repeat the test-set scoring with the reloaded model.
loaded_preds = loaded_clf.predict_proba(X_test)
loaded_test_auc = roc_auc_score(y_true=y_test, y_score=loaded_preds[:, 1])

print(f"FINAL TEST SCORE FOR {dataset_name} : {loaded_test_auc}")

In [None]:
# The reloaded model must reproduce the original test AUC exactly.
assert test_auc == loaded_test_auc

# Global explainability: feature importances summing to 1

In [None]:
# Feature importances of the fitted classifier (per the section heading, they sum to 1).
clf.feature_importances_

# Local explainability and masks

In [None]:
# explain() returns two objects for the test rows; `masks` is indexed with 0..2
# by the plotting cell that follows.
explain_matrix, masks = clf.explain(X_test)

In [None]:
# One panel per mask, showing the first 50 rows of each.
fig, axs = plt.subplots(1, 3, figsize=(20,20))

for i, ax in enumerate(axs):
    ax.imshow(masks[i][:50])
    ax.set_title(f"mask {i}")


# XGB

In [None]:
from xgboost import XGBClassifier

# XGBoost model trained on the same splits, for comparison with TabNet.
# NOTE(review): `silent`, `nthread` and `seed` are passed as None alongside
# `verbosity`, `n_jobs` and `random_state`; presumably legacy aliases. Confirm
# they are still accepted by the installed xgboost version.
xgb_params = dict(
    max_depth=8,
    learning_rate=0.1,
    n_estimators=1000,
    verbosity=0,
    silent=None,
    objective='binary:logistic',
    booster='gbtree',
    n_jobs=-1,
    nthread=None,
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=0.7,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    random_state=0,
    seed=None,
)
clf_xgb = XGBClassifier(**xgb_params)

# Fit with the validation split as the evaluation set.
clf_xgb.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    early_stopping_rounds=40,
    verbose=10,
)

In [None]:
# AUC of the XGBoost model on the validation split, then on the test split.
# NOTE(review): `preds` and `test_auc` are reassigned here, replacing the TabNet
# values computed earlier in the notebook.
preds = np.array(clf_xgb.predict_proba(X_valid))
valid_auc = roc_auc_score(y_true=y_valid, y_score=preds[:, 1])
print(valid_auc)

preds = np.array(clf_xgb.predict_proba(X_test))
test_auc = roc_auc_score(y_true=y_test, y_score=preds[:, 1])
print(test_auc)