In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pytorch_tabnet.tab_model import TabNetClassifier

import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
np.random.seed(0)

import os
import wget
from pathlib import Path

from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
# Pick the compute device: CUDA when PyTorch reports a usable GPU, CPU otherwise.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'device: {device}')

device: cuda


# Load data and split

In [4]:
# Column schema for the forest cover-type CSV. The file is read without a header
# row, so these names are assigned positionally by
# `pd.read_csv(..., header=None, names=feature_columns)` in the loading cell.
# This cell must be live: later cells reference `feature_columns` and `target`.

# Name of the label column (last column of the file).
target = "Covertype"

# Indicator columns: Wilderness_Area1..4 followed by Soil_Type1..40.
bool_columns = (
    [f"Wilderness_Area{i}" for i in range(1, 5)]
    + [f"Soil_Type{i}" for i in range(1, 41)]
)

# Integer-valued measurement columns.
int_columns = [
    "Elevation", "Aspect", "Slope", "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology", "Horizontal_Distance_To_Roadways",
    "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points"
]

# Full column order: integer columns, indicator columns, then the label.
feature_columns = int_columns + bool_columns + [target]

In [7]:
# Name of the dataset. The prediction cells further down interpolate
# `dataset_name` into the scores they print, so it has to be defined here.
dataset_name = 'forest-cover-type'

# The CSV has no header row; column names come from `feature_columns`.
train = pd.read_csv(f'data/{dataset_name}.csv', header=None, names=feature_columns)
# Alternative input used in earlier experiments:
# train = pd.read_csv('data/synthetic_easy.csv', header=None)

n_total = len(train)

# Train, val and test split follows
# Rory Mitchell, Andrey Adinets, Thejaswi Rao, and Eibe Frank.
# Xgboost: Scalable GPU accelerated learning. arXiv:1806.11248, 2018.

# First hold out 20% of the row positions for testing, then split the
# remaining positions into train / validation. Both splits are seeded.
train_val_indices, test_indices = train_test_split(
    range(n_total), test_size=0.2, random_state=0)
train_indices, valid_indices = train_test_split(
    train_val_indices, test_size=0.2 / 0.6, random_state=0)

  train = pd.read_csv('data/synthetic_easy.csv', header=None)


# Simple preprocessing

Label-encode categorical features and fill empty cells.

In [9]:
# Label-encode every object-typed column and record how many distinct codes it
# has. `categorical_columns` / `categorical_dims` are consumed when the
# categorical indices and dimensions are built for the model.
categorical_columns = []
categorical_dims = {}
for col in train.columns[train.dtypes == object]:
    print(col, train[col].nunique())
    l_enc = LabelEncoder()
    # Missing values become their own category before encoding.
    train[col] = train[col].fillna("VV_likely")
    train[col] = l_enc.fit_transform(train[col].values)
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)

# Fill missing floats column by column, using the mean of the training rows
# only. Calling `train.fillna(<one column's mean>)` on the whole frame would
# write that single mean into every column that has missing values.
for col in train.columns[train.dtypes == 'float64']:
    train[col] = train[col].fillna(train.loc[train_indices, col].mean())

# Define categorical features for categorical embeddings

In [None]:
# This is a generic pipeline but actually no categorical features are available for this dataset

# Columns to leave out of the model input in addition to the label.
unused_feat = []

# Model input columns, in frame order.
features = [col for col in train.columns if col not in unused_feat + [target]]

# Positions (within `features`) of the categorical columns ...
cat_idxs = [i for i, f in enumerate(features) if f in categorical_columns]

# ... and the number of distinct codes of each, in the same order.
cat_dims = [categorical_dims[f] for f in features if f in categorical_columns]

In [None]:
# Build the feature matrix and label vector once, then index them per split
# (previously the full arrays were rebuilt from the frame for every split).
X_all = train[features].values
y_all = train[target].values

X_train, y_train = X_all[train_indices], y_all[train_indices]
if os.getenv("CI", False):
    # Take only a subsample to run CI
    X_train, y_train = X_train[:1000, :], y_train[:1000]

X_valid, y_valid = X_all[valid_indices], y_all[valid_indices]
X_test, y_test = X_all[test_indices], y_all[test_indices]

In [None]:
# Report the array shapes of every split, one line per split.
for split_name, X_split, y_split in (
    ("train", X_train, y_train),
    ("valid", X_valid, y_valid),
    ("test", X_test, y_test),
):
    print(f"X_{split_name}.shape: {X_split.shape}, y_{split_name}.shape: {y_split.shape}")

# Network parameters

In [None]:
from pytorch_tabnet.pretraining import TabNetPretrainer

In [None]:
# TabNetPretrainer
unsupervised_model = TabNetPretrainer(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=3,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax', # "sparsemax",
    n_shared_decoder=1, # nb shared glu for decoding
    n_indep_decoder=1, # nb independent glu for decoding
#     grouped_features=[[0, 1]], # you can group features together here
    verbose=5,
)

# Self Supervised Training

In [None]:
# Epoch budget: 2 when the CI environment variable is set, 100 otherwise
# (the original inline note mentions 1000; 10 was used as a quick-run value).
max_epochs = 2 if os.getenv("CI", False) else 100

In [None]:
# Pretrain on the training features, evaluating on the validation features.
# This run uses patience=0; an earlier variant of this cell was identical
# except for patience=5.
pretrain_fit_params = dict(
    X_train=X_train,
    eval_set=[X_valid],
    max_epochs=max_epochs,
    patience=0,
    batch_size=2048,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False,
    pretraining_ratio=0.5,
)
unsupervised_model.fit(**pretrain_fit_params)

In [None]:
# Make reconstruction from a dataset
reconstructed_X, embedded_X = unsupervised_model.predict(X_valid)
assert(reconstructed_X.shape==embedded_X.shape)

In [None]:
# Obtain class-balanced samples from the training data
train_balanced = train.groupby(target, group_keys=False).apply(lambda x: x.sample(n=500, random_state=42))

X_train_balanced = train_balanced[features].values
y_train_balanced = train_balanced[target].values

# Extract embeddings using the pretrained model
_, embedded_X_train_balanced = unsupervised_model.predict(X_train_balanced)

# Verify the shape of the extracted embeddings
print("Embeddings shape:", embedded_X_train_balanced.shape)
print("Number of samples:", len(y_train_balanced))

In [None]:
from sklearn.manifold import TSNE

# Project the pretrainer embeddings to two dimensions. t-SNE is stochastic, so
# the seed is fixed explicitly to make the layout (and the distance image
# derived from it below) reproducible across runs.
tsne = TSNE(n_components=2, verbose=1, random_state=0)
tsne_embeddings = tsne.fit_transform(embedded_X_train_balanced)

In [None]:
from scipy.spatial.distance import cdist
from VAT_library.iVAT import iVAT

# Distance matrix between all pairs of 2-D t-SNE points (cdist's default metric).
pairwise_dist = cdist(tsne_embeddings, tsne_embeddings)

# iVAT returns three values; only the first one is drawn here.
RiV, RV, reordering_mat = iVAT(pairwise_dist)

# Render the first iVAT output in grayscale and write it to disk.
fig, ax = plt.subplots()
ax.imshow(RiV, cmap='gray')
fig.savefig('tabnet_test3.png')

In [None]:
# Same iVAT image as for the t-SNE points, but computed directly on the
# pretrainer embeddings (no dimensionality reduction).
pairwise_dist = cdist(embedded_X_train_balanced, embedded_X_train_balanced)

RiV, RV, reordering_mat = iVAT(pairwise_dist)

fig, ax = plt.subplots()
ax.imshow(RiV, cmap='gray')
fig.savefig('tabnet_test3_raw_embeddings.png')

In [None]:
# Run the pretrainer's explain() on the validation features. The second
# returned value is indexed by integer (0, 1, 2) in the plotting cell below.
unsupervised_explain_matrix, unsupervised_masks = unsupervised_model.explain(X_valid)

In [None]:
# Show masks 0..2 from the pretrainer side by side, first 50 rows of each.
fig, axs = plt.subplots(1, 3, figsize=(20,20))

for i, ax in enumerate(axs):
    ax.imshow(unsupervised_masks[i][:50])
    ax.set_title(f"mask {i}")


## Save and load the same way as other TabNet models

In [None]:
# Save the pretrainer and reload it into a fresh instance. The path returned by
# save_model is reused for loading, the same way the classifier is saved and
# reloaded further down in this notebook.
pretrain_archive = unsupervised_model.save_model('./test_pretrain3')
loaded_pretrain = TabNetPretrainer()
loaded_pretrain.load_model(pretrain_archive)

# Training

In [None]:
# Step-wise learning-rate schedule passed to StepLR below.
lr_schedule = {"step_size": 10, "gamma": 0.9}  # how to use learning rate scheduler

# Classifier that will be fine-tuned from the pretrained weights.
clf = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params={"lr": 2e-3},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params=lr_schedule,
    mask_type='sparsemax',  # This will be overwritten if using pretrain model
    verbose=5,
)

In [None]:
# Supervised training, warm-started from the reloaded pretrainer.
# NOTE(review): the 'auc' metric here and the `preds[:, 1]` scoring in the
# prediction cells look like they assume a binary target — confirm that this
# holds for the label column of the dataset loaded above.
supervised_eval_sets = [(X_train, y_train), (X_valid, y_valid)]

clf.fit(
    X_train=X_train,
    y_train=y_train,
    eval_set=supervised_eval_sets,
    eval_name=['train', 'valid'],
    eval_metric=['auc'],
    max_epochs=max_epochs,
    patience=20,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False,
    from_unsupervised=loaded_pretrain,
)

In [None]:
# plot losses
# Loss values recorded in the classifier's training history.
loss_history = clf.history['loss']
plt.plot(loss_history)

In [None]:
# plot auc
# Plot the AUC history of both evaluation sets on one axis. The curves are
# labelled so the figure can be read without looking at the code.
fig, ax = plt.subplots()
ax.plot(clf.history['train_auc'], label='train')
ax.plot(clf.history['valid_auc'], label='valid')
ax.set_ylabel('AUC')
ax.set_title('AUC history')
ax.legend();

In [None]:
# plot learning rates
# Learning-rate values recorded in the classifier's training history.
lr_history = clf.history['lr']
plt.plot(lr_history)

## Predictions

In [None]:
# Score the test and validation sets. Column 1 of the predicted probabilities
# is used as the ranking score passed to roc_auc_score.
preds = clf.predict_proba(X_test)
test_scores = preds[:, 1]
test_auc = roc_auc_score(y_true=y_test, y_score=test_scores)

preds_valid = clf.predict_proba(X_valid)
valid_scores = preds_valid[:, 1]
valid_auc = roc_auc_score(y_true=y_valid, y_score=valid_scores)

print(f"BEST VALID SCORE FOR {dataset_name} : {clf.best_cost}")
print(f"FINAL TEST SCORE FOR {dataset_name} : {test_auc}")

In [None]:
# check that best weights are used
# The validation AUC computed above should match the best value in the
# training history, i.e. the best weights are the ones in use.
best_valid_auc = np.max(clf.history['valid_auc'])
assert np.isclose(valid_auc, best_valid_auc, atol=1e-6)

# Save and load Model

In [None]:
# save tabnet model
# Save the trained classifier. The value returned by save_model is kept in
# `saved_filepath` and passed to load_model in the next cell.
saving_path_name = "./tabnet_model_test_1"
saved_filepath = clf.save_model(saving_path_name)

In [None]:
# define new model with basic parameters and load state dict weights
# Create a classifier with default constructor arguments, then restore the
# model saved above from `saved_filepath`.
loaded_clf = TabNetClassifier()
loaded_clf.load_model(saved_filepath)

In [None]:
# Score the test set with the reloaded classifier, using the same column-1
# probability as the AUC score as for the original classifier.
loaded_preds = loaded_clf.predict_proba(X_test)
loaded_scores = loaded_preds[:, 1]
loaded_test_auc = roc_auc_score(y_true=y_test, y_score=loaded_scores)

print(f"FINAL TEST SCORE FOR {dataset_name} : {loaded_test_auc}")

In [None]:
# The reloaded classifier must reproduce the original test AUC exactly.
assert test_auc == loaded_test_auc

# Global explainability: feature importances summing to 1

In [None]:
# Display the fitted classifier's global feature importances (per the section
# heading, these sum to 1).
clf.feature_importances_

# Local explainability and masks

In [None]:
# Run the classifier's explain() on the test features. `masks` is indexed by
# integer (0, 1, 2) in the plotting cell below.
explain_matrix, masks = clf.explain(X_test)

In [None]:
# Show masks 0..2 from the classifier side by side, first 50 rows of each.
fig, axs = plt.subplots(1, 3, figsize=(20,20))

for i, ax in enumerate(axs):
    ax.imshow(masks[i][:50])
    ax.set_title(f"mask {i}")
