In [1]:
# Reload imported modules (e.g. `tabnet`) before each cell executes, so source
# edits are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2 

In [2]:
from fastai.tabular.all import * 
from tabnet.utils import *
from tabnet.model import *

# Abstract 

Tabular data problems are still very prevalent in today's world, especially in big corporations that amass large amounts of data for analysis. 

Even though this domain is popular, it's not as widely researched as computer vision, audio etc. For example, there are [papers](https://arxiv.org/abs/1604.07379) using self-supervised learning in CV problems as far back as 2016, while the first known one for Tabular data was released in August 2019. 

For these reasons, I wanted to implement a self-supervised approach for Tabular Data. 

My goals were to: 
1. Test whether self-supervised learning for tabular data speeds up the training process, and by how much, and whether using `Curriculum Learning` improves the self-supervised learning process.  
1. Test whether a model trained in a self-supervised fashion gives better results on a highly imbalanced data set

To do so I've implemented a relatively new (Aug 2019) SOTA Tabular Data DL model called [TabNet](https://arxiv.org/pdf/1908.07442.pdf). I've also taken the time to learn the [fastai framework](https://docs.fast.ai/) (a DL framework implemented using `pytorch`) for this project, which helped me decouple the different parts and run experiments efficiently. 

I've tested this approach on 3 different datasets: 
1. Adult Census Income - where the task is to distinguish whether a person's income is above $50,000
1. Forest Cover - classifying the forest cover type from cartographic variables.
1. Poker Hand - classifying the poker hand from the raw suit and rank attributes of the cards.

In [3]:
def tabnet_df_classifier(df, cat_names, cont_names, y_names, tabnet_args, enc=None, cbs=[]):
    """Build a fastai `Learner` that trains a `TabNetClassifier` on `df`.

    Args:
        df: DataFrame holding the feature and target columns.
        cat_names: names of the categorical feature columns.
        cont_names: names of the continuous feature columns.
        y_names: name of the target column (wrapped in a `CategoryBlock`).
        tabnet_args: dict forwarded as keyword arguments to `TabNetClassifier`;
            its `'bs'` entry is also used as the DataLoaders batch size.
        enc: optional encoder; when given it replaces `model.enc`.
        cbs: extra callbacks appended after `SetPrior` and `MaskRegularizer`.
            The default list is only read, never mutated.

    Returns:
        A `Learner` using `CrossEntropyLossFlat` and the `accuracy` metric.
    """
    # Random 80/20 train/validation split over the row indices.
    train_valid_idxs = RandomSplitter(valid_pct=0.2)(range_of(df))
    tab_data = TabularPandas(
        df,
        procs=[Categorify, FillMissing, Normalize],
        cat_names=cat_names,
        cont_names=cont_names,
        y_names=y_names,
        y_block=CategoryBlock(),
        splits=train_valid_idxs,
    )
    loaders = tab_data.dataloaders(bs=tabnet_args['bs'])

    net = TabNetClassifier(linear_head, tab_data, **tabnet_args)
    if enc is not None:
        # Swap in the supplied (e.g. pre-trained) encoder.
        net.enc = enc

    return Learner(
        loaders,
        net,
        CrossEntropyLossFlat(),
        cbs=[SetPrior(), MaskRegularizer(), *cbs],
        metrics=[accuracy],
    )

In [4]:
def tabnet_df_self_sup(df, cat_names, cont_names, y_names, tabnet_args, cbs=[]):
    """Build a fastai `Learner` that trains `TabNetSelfSupervised` on `df`.

    Args:
        df: DataFrame holding the feature and target columns.
        cat_names: names of the categorical feature columns.
        cont_names: names of the continuous feature columns.
        y_names: name of the target column, passed to `TabularPandasIdentity`.
        tabnet_args: dict forwarded as keyword arguments to
            `TabNetSelfSupervised`; its `'bs'` entry is also used as the
            DataLoaders batch size.
        cbs: extra callbacks appended after `SetPrior`, `TabularMasking` and
            `MaskRegularizer`. The default list is only read, never mutated.

    Returns:
        A `Learner` whose loss function is `MaskReconstructionLoss`.
    """
    # Random 80/20 train/validation split over the row indices.
    train_valid_idxs = RandomSplitter(valid_pct=0.2)(range_of(df))
    tab_data = TabularPandasIdentity(
        df,
        procs=[Categorify, FillMissing, Normalize],
        cat_names=cat_names,
        cont_names=cont_names,
        y_names=y_names,
        splits=train_valid_idxs,
    )
    loaders = tab_data.dataloaders(bs=tabnet_args['bs'])
    # Mark the first two elements of each batch as model inputs.
    loaders.n_inp = 2

    net = TabNetSelfSupervised(tabnet_decoder, tab_data, **tabnet_args)
    return Learner(
        loaders,
        net,
        cbs=[SetPrior(), TabularMasking(), MaskRegularizer(), *cbs],
        loss_func=MaskReconstructionLoss(),
    )

In [5]:
def score_before_after_ss(df, ds_params, model_params, cycle_lr=[(10, 1e-1/2)]*3):
    """Compare a classifier trained from scratch with one using a pre-trained encoder.

    Three learners are trained in sequence on `df`:
      1. a classifier with a fresh encoder (the "before" baseline),
      2. a self-supervised model,
      3. a classifier that starts from the encoder of (2) (the "after" model).

    Args:
        df: DataFrame holding the feature and target columns.
        ds_params: dict of dataset keyword arguments (`cat_names`,
            `cont_names`, `y_names`) forwarded to `tabnet_df_classifier` and
            `tabnet_df_self_sup`.
        model_params: dict passed as `tabnet_args` to both factories.
        cycle_lr: three argument tuples, unpacked into `fit_one_cycle` for
            stages 1, 2 and 3 respectively. The default list is never mutated.

    Returns:
        `(before, after)`: the `get_preds()` results of classifiers (1) and (3).
    """
    # Fix: use the caller's `ds_params` / `model_params`. The original read the
    # notebook globals `adult_params` / `adult_model_params`, so the arguments
    # were silently ignored and the function only worked for the Adult dataset.
    baseline = tabnet_df_classifier(df, **ds_params, tabnet_args=model_params)
    baseline.fit_one_cycle(*cycle_lr[0])
    before = baseline.get_preds()

    pretrainer = tabnet_df_self_sup(df, **ds_params, tabnet_args=model_params)
    pretrainer.fit_one_cycle(*cycle_lr[1])

    # Reuse the encoder learned during self-supervision.
    finetuned = tabnet_df_classifier(df, **ds_params, tabnet_args=model_params,
                                     enc=pretrainer.model.enc)
    finetuned.fit_one_cycle(*cycle_lr[2])
    after = finetuned.get_preds()

    return (before, after)

# Adult 

In [6]:
# Fetch the Adult sample dataset through fastai's `untar_data` and load its CSV.
adult_path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(adult_path / 'adult.csv')

# Column roles, forwarded as keyword arguments to the learner factories.
adult_params = {
    'cat_names': ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race'],
    'cont_names': ['age', 'fnlwgt', 'education-num'],
    'y_names': 'salary',
}

# TabNet settings; `bs` is also read by the factories to size the DataLoaders.
adult_model_params = {
    'n_d': 16,
    'n_a': 16,
    'lambda_sparse': 1e-4,
    'bs': 1024 * 4,
    'virtual_batch_size': 128,
    'n_steps': 5,
    'gamma': 1.5,
}

### Exp

In [None]:
%%capture 
# Repeat the before/after comparison 10 times (default schedule) so that the
# mean and spread of the scores can be summarised afterwards.
res = L([
    score_before_after_ss(df, adult_params, adult_model_params)
    for _ in range(10)
])

In [None]:
# Each entry of `res` is a (before, after) pair of `get_preds()` results;
# unpack each result into `accuracy`.
before = res.itemgot(0).map(lambda preds_targs: accuracy(*preds_targs))
after = res.itemgot(1).map(lambda preds_targs: accuracy(*preds_targs))

# Mean and standard deviation of the accuracy over the repeated runs.
pd.DataFrame(dict(before=before, after=after)).agg(['mean', 'std'])

In [None]:
%%capture 
# Same experiment as before, with 20 epochs per stage instead of the default 10.
res = L([
    score_before_after_ss(df, adult_params, adult_model_params, cycle_lr=[(20, 1e-1/2)]*3)
    for _ in range(10)
])

In [None]:
# Each entry of `res` is a (before, after) pair of `get_preds()` results;
# unpack each result into `accuracy`.
before = res.itemgot(0).map(lambda preds_targs: accuracy(*preds_targs))
after = res.itemgot(1).map(lambda preds_targs: accuracy(*preds_targs))

# Mean and standard deviation of the accuracy over the repeated runs.
pd.DataFrame(dict(before=before, after=after)).agg(['mean', 'std'])

### Before Self Supervision 

In [7]:
# Baseline: classifier on the Adult data with no pre-trained encoder (`enc` not passed).
learn = tabnet_df_classifier(df, **adult_params, tabnet_args=adult_model_params)

In [8]:
# One-cycle training for 10 epochs with a learning-rate slice of 1e-3..1e-1.
# NOTE(review): the saved output of this cell is a RuntimeError
# ("running_mean should contain 42 elements not 16") — confirm whether it still reproduces.
learn.fit_one_cycle(10, slice(1e-3, 1e-1))

epoch,train_loss,valid_loss,accuracy,time
0,0.0,00:00,,


RuntimeError: running_mean should contain 42 elements not 16

### Self Supervision 

In [None]:
# Self-supervised learner on the same Adult data. This rebinds `learn`,
# replacing the baseline classifier created earlier.
learn = tabnet_df_self_sup(df, **adult_params, tabnet_args=adult_model_params)

In [None]:
# Self-supervised pre-training: 10 epochs, one-cycle schedule, learning rate 1e-1/2 (0.05).
learn.fit_one_cycle(10, 1e-1/2)

### After Self Supervision 

In [None]:
# `learn` on the right-hand side is still the self-supervised learner; its encoder
# is handed to a new classifier, and `learn` is then rebound to that classifier.
# NOTE(review): re-running this cell alone would therefore reuse the classifier's
# own encoder rather than the self-supervised one.
learn = tabnet_df_classifier(df, **adult_params, tabnet_args=adult_model_params, enc=learn.model.enc)

In [None]:
# Train the classifier for 10 epochs, one-cycle schedule, learning rate 1e-1/2 (0.05).
learn.fit_one_cycle(10, 1e-1/2)

# Forest Cover DS

In [None]:
# Directory (relative to the working directory) passed as `dest` when
# downloading the Forest Cover archive.
data_dir = Path('./data')

In [None]:
def extract_gzip(file, dest=None):
    """Decompress the gzip archive `file` into the directory `dest`.

    The output is written to `dest / file.stem`, i.e. the archive name without
    its final suffix (`covtype.data.gz` -> `covtype.data`).

    Args:
        file: path (`str` or `Path`) of the gzip archive.
        dest: target directory (`str` or `Path`). Defaults to the directory
            containing `file`; it is created if it does not exist.
    """
    import gzip
    import shutil

    file = Path(file)
    # Fix: the original `dest or Path(dest)` evaluated `Path(None)` whenever
    # `dest` was omitted (TypeError), and never converted a `str` dest to a Path.
    dest = file.parent if dest is None else Path(dest)
    dest.mkdir(parents=True, exist_ok=True)
    with gzip.open(file, 'rb') as f_in, open(dest / file.stem, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [None]:
# UCI Covertype archive (gzip-compressed data file).
forest_type_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz'
# Download into `data_dir`, using `extract_gzip` for extraction.
forest_path = untar_data(forest_type_url, dest=data_dir, extract_func=extract_gzip)

In [None]:
# Name given to the label column of the Forest Cover data.
target = "Covertype"

# Categorical columns: 4 wilderness areas followed by 40 soil types (1-based suffixes).
cat_names = (
    [f"Wilderness_Area{i}" for i in range(1, 5)]
    + [f"Soil_Type{i}" for i in range(1, 41)]
)

cont_names = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]

# Full column order (continuous, categorical, target); used as `names=` when
# the header-less CSV is read.
feature_columns = [*cont_names, *cat_names, target]

forest_params = {'cont_names': cont_names, 'y_names': target, 'cat_names': cat_names}
procs = [Categorify, FillMissing, Normalize]
# NOTE(review): the Adult config uses the key `lambda_sparse`; confirm that
# `lambda_reg` is the intended keyword here.
forest_model_params = {
    'n_d': 64,
    'n_a': 64,
    'n_steps': 5,
    'virtual_batch_size': 512,
    'gamma': 1.5,
    'bs': 1024 * 16,
    'lambda_reg': 1e-5,
}

In [None]:
# The file has no header row, so column names come from `feature_columns`.
# Work on a random 200,000-row subset.
# NOTE(review): `.sample` is called without `random_state`, so each run draws a different subset.
df = pd.read_csv(forest_path, header=None, names=feature_columns).sample(n=200_000)
df.shape

### Before Self Supervision

In [None]:
# Baseline: classifier on the Forest Cover sample with no pre-trained encoder.
learn = tabnet_df_classifier(df, **forest_params, tabnet_args=forest_model_params)

In [None]:
# Run fastai's learning-rate finder before choosing the rate for the fit that follows.
learn.lr_find()

In [None]:
# Train the baseline for 20 epochs, one-cycle schedule, learning rate 1e-1.
learn.fit_one_cycle(20, 1e-1)

### Self Supervision 

In [None]:
# Self-supervised learner on the Forest Cover sample.
# Fix: the original referenced the undefined name `forst_model_params` (NameError);
# the config defined for this dataset is `forest_model_params`.
learn_enc = tabnet_df_self_sup(df, **forest_params, tabnet_args=forest_model_params)

In [None]:
# Run fastai's learning-rate finder on the self-supervised learner.
learn_enc.lr_find()

In [None]:
# Self-supervised pre-training: 20 epochs, one-cycle schedule, learning rate 1e-1/2 (0.05).
learn_enc.fit_one_cycle(20, 1e-1/2)

### After Self Supervision 

In [None]:
# Classifier that starts from the encoder of the self-supervised learner `learn_enc`.
# Fix: the original referenced the undefined name `forst_model_params` (NameError);
# the config defined for this dataset is `forest_model_params`.
learn = tabnet_df_classifier(df, **forest_params, tabnet_args=forest_model_params, enc=learn_enc.model.enc)

In [None]:
# Train the classifier for 20 epochs, one-cycle schedule, learning rate 1e-1/2 (0.05).
learn.fit_one_cycle(20, 1e-1/2)

# Poker Hand DS

In [None]:
# Poker Hand data directory under the current user's home; `train.csv` is read from here.
BASE_DIR = Path.home().joinpath('data/tabnet/poker')

In [None]:
# Load the Poker Hand training set.
df = pd.read_csv(BASE_DIR.joinpath('train.csv'))
# Fix: a bare `df.head()` that is not the last expression of the cell is
# evaluated and discarded; `display` renders it explicitly.
display(df.head())
df.shape

In [None]:
# Ten categorical columns: S1..S5 followed by C1..C5 (suit and rank attributes
# of the five cards).
cat_names = [f'{prefix}{i}' for prefix in ('S', 'C') for i in range(1, 6)]
cont_names = []
target = 'hand'

# Column roles, forwarded as keyword arguments to the learner factories.
poker_params = {'cat_names': cat_names, 'cont_names': cont_names, 'y_names': target}
# Model settings; `bs` is also read by the factories to size the DataLoaders.
poker_model_params = {
    'n_a': 16,
    'n_d': 16,
    'lambda_reg': 0,
    'bs': 64 * 4,
    'virtual_batch_size': 256,
    'n_steps': 5,
    'gamma': 1.5,
}

### Before Self Supervision

In [None]:
# Baseline: classifier on the Poker Hand data with no pre-trained encoder.
learn = tabnet_df_classifier(df, **poker_params, tabnet_args=poker_model_params)

In [None]:
# Set the learner's `lr` attribute. The `fit_one_cycle` call that follows passes
# no learning rate — presumably it falls back to this value (confirm against fastai).
learn.lr = 3e-2

In [None]:
# Train the baseline for 1000 epochs; no explicit learning rate is passed.
learn.fit_one_cycle(1000)

### Self Supervision 

In [None]:
# Self-supervised learner on the Poker Hand data.
learn_enc = tabnet_df_self_sup(df, **poker_params, tabnet_args=poker_model_params)

In [None]:
# Run fastai's learning-rate finder on the self-supervised learner.
learn_enc.lr_find()

In [None]:
# Self-supervised pre-training: 20 epochs, one-cycle schedule, learning rate 1e-1/2 (0.05).
learn_enc.fit_one_cycle(20, 1e-1/2)

### After Self Supervision 

In [None]:
# Classifier that starts from the encoder of the self-supervised learner `learn_enc`.
# Fix: `tabnet_df_classifier` has no `bs` parameter, so passing `bs=1024*5` raised
# a TypeError. The batch size is read from `tabnet_args['bs']`, so override it there.
learn = tabnet_df_classifier(df, **poker_params,
                             tabnet_args={**poker_model_params, 'bs': 1024*5},
                             enc=learn_enc.model.enc)

In [None]:
# Train the classifier for 20 epochs, one-cycle schedule, learning rate 1e-1/2 (0.05).
learn.fit_one_cycle(20, 1e-1/2)