In [1]:
%load_ext autoreload
%autoreload 2 
#default_exp utils

In [None]:
#exporti
from fastai.tabular.all import * 
from tabnet.model import * 

In [None]:
# Notebook display setting: set IPython's `ast_node_interactivity` to "all".
# Per the IPython documentation this displays the value of every top-level
# expression in a cell rather than only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Model creation functions

### classifier

In [None]:
#export
@delegates(TabNetBase.__init__)
def TabNetClassifier(head_func, to, **kwargs):
    """Build a `TabNet` sized from the tabular object `to`.

    Embedding sizes come from `get_emb_sz(to)`, the number of continuous
    inputs from `to.cont_names`, and the output size from `to.c`.
    Any extra `kwargs` are forwarded to `TabNet` unchanged.
    """
    sizes = dict(
        emb_szs=get_emb_sz(to),
        n_cont=len(to.cont_names),
        n_out=to.c,
    )
    return TabNet(head_func, **sizes, **kwargs)

### Self-supervised

In [None]:
#export
@delegates(TabNetBase.__init__)
def TabNetSelfSupervised(head_func, to, bs=1024, **kwargs):
    """Build a `TabNet` with one output per input column of `to`.

    The output size is the number of embedding specs returned by
    `get_emb_sz(to)` plus the number of continuous columns.
    `bs` is accepted but not used in this function; naming it here keeps it
    out of the `kwargs` forwarded to `TabNet`.
    """
    emb_szs = get_emb_sz(to)
    n_cont = len(to.cont_names)
    n_features = len(emb_szs) + n_cont
    return TabNet(head_func, emb_szs=emb_szs, n_cont=n_cont, n_out=n_features, **kwargs)

# Self Supervised Data Loader 

In [None]:
#exporti
def _maybe_expand(o): return o[:,None] if o.ndim==1 else o

In [None]:
#export
class ReadTabBatchIdentity(ItemTransform):
    "Batch transform that emits a tabular object's input tensors twice in one tuple."

    def __init__(self, to): store_attr()

    def encodes(self, to):
        "Return `(cats[, conts])` as tensors, followed by the same tensors again, on `to.device` when set."
        cats = tensor(to.cats).long()
        if not to.with_cont: res = (cats,)
        else: res = (cats, tensor(to.conts).float())
        # Repeat the tuple so the batch carries a second copy of the inputs
        # (presumably consumed as the reconstruction target -- confirm against the caller's `n_inp`).
        res = res + res
        if to.device is not None: res = to_device(res, to.device)
        return res

    def decodes(self, o):
        "Rebuild a tabular object (via `self.to.new`) from the first two items of batch `o`."
        o = o[0:2]  # only the first two items of the batch are decoded
        o = [_maybe_expand(o_) for o_ in to_np(o) if o_.size != 0]
        vals = np.concatenate(o, axis=1)
        # Try the full column list first and fall back to the input columns only.
        # `except Exception` (rather than a bare `except`) keeps KeyboardInterrupt
        # and SystemExit from being swallowed by the fallback.
        try: df = pd.DataFrame(vals, columns=self.to.all_col_names)
        except Exception: df = pd.DataFrame(vals, columns=self.to.x_names)
        to = self.to.new(df)
        return to

In [None]:
#export
class TabularPandasIdentity(TabularPandas):
    "Subclass of `TabularPandas` whose body defines no members of its own."

In [None]:
#export
@delegates()
class TabDataLoaderIdentity(TabDataLoader):
    "`TabDataLoader` whose default `after_batch` pipeline ends with `ReadTabBatchIdentity`."
    do_item = noops

    def __init__(self, dataset, bs=16, shuffle=False, after_batch=None, num_workers=0, **kwargs):
        # Only build the default pipeline when the caller did not supply one.
        if after_batch is None:
            block_tfms = L(TransformBlock().batch_tfms)
            after_batch = block_tfms + ReadTabBatchIdentity(dataset)
        loader_args = dict(bs=bs, shuffle=shuffle, after_batch=after_batch, num_workers=num_workers)
        super().__init__(dataset, **loader_args, **kwargs)

    def create_batch(self, b):
        "Select the rows at positions `b` from the dataset with `iloc`."
        return self.dataset.iloc[b]

# Set `TabDataLoaderIdentity` as the `_dl_type` class attribute of `TabularPandasIdentity`
# (presumably what fastai consults when building dataloaders for it -- confirm against fastai).
TabularPandasIdentity._dl_type = TabDataLoaderIdentity

# Experiment Helpers 

In [None]:
#export
def tabular_pandas(df, cat_names, cont_names, y_names, val_pct=0.2, tabular_type=TabularPandas):
    """Wrap `df` in `tabular_type` with a random train/validation split.

    `val_pct` is the validation fraction given to `RandomSplitter`. The procs
    are `Categorify`, `FillMissing` and `Normalize`, and the target is declared
    with a `CategoryBlock`.
    """
    splitter = RandomSplitter(valid_pct=val_pct)
    splits = splitter(range_of(df))
    procs = [Categorify, FillMissing, Normalize]
    return tabular_type(
        df,
        procs=procs,
        cont_names=cont_names,
        cat_names=cat_names,
        y_names=y_names,
        splits=splits,
        y_block=CategoryBlock(),
    )

In [None]:
#export 
@delegates(TabNetClassifier)
def tabnet_df_classifier(df, cat_names, cont_names, y_names, val_pct, head=linear_head, cbs=[], enc=None, **kwargs):
    """Create a classification `Learner` for `df` around a `TabNetClassifier`.

    `kwargs` must contain `'bs'` (batch size for the dataloaders) and
    `'lambda_sparse'` (passed to `MaskRegularizer`); all of `kwargs` is also
    forwarded to `TabNetClassifier`. When `enc` is given it replaces the
    model's `enc` attribute. `SetPrior` and `MaskRegularizer` callbacks are
    placed ahead of any callbacks in `cbs`.
    """
    tab_data = tabular_pandas(df, cat_names, cont_names, y_names, val_pct=val_pct)
    dls = tab_data.dataloaders(bs=kwargs['bs'])

    model = TabNetClassifier(head, tab_data, **kwargs)
    if enc is not None:
        model.enc = enc

    all_cbs = [SetPrior(), MaskRegularizer(kwargs['lambda_sparse'])] + list(cbs)
    return Learner(dls, model, CrossEntropyLossFlat(), cbs=all_cbs, metrics=[accuracy])

In [None]:
#export
@delegates(TabNetSelfSupervised)
def tabnet_df_self_sup(df, cat_names, cont_names, y_names, val_pct, head=tabnet_decoder, 
                       loss_func=MaskReconstructionLoss(), cbs=[], curriculum=False, p=0.8, **kwargs):
    """Create a self-supervised `Learner` for `df` around a `TabNetSelfSupervised` model.

    The data is wrapped in `TabularPandasIdentity` and the dataloaders'
    `n_inp` is set to 2. `kwargs` must contain `'bs'` and `'lambda_sparse'`;
    all of `kwargs` is also forwarded to `TabNetSelfSupervised`. `p` and
    `curriculum` configure the `TabularMasking` callback, which is placed with
    `SetPrior` and `MaskRegularizer` ahead of any callbacks in `cbs`.
    """
    tab_data = tabular_pandas(df, cat_names, cont_names, y_names,
                              tabular_type=TabularPandasIdentity, val_pct=val_pct)
    dls = tab_data.dataloaders(bs=kwargs['bs'])
    dls.n_inp = 2

    masking = [SetPrior(), TabularMasking(p=p, curriculum=curriculum), MaskRegularizer(kwargs['lambda_sparse'])]
    all_cbs = masking + list(cbs)

    model = TabNetSelfSupervised(head, tab_data, **kwargs)
    return Learner(dls, model, cbs=all_cbs, loss_func=loss_func, metrics=[mse])

In [None]:
#export 
@delegates(tabnet_df_self_sup)
def score_before_after_ss(df, ds_params, val_pct, decoder_head, loss_func, cycle_lr, **kwargs):
    """Compare classifier accuracy without and with a self-supervised encoder.

    Steps:
      1. Train a classifier from `tabnet_df_classifier` with `cycle_lr[0]` and score it.
      2. Train a self-supervised learner with `cycle_lr[1]`. It is always built
         with `val_pct=0.2`, independent of the `val_pct` argument.
      3. Train a second classifier that reuses the self-supervised model's
         `enc`, with `cycle_lr[2]`, and score it.

    Each `cycle_lr[i]` is unpacked as positional arguments to `fit_one_cycle`.
    `ds_params` is unpacked as keyword arguments into the learner factories,
    as is `kwargs` (which is also printed).

    Returns the tuple `(before, after)` of `accuracy` values.
    """
    def _train_bs(learner):
        # Half the training-set size when it is smaller than the batch size, else the batch size.
        return learner.dls.train.n//2 if learner.dls.train.n < learner.dls.bs else learner.dls.bs

    print(kwargs)

    # Stage 1: classifier trained without the self-supervised encoder.
    baseline = tabnet_df_classifier(df, **ds_params, val_pct=val_pct, **kwargs)
    baseline.dls.train.bs = _train_bs(baseline)
    baseline.fit_one_cycle(*cycle_lr[0])
    before = accuracy(*baseline.get_preds())

    # Stage 2: self-supervised training.
    pretrained = tabnet_df_self_sup(df, **ds_params, val_pct=0.2, head=decoder_head,
                                    loss_func=loss_func, **kwargs)
    pretrained.fit_one_cycle(*cycle_lr[1])

    # Stage 3: classifier reusing the self-supervised encoder, with
    # `virtual_batch_size` overridden by the batch size derived from the stage-1 learner.
    bs = _train_bs(baseline)
    finetune_params = dict(kwargs, virtual_batch_size=bs)
    finetuned = tabnet_df_classifier(df, **ds_params, val_pct=val_pct,
                                     enc=pretrained.model.enc, **finetune_params)
    finetuned.dls.train.bs = bs
    finetuned.fit_one_cycle(*cycle_lr[2])
    after = accuracy(*finetuned.get_preds())

    return (before, after)

# Load Data 

### Forest 

In [None]:
#exporti
from pathlib import Path
from fastai.tabular.all import * 

In [None]:
#exporti
def extract_gzip(file, dest=None):
    """Decompress the gzip archive `file` into the directory `dest`.

    The output is written to `dest / <file name without its last suffix>`
    (e.g. `data.csv.gz` -> `data.csv`).

    Args:
        file: path of the `.gz` archive (`str` or `Path`).
        dest: target directory (`str` or `Path`). Defaults to the directory
            containing `file`. It is created if it does not exist.
    """
    # Local imports: the function then does not rely on a star import
    # elsewhere in the file to provide `shutil`.
    import gzip
    import shutil

    file = Path(file)
    # The previous `dest or Path(dest)` raised for `dest=None` and left a
    # string `dest` unconverted, so `dest / file.stem` failed for both.
    dest = file.parent if dest is None else Path(dest)
    dest.mkdir(parents=True, exist_ok=True)
    with gzip.open(file, 'rb') as f_in, open(dest / file.stem, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
            
# Relative directory that `load_forest` passes as `dest` when fetching its dataset.
data_dir = Path('./data')

In [None]:
#export
def load_forest():
    """Fetch the UCI covertype data and return it with its experiment settings.

    Returns a 4-tuple `(df, params, procs, model_params)`:
      - `df`: the CSV read without a header row, with columns named
        continuous features, then categorical indicator columns, then the target.
      - `params`: dict with `cont_names`, `y_names` and `cat_names`.
      - `procs`: `[Categorify, FillMissing, Normalize]`.
      - `model_params`: dict of model and training settings.
    """
    forest_type_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz'
    forest_path = untar_data(forest_type_url, dest=data_dir, extract_func=extract_gzip)

    target = "Covertype"

    # Indicator columns: Wilderness_Area1..4 followed by Soil_Type1..40.
    wilderness_cols = [f"Wilderness_Area{i}" for i in range(1, 5)]
    soil_cols = [f"Soil_Type{i}" for i in range(1, 41)]
    cat_names = wilderness_cols + soil_cols

    cont_names = [
        "Elevation",
        "Aspect",
        "Slope",
        "Horizontal_Distance_To_Hydrology",
        "Vertical_Distance_To_Hydrology",
        "Horizontal_Distance_To_Roadways",
        "Hillshade_9am",
        "Hillshade_Noon",
        "Hillshade_3pm",
        "Horizontal_Distance_To_Fire_Points",
    ]

    # Column order used to name the header-less CSV.
    feature_columns = [*cont_names, *cat_names, target]

    params = {'cont_names': cont_names, 'y_names': target, 'cat_names': cat_names}
    procs = [Categorify, FillMissing, Normalize]
    model_params = {
        'n_d': 64, 'n_a': 64, 'n_steps': 5, 'virtual_batch_size': 512, 'gamma': 1.5, 'bs': 1024*16,
        'lambda_sparse': 1e-4, 'momentum': 0.7, 'n_shared_ft_blocks': 2, 'n_independent_ft_blocks': 2,
        'n_dec_steps': 10, 'p': 0.8, 'curriculum': True,
    }

    df = pd.read_csv(forest_path, header=None, names=feature_columns)

    return df, params, procs, model_params

### Adult

In [None]:
#export
def load_adult():
    """Load fastai's adult sample and return it with its experiment settings.

    Returns a 4-tuple `(df, params, procs, model_params)`:
      - `df`: contents of `adult.csv` from `URLs.ADULT_SAMPLE`.
      - `params`: dict with `cat_names`, `cont_names` and `y_names`.
      - `procs`: `[Categorify, FillMissing, Normalize]`.
      - `model_params`: dict of model and training settings.
    """
    adult_path = untar_data(URLs.ADULT_SAMPLE)
    df = pd.read_csv(adult_path/'adult.csv')

    procs = [Categorify, FillMissing, Normalize]

    params = {
        'cat_names': ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race'],
        'cont_names': ['age', 'fnlwgt', 'education-num'],
        'y_names': 'salary',
    }

    model_params = {
        'n_d': 16, 'n_a': 16, 'lambda_sparse': 1e-4, 'bs': 1024*4,
        'virtual_batch_size': 128, 'n_steps': 5, 'gamma': 1.5,
        'n_shared_ft_blocks': 2, 'n_independent_ft_blocks': 2,
        'n_dec_steps': 10, 'p': 0.8, 'curriculum': True, 'momentum': 0.98,
    }

    return df, params, procs, model_params

# Export

In [5]:
# Run nbdev's `notebook2script`; the recorded output below lists the notebooks it converted.
from nbdev.export import notebook2script
notebook2script()

Converted 01_core.ipynb.
Converted 02_model.ipynb.
Converted 04_utils.ipynb.
Converted index.ipynb.
Converted results.ipynb.
Converted self_supervision.ipynb.
