In [0]:
!pip3 install torch===1.2.0 torchvision===0.4.0 -f https://download.pytorch.org/whl/torch_stable.html

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch===1.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/30/57/d5cceb0799c06733eefce80c395459f28970ebb9e896846ce96ab579a3f1/torch-1.2.0-cp36-cp36m-manylinux1_x86_64.whl (748.8MB)
[K     |████████████████████████████████| 748.9MB 23kB/s 
[?25hCollecting torchvision===0.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/06/e6/a564eba563f7ff53aa7318ff6aaa5bd8385cbda39ed55ba471e95af27d19/torchvision-0.4.0-cp36-cp36m-manylinux1_x86_64.whl (8.8MB)
[K     |████████████████████████████████| 8.8MB 34.4MB/s 
Installing collected packages: torch, torchvision
  Found existing installation: torch 1.3.0+cu100
    Uninstalling torch-1.3.0+cu100:
      Successfully uninstalled torch-1.3.0+cu100
  Found existing installation: torchvision 0.4.1+cu100
    Uninstalling torchvision-0.4.1+cu100:
      Successfully uninstalled torchvision-0.4.1+cu100
Successfully installed torch-1.2.0 torchvision-

In [0]:
!pip install git+https://github.com/fastai/fastai_dev > /dev/null

  Running command git clone -q https://github.com/fastai/fastai_dev /tmp/pip-req-build-krat64vk


# Tabular + Test Sets

This notebook explores tabular data and how to add test sets, both labelled and unlabelled.

First let's import the libraries

In [0]:
from fastai2.data.all import *
from fastai2.tabular.core import *
from fastai2.tabular.model import *

Then we'll read in the `ADULT_SAMPLE` dataframe and section out a part of it as a test dataframe (`df_test`)

In [0]:
# Fetch the ADULT_SAMPLE archive and read its CSV into a single frame.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')

# The first 10,000 rows are kept for training/validation; every later row
# is set aside as the test frame. Both are copies, not views of `df`.
df_main = df.iloc[:10000].copy()
df_test = df.iloc[10000:].copy()

Next, we'll define our variables, pre-processors, and our splits

In [0]:
# Columns handled as categorical inputs.
cat_names = [
    'workclass', 'education', 'marital-status',
    'occupation', 'relationship', 'race',
]
# Columns handled as continuous inputs.
cont_names = ['age', 'fnlwgt', 'education-num']

# Pre-processing steps handed to TabularPandas, in this order.
procs = [Categorify, FillMissing, Normalize]

# Split of df_main's row indices; RandomSplitter is called with its defaults.
splits = RandomSplitter()(range_of(df_main))

And now we will create two `TabularPandas` objects: one to train with, the other for our test set (which can be labelled now!)

In [0]:
# Training object: df_main processed with `procs` and divided according to `splits`.
to = TabularPandas(df_main, procs, cat_names, cont_names, y_names="salary", splits=splits)
# Test object: built from df_test with the same column lists and the same proc classes.
# NOTE(review): `procs` holds the proc classes, not state fitted on df_main, so this
# presumably re-fits category codes / normalization stats on df_test -- confirm that the
# encodings agree with the training set before trusting the test metrics.
to_test = TabularPandas(df_test, procs, cat_names, cont_names, y_names="salary")
# No `splits` is passed for the test object; per the original author's note it then
# defaults to the equivalent of v1's split_none() (i.e. no train/valid division).

We want to set `shuffle` to `False` on this DataLoader, as it is our "test" DataLoader

In [0]:
# Loader over the held-out rows in batches of 128. Shuffling is disabled and
# drop_last=False (assumes TabDataLoader follows the usual DataLoader meaning of
# these flags, i.e. original row order and no trailing partial batch dropped).
test_dl = TabDataLoader(to_test, bs=128, shuffle=False, drop_last=False)

In [0]:
# Turn the split training object into the data container handed to Learner below;
# no arguments are given, so batch size etc. stay at the library defaults.
dbunch = to.databunch()

Now we will make a tabular model (I copied the code as it's not *quite* exported yet in the library)

In [0]:
from fastai2.tabular.core import *

In [0]:
class TabularModel(Module):
    "Basic model for tabular data: embedded categoricals + batch-normed continuous inputs fed to a stack of `BnDropLin` blocks."
    def __init__(self, emb_szs, n_cont, out_sz, layers, ps=None, embed_p=0., y_range=None, use_bn=True, bn_final=False):
        """
        emb_szs:  iterable of `(n_categories, embedding_dim)` pairs, one per categorical column.
        n_cont:   number of continuous input features.
        out_sz:   size of the final output.
        layers:   sequence (list, tuple, ...) of hidden layer sizes.
        ps:       value(s) passed as `p` to the `BnDropLin` blocks; a scalar is repeated
                  `len(layers)` times, `None` means 0 everywhere.
        embed_p:  dropout probability applied to the concatenated embeddings.
        y_range:  optional `(low, high)`; when given, outputs are squashed into it with a sigmoid.
        use_bn:   enable `bn` in every `BnDropLin` block except the first.
        bn_final: append a `BatchNorm1d` over the outputs.
        """
        # `layers` and `ps` are concatenated with lists below, which raises TypeError
        # for tuples (or other non-list sequences); normalise both to real lists first.
        layers = list(layers)
        ps = ifnone(ps, [0]*len(layers))
        ps = list(ps) if is_listy(ps) else [ps]*len(layers)
        self.embeds = nn.ModuleList([Embedding(ni, nf) for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(embed_p)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        n_emb = sum(e.embedding_dim for e in self.embeds)
        self.n_emb,self.n_cont,self.y_range = n_emb,n_cont,y_range
        # Input width is all embeddings plus the continuous features.
        sizes = [n_emb + n_cont] + layers + [out_sz]
        # ReLU after every hidden block, no activation on the output block.
        actns = [nn.ReLU(inplace=True) for _ in range(len(sizes)-2)] + [None]
        # The first block always gets p=0 and bn=False; block i (i>0) gets ps[i-1].
        _layers = [BnDropLin(sizes[i], sizes[i+1], bn=use_bn and i!=0, p=p, act=a)
                       for i,(p,a) in enumerate(zip([0.]+ps,actns))]
        if bn_final: _layers.append(nn.BatchNorm1d(sizes[-1]))
        self.layers = nn.Sequential(*_layers)
    
    def forward(self, x_cat, x_cont):
        """
        Run the model on one batch.

        Column `i` of `x_cat` is looked up in embedding `i`; `x_cont` is batch-normalised and
        concatenated after the embeddings. If the model has neither embeddings nor continuous
        features, `x` is never assigned and this method fails.
        """
        if self.n_emb != 0:
            x = [e(x_cat[:,i]) for i,e in enumerate(self.embeds)]
            x = torch.cat(x, 1)
            x = self.emb_drop(x)
        if self.n_cont != 0:
            x_cont = self.bn_cont(x_cont)
            # With no embeddings the continuous features are the whole input.
            x = torch.cat([x, x_cont], 1) if self.n_emb != 0 else x_cont
        x = self.layers(x)
        if self.y_range is not None:
            # Rescale sigmoid output from (0, 1) to (y_range[0], y_range[1]).
            x = (self.y_range[1]-self.y_range[0]) * torch.sigmoid(x) + self.y_range[0]
        return x

In [0]:
def get_emb_sz(to, sz_dict=None):
    "One `(n_categories, embedding size)` pair per name in `to.cat_names`, using `to.procs.classes` and any overrides in `sz_dict`"
    emb_szs = []
    for col in to.cat_names:
        emb_szs.append(_one_emb_sz(to.procs.classes, col, sz_dict))
    return emb_szs

def _one_emb_sz(classes, n, sz_dict=None):
    "Return `(n_categories, embedding size)` for column `n`; an entry for `n` in `sz_dict` replaces the rule of thumb."
    overrides = {} if sz_dict is None else sz_dict
    n_cat = len(classes[n])
    # The rule-of-thumb size is always computed and only used when `n` has no override.
    default_sz = int(emb_sz_rule(n_cat))
    return n_cat, overrides.get(n, default_sz)

def emb_sz_rule(n_cat):
    "Heuristic embedding size for a variable with `n_cat` categories, capped at 600"
    suggested = round(1.6 * n_cat**0.56)
    return suggested if suggested < 600 else 600

In [0]:
# Embedding sizes come from the training object's categories; the network has
# hidden layers of 200 and 100 units and 2 outputs.
model = TabularModel(
    emb_szs=get_emb_sz(to),
    n_cont=len(to.cont_names),
    out_sz=2,
    layers=[200, 100],
)

In [0]:
from fastai2.learner import *
from fastai2.metrics import *
from fastai2.optimizer import *
from fastai2.callback.schedule import fit_one_cycle

In [0]:
# Adam with its weight-decay and epsilon arguments bound up front; Learner
# receives the resulting callable as its optimizer factory.
opt_func = partial(Adam, wd=0.01, eps=1e-5)
learn = Learner(
    dbunch,
    model,
    CrossEntropyLossFlat(),
    opt_func=opt_func,
    metrics=accuracy,
)

In [0]:
# Train with the one-cycle schedule, passing 1 (a single epoch, per the one row logged below).
learn.fit_one_cycle(1)

(#5) [0,0.41459986567497253,0.3640693128108978,0.8270000219345093,00:13]


Now that we've trained, let's look at how to do `get_preds` and `validate` with our test data!

We can pass our `test_dl` DataLoader (or any other DataLoader, such as a `train_dl` or `valid_dl`) to the `dl` argument of both methods and they will operate on it!

In [0]:
# Evaluate on the held-out loader passed via `dl`; the two values shown below
# are presumably [loss, accuracy], given the single metric handed to Learner.
learn.validate(dl=test_dl)

[0.3573826849460602, 0.8347591161727905]

In [0]:
# Collect outputs for the same held-out loader; as displayed further down, the
# result is a pair of tensors (per-class scores, integer targets).
preds = learn.get_preds(dl=test_dl)

Just to make sure, let's verify our `preds` and `validate()` match up!

In [0]:
# Display the pair of tensors returned by get_preds.
preds

(tensor([[0.3941, 0.6059],
         [0.7603, 0.2397],
         [0.9382, 0.0618],
         ...,
         [0.4261, 0.5739],
         [0.6419, 0.3581],
         [0.6023, 0.3977]]), tensor([0, 0, 0,  ..., 1, 0, 0]))

In [0]:
# Recompute accuracy from the stored scores (preds[0]) and targets (preds[1]);
# it should equal the second value reported by learn.validate above.
accuracy(preds[0], preds[1])

tensor(0.8348)

And they do perfectly! 