In [None]:
# default_exp transforms.tabular

# transforms.tabular

> Contains all the transforms relevant to deployment in the `fastai` tabular library

Check that the tabular module is installed; if it is not, raise an error:

In [None]:
#export
# Guard executed when this module is imported: stop with an `ImportError`
# if the soft-dependency check reports the tabular ('tab') entry as falsy.
from fastinference_pytorch.soft_dependencies import SoftDependencies
if not SoftDependencies.check()['tab']:
    raise ImportError("The tabular module is not installed.")

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import pickle

import numpy as np
import torch
from torch import tensor
from fastcore.utils import store_attr

In [None]:
from fastinference_pytorch.rebuild import load_data
data = load_data('../')
data.keys()

dict_keys(['Encoder', 'Normalize', 'FillMissing', 'Categorify', 'Categorize'])

For an example we'll use the first five rows of the `ADULT_SAMPLE` dataset, which I have converted to a `NumPy` array below:

In [None]:
#export
class Encoder():
    """
    Single class which handles tabular pre-processing. Will extract
    all relevant information from `dictionary` needed for transformations

    Arguments:
    `dictionary`: dict, export from `fastinference`. The keys read here are
    `FillMissing`, `Categorify`, `Normalize` and `Encoder`.
    """

    can_decode,order = True, 1
    def __init__(self, dictionary):
        "Stores the per-transform settings and registers `NaN` as category 0"
        self.fm = dictionary['FillMissing']
        self.categorify = dictionary['Categorify']
        self.norm = dictionary['Normalize']
        self.encoder = dictionary['Encoder']
        # This writes into the class maps of the caller's `dictionary` in place
        for var in self.categorify['classes']:
            self.categorify['classes'][var][np.nan] = 0
        self.tensorize = Tensorize(self.encoder)

    def __call__(self, x, decode=False):
        """
        Fill missing values, encode categories, normalize and tensorize `x`.

        The steps run on a copy of `x`, so the caller's array is left untouched.
        Only encoding is implemented: with `decode=True` nothing is done and
        `None` is returned.
        """
        if not decode:
            x = np.array(x, copy=True)
            x = self._fill_missing(x)
            x = self._categorify(x)
            x = self._normalize(x)
            x = self.tensorize(x)
            return x

    def _fill_missing(self, x):
        "Fills in missing data in `conts` and potentially generates a new categorical column"
        na_dict, add_col = self.fm['na_dict'], self.fm['add_col']
        for idx, name in self.encoder['conts'].items():
            if name not in na_dict: continue
            column = x[:,idx]
            # NaN is the only value that differs from itself. The mask has to be
            # taken *before* filling, otherwise the indicator is always True.
            missing = np.asarray(column != column, dtype=bool)
            x[missing, idx] = na_dict[name]
            if add_col:
                # One indicator column per filled variable, appended on the right.
                # NOTE(review): polarity is kept from the original code
                # (True = value was present); confirm it matches the exported
                # `Categorify` classes.
                x = np.append(x, np.expand_dims(~missing, 1), 1)
        return x

    def _categorify(self, x):
        "Encodes categorical data in `x` based on `self.categorify`"
        for idx, name in self.encoder['cats'].items():
            classes = self.categorify['classes'][name]
            # A dict only finds a NaN key by object identity, so any NaN value
            # is routed explicitly through the `np.nan` entry.
            x[:,idx] = [classes[np.nan] if v != v else classes[v] for v in x[:,idx]]
        return x

    def _normalize(self, x):
        "Normalize continuous data in `x` based on `self.norm`"
        for idx, name in self.encoder['conts'].items():
            x[:,idx] = (x[:,idx]-self.norm['means'][name])/self.norm['stds'][name]
        return x

In [None]:
#export
class Tensorize():
    def __init__(self, enc):
        """
        Converts numpy array to a `tensor`.

        Params:

        `enc`: the `Encoder` entry exported from `fastinference` (a dict with
        `cats` and `conts` mappings keyed by column index), or any object
        exposing that dict as its `encoder` attribute
        """
        # Accept both the raw dict and a wrapper carrying it as `.encoder`
        mapping = getattr(enc, 'encoder', enc)
        # Plain assignment instead of `store_attr`, whose positional signature
        # differs between fastcore releases
        self.cat_idxs = list(mapping['cats'].keys())
        self.cont_idxs = list(mapping['conts'].keys())

    def __call__(self, x):
        "Returns `(cat, cont)` tensors built from the categorical and continuous columns of `x`"
        # Columns are picked by index along axis 1, cast with NumPy and then
        # wrapped as tensors; `astype('float')` yields float64 values
        cat = np.take(x, self.cat_idxs, axis=1).astype('int')
        cont = np.take(x, self.cont_idxs, axis=1).astype('float')
        return tensor(cat), tensor(cont)

In [None]:
#slow
import pandas as pd
df = pd.read_csv('/home/ml1/.fastai/data/adult_sample/adult.csv').to_numpy()

In [None]:
bs = 512

In [None]:
enc = Encoder(data)
t_df = enc(df)

In [None]:
#export
class NumpyDataset():
    "A simple dataset serving consecutive batches of `bs` rows from `cats` and `conts`"
    def __init__(self, cats, conts,bs):
        "Stores `cats`, `conts` and `bs`, and counts the batches (the last one may be partial)"
        self.cats, self.conts, self.bs = cats, conts, bs
        self.n_batches = len(cats) // self.bs + (0 if len(cats)%self.bs == 0 else 1)

    def __getitem__(self, idx):
        "Returns batch number `idx` as a `(cats, conts)` tuple; negative `idx` counts from the end"
        if idx < 0: idx += self.n_batches
        # Raising `IndexError` past the last batch is what ends a plain
        # `for batch in dset` loop
        if not 0 <= idx < self.n_batches:
            raise IndexError(f"batch index out of range for {self.n_batches} batches")
        # `idx` is a batch number, so it is scaled to a row offset first
        start = idx * self.bs
        return (self.cats[start:start+self.bs], self.conts[start:start+self.bs])

    def __len__(self): return self.n_batches

In [None]:
from torch.utils.data import DataLoader

In [None]:
dset = NumpyDataset(*t_df, 128)

In [None]:
dl = DataLoader(dset, batch_size=1)

In [None]:
o = next(iter(dl))

In [None]:
o

[tensor([[[ 5,  8,  3,  0,  6,  5,  2],
          [ 5, 13,  1,  5,  2,  5,  2],
          [ 5, 12,  1,  0,  5,  3,  2],
          [ 6, 15,  3, 11,  1,  2,  2],
          [ 7,  6,  3,  9,  6,  3,  2],
          [ 5, 12,  5,  7,  4,  5,  2],
          [ 5, 16,  1,  0,  3,  5,  2],
          [ 5,  2,  3,  0,  1,  5,  2],
          [ 5, 12,  3,  4,  1,  5,  2],
          [ 6, 12,  3,  0,  1,  5,  2],
          [ 5, 10,  5,  0,  4,  3,  2],
          [ 5,  2,  5,  2,  4,  5,  2],
          [ 5,  9,  3,  0,  6,  5,  2],
          [ 5, 10,  3,  0,  1,  5,  2],
          [ 5,  9,  3, 13,  1,  5,  2],
          [ 5, 12,  7,  0,  5,  5,  2],
          [ 5,  1,  3,  8,  1,  5,  2],
          [ 5, 13,  5,  0,  2,  5,  2],
          [ 8, 13,  1,  0,  2,  5,  2],
          [ 5, 10,  3, 11,  1,  5,  2],
          [ 5, 16,  3,  0,  6,  3,  2],
          [ 5, 12,  5,  7,  4,  3,  2],
          [ 5,  7,  1, 13,  2,  5,  2],
          [ 5, 12,  4,  0,  4,  3,  2],
          [ 5, 12,  3,  4,  1,  5,  2],


In [None]:
for batch in dset:
    print(batch[0][0])

tensor([5, 8, 3, 0, 6, 5, 2])
tensor([ 5, 16,  5,  0,  4,  3,  2])
tensor([ 6, 10,  3,  0,  1,  5,  2])
tensor([5, 2, 5, 0, 4, 5, 2])
tensor([ 5, 12,  3,  0,  6,  5,  2])
tensor([ 2, 10,  3,  0,  1,  5,  2])
tensor([ 5, 12,  1,  0,  2,  5,  2])
tensor([ 5, 13,  5,  0,  2,  2,  2])
tensor([ 7, 12,  5,  9,  3,  5,  2])
tensor([ 3, 16,  3,  9,  1,  5,  2])
tensor([ 5, 12,  3,  5,  1,  5,  2])
tensor([ 5, 16,  5,  4,  2,  5,  2])
tensor([ 5, 16,  3,  4,  1,  5,  2])
tensor([5, 3, 5, 7, 4, 3, 2])
tensor([ 5, 12,  3, 15,  1,  5,  2])
tensor([ 1, 16,  5,  1,  2,  5,  2])
tensor([5, 5, 3, 4, 1, 5, 2])
tensor([ 7, 12,  3,  5,  1,  5,  2])
tensor([ 5, 10,  6, 13,  5,  5,  2])
tensor([1, 2, 3, 1, 1, 5, 2])
tensor([ 5, 13,  3, 11,  1,  5,  2])
tensor([ 5, 12,  6,  9,  2,  5,  2])
tensor([ 7, 12,  3,  6,  1,  5,  2])
tensor([5, 5, 1, 9, 5, 5, 2])
tensor([ 5, 12,  3,  4,  1,  5,  2])
tensor([ 5, 16,  5,  2,  2,  1,  2])
tensor([ 3, 12,  7, 11,  2,  5,  2])
tensor([ 6, 10,  1, 11,  2,  5,  2])
tensor

IndexError: index 0 is out of bounds for dimension 0 with size 0

In [None]:
t = Tensorize(enc)

In [None]:
t(row)

(tensor([[ 5,  8,  3,  0,  6,  5,  2],
         [ 5, 13,  1,  5,  2,  5,  2],
         [ 5, 12,  1,  0,  5,  3,  2],
         [ 6, 15,  3, 11,  1,  2,  2],
         [ 7,  6,  3,  9,  6,  3,  2],
         [ 5, 12,  5,  7,  4,  5,  2],
         [ 5, 16,  1,  0,  3,  5,  2],
         [ 5,  2,  3,  0,  1,  5,  2],
         [ 5, 12,  3,  4,  1,  5,  2],
         [ 6, 12,  3,  0,  1,  5,  2],
         [ 5, 10,  5,  0,  4,  3,  2],
         [ 5,  2,  5,  2,  4,  5,  2]]),
 tensor([[ 0.7624, -0.8343,  0.7527],
         [ 0.3966,  0.4530,  1.5365],
         [-0.0424, -0.8831, -0.0311],
         [-0.0424, -0.7247,  1.9283],
         [ 0.2503, -1.0151, -0.0311],
         [-1.3593, -1.1965, -0.4229],
         [ 0.7624, -1.3750, -0.0311],
         [-0.1156, -0.4767, -1.2067],
         [ 0.5429,  1.3224, -0.4229],
         [-0.1887,  0.2625, -0.0311],
         [-1.1398,  3.2330,  1.1446],
         [-1.5056,  0.2585, -0.0311]], dtype=torch.float64))

In [None]:
%%time
_ = np.take(arr, list(enc.encoder['cats'].keys())).astype('int')

CPU times: user 30 µs, sys: 0 ns, total: 30 µs
Wall time: 31.9 µs


In [None]:
enc.encoder['cats'].keys()

dict_keys([1, 3, 5, 6, 7, 8, 15])

In [None]:
%%timeit
_ = np.append(arr, np.expand_dims(arr[:,4]==arr[:,4],1), 1)

3 ms ± 5.07 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
%%timeit
# Alternative to `np.append`: allocate an array one column wider, then copy `arr` in.
# The builtin `object` replaces `np.object`, an alias removed in NumPy 1.24.
b = np.zeros((arr.shape[0], arr.shape[1]+1), dtype=object)
b[:len(arr), :-1] = arr

2.92 ms ± 4.47 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
#export
def FillMissing(arr, procs):
    """Fills in missing data in `conts` and potentially generates a new categorical column

    For every continuous column of `procs['Inputs']['conts']` whose name has an
    entry in `procs['FillMissing']['na_dict']`, NaN values are replaced in place
    by that entry. When `procs['FillMissing']['add_col']` is truthy, a boolean
    column is appended for each such variable. Returns the resulting array.
    """
    na_dict = procs['FillMissing']['na_dict']
    add_col = procs['FillMissing']['add_col']
    for idx, name in procs['Inputs']['conts'].items():
        if name not in na_dict: continue
        column = arr[:,idx]
        # NaN is the only value that differs from itself. The mask is taken for
        # *this* column and *before* filling, so the indicator stays meaningful.
        missing = np.asarray(column != column, dtype=bool)
        arr[missing, idx] = na_dict[name]
        if add_col:
            # NOTE(review): polarity is kept from the original code
            # (True = value was present); confirm it matches the exported classes.
            arr = np.append(arr, np.expand_dims(~missing, 1), 1)
    return arr

In [None]:
show_doc(FillMissing)

<h4 id="FillMissing" class="doc_header"><code>FillMissing</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>FillMissing</code>(**`arr`**, **`procs`**)

Fills in missing data in `conts` and potentially generates a new categorical column

`arr` is expected to be a `NumPy` array, while `procs` should be the pre-processing dictionary exported after training

In [None]:
#slow
df[0]

array([49, ' Private', 101320, ' Assoc-acdm', 12.0, ' Married-civ-spouse',
       nan, ' Wife', ' White', ' Female', 0, 1902, 40, ' United-States',
       '>=50k'], dtype=object)

In [None]:
#slow
df = FillMissing(df, procs)

In [None]:
#slow
df[0]

array([49, ' Private', 101320, ' Assoc-acdm', 12.0, ' Married-civ-spouse',
       nan, ' Wife', ' White', ' Female', 0, 1902, 40, ' United-States',
       '>=50k', True, True, True], dtype=object)

Three `bool` columns were added at the end for our potential missing numerical values (if `True` they exist)

In [None]:
#export
def Categorize(arr, procs):
    """Encodes categorical data in `arr` based on `procs`

    Each column listed in `procs['Inputs']['cats']` is replaced in place by the
    codes found in `procs['Categorize'][name]`. NaN values use the mapping's
    `np.nan` entry; a value without an entry raises `KeyError`.
    """
    for idx, name in procs['Inputs']['cats'].items():
        classes = procs['Categorize'][name]
        # A dict only finds a NaN key by object identity, so any NaN value
        # (`v != v`) is routed explicitly through the `np.nan` entry.
        arr[:,idx] = [classes[np.nan] if v != v else classes[v] for v in arr[:,idx]]
    return arr

In [None]:
show_doc(Categorize)

<h4 id="Categorize" class="doc_header"><code>Categorize</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>Categorize</code>(**`arr`**, **`procs`**)

Encodes categorical data in `arr` based on `procs`

`arr` is expected to be a `NumPy` array, while `procs` should be the pre-processing dictionary exported after training

In [None]:
#slow
df[0]

array([49, ' Private', 101320, ' Assoc-acdm', 12.0, ' Married-civ-spouse',
       nan, ' Wife', ' White', ' Female', 0, 1902, 40, ' United-States',
       '>=50k', True, True, True], dtype=object)

In [None]:
#slow
df = Categorize(df, procs)

In [None]:
#slow
df[0]

array([49, 5, 101320, 8, 12.0, 3, 0, 6, 5, ' Female', 0, 1902, 40,
       ' United-States', '>=50k', 2, True, True], dtype=object)

Our categorical variables are now all converted to integers. Any left as strings are not used by the model and are ignored at inference time.

In [None]:
#export
def Normalize(arr, procs):
    """Normalizes continuous data in `arr` based on `procs`

    Every column listed in `procs['Inputs']['conts']` is rewritten in place as
    `(value - mean) / std`, using that variable's entry in `procs['Normalize']`.
    Returns `arr`.
    """
    for idx, name in procs['Inputs']['conts'].items():
        stats = procs['Normalize'][name]
        arr[:,idx] = (arr[:,idx]-stats['mean'])/stats['std']
    # Returned only after the loop, so every continuous column is processed
    return arr

In [None]:
show_doc(Normalize)

<h4 id="Normalize" class="doc_header"><code>Normalize</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>Normalize</code>(**`arr`**, **`procs`**)

Normalizes continous data in `arr` based on `procs`

`arr` is expected to be a `NumPy` array, while `procs` should be the pre-processing dictionary exported after training

In [None]:
#slow
df[0]

array([49, 5, 101320, 8, 12.0, 3, 0, 6, 5, ' Female', 0, 1902, 40,
       ' United-States', '>=50k', 2, True, True], dtype=object)

In [None]:
#slow
df = Normalize(df, procs)

In [None]:
#slow
df[0]

array([0.7634343827572744, 5, 101320, 8, 12.0, 3, 0, 6, 5, ' Female', 0,
       1902, 40, ' United-States', '>=50k', 2, True, True], dtype=object)

Our continuous variables have now been adjusted for the model

In [None]:
#export
def apply_procs(arr, procs):
    "Run the test-time pre-processing steps over a `NumPy` array input"
    # Order matters: each step receives the array returned by the previous one
    for step in (FillMissing, Categorize, Normalize):
        arr = step(arr, procs)
    return arr

In [None]:
show_doc(apply_procs)

<h4 id="apply_procs" class="doc_header"><code>apply_procs</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>apply_procs</code>(**`arr`**, **`procs`**)

Apply test-time pre-processing on `NumPy` array input

The pre-processing steps must run in this specific order: `FillMissing` can append one or more `is_na` columns, so `Categorify` has to run after it in order to encode those extra columns as well

In [None]:
#slow
df = pd.read_csv('/home/ml1/.fastai/data/adult_sample/adult.csv')
df = df.head().to_numpy()

In [None]:
#slow
df[0]

array([49, ' Private', 101320, ' Assoc-acdm', 12.0, ' Married-civ-spouse',
       nan, ' Wife', ' White', ' Female', 0, 1902, 40, ' United-States',
       '>=50k'], dtype=object)

In [None]:
#slow
df = apply_procs(df, procs)

In [None]:
#slow
df[0]

array([0.7634343827572744, 5, 101320, 8, 12.0, 3, 0, 6, 5, ' Female', 0,
       1902, 40, ' United-States', '>=50k', 2, True, True], dtype=object)

In [None]:
#export
class TabularDataset():
    "A tabular `PyTorch` dataset based on `procs` with batch size `bs` on `device`"
    def __init__(self, arr, procs, bs=64, device='cuda'):
        "Stores array, grabs the indices for `cats` and `conts`, and generates batches"
        self.arr = arr
        self.cat_idxs = procs['Inputs']['cats'].keys()
        self.cont_idxs = procs['Inputs']['conts'].keys()
        self.bs = bs
        self.device = device
        self.make_batches()

    def __getitem__(self, x):
        "Grabs batch number `x` and returns `[cats, conts]` as int64 / float32 tensors on `device`"
        batch = self.batches[x]
        cats = batch[:, list(self.cat_idxs)].astype(np.int64)
        conts = batch[:, list(self.cont_idxs)].astype(np.float32)
        return [tensor(cats).to(self.device), tensor(conts).to(self.device)]

    def make_batches(self):
        "Splits data into consecutive batches of `bs` rows; the last batch holds any remainder"
        n_rows = len(self.arr)
        if n_rows > self.bs:
            # Stepping by `bs` never produces an empty trailing batch, so
            # `len(self.batches)` always agrees with `len(self)`
            self.batches = [self.arr[i:i+self.bs] for i in range(0, n_rows, self.bs)]
        else:
            self.batches = [self.arr]

    def __len__(self): return len(self.arr)//self.bs + (0 if len(self.arr)%self.bs==0 else 1)

In [None]:
show_doc(TabularDataset)

<h2 id="TabularDataset" class="doc_header"><code>class</code> <code>TabularDataset</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>TabularDataset</code>(**`arr`**, **`procs`**, **`bs`**=*`64`*, **`device`**=*`'cuda'`*)

A tabular `PyTorch` dataset based on `procs` with batch size `bs` on `device`

In [None]:
show_doc(TabularDataset.__init__)

<h4 id="TabularDataset.__init__" class="doc_header"><code>TabularDataset.__init__</code><a href="__main__.py#L4" class="source_link" style="float:right">[source]</a></h4>

> <code>TabularDataset.__init__</code>(**`arr`**, **`procs`**, **`bs`**=*`64`*, **`device`**=*`'cuda'`*)

Stores array, grabs the indicies for `cats` and `conts`, and generates batches

In [None]:
show_doc(TabularDataset.make_batches)

<h4 id="TabularDataset.make_batches" class="doc_header"><code>TabularDataset.make_batches</code><a href="__main__.py#L20" class="source_link" style="float:right">[source]</a></h4>

> <code>TabularDataset.make_batches</code>()

Splits data into equal sized batches, excluding the final partial

In [None]:
#slow
df = pd.read_csv('/home/ml1/.fastai/data/adult_sample/adult.csv')
df = df.head().to_numpy()
df = apply_procs(df, procs)
dset = TabularDataset(df, procs)

In [None]:
#slow
dset[0]

[tensor([[ 5,  8,  3,  0,  6,  5,  2],
         [ 5, 13,  1,  5,  2,  5,  2],
         [ 5, 12,  1,  0,  5,  3,  1],
         [ 6, 15,  3, 11,  1,  2,  2],
         [ 7,  6,  3,  9,  6,  3,  1]], device='cuda:0'),
 tensor([[ 7.6343e-01,  1.0132e+05,  1.2000e+01],
         [ 3.9687e-01,  2.3675e+05,  1.4000e+01],
         [-4.3010e-02,  9.6185e+04,  1.0000e+01],
         [-4.3010e-02,  1.1285e+05,  1.5000e+01],
         [ 2.5024e-01,  8.2297e+04,  1.0000e+01]], device='cuda:0')]

In [None]:
#export
class tabular_learner():
    "A `Learner`-like wrapper for tabular data"
    def __init__(self, data_fn, model_fn):
        "Accepts a `data_fn` and a `model_fn` corresponding to the named pickle exports"
        # Remember where the model lives so that test batches go to the same device
        self.device = 'cpu' if not torch.cuda.is_available() else 'cuda'
        # NOTE(review): `torch.load` and `pickle.load` can execute code embedded
        # in the file; only pass exports that come from a trusted source.
        self.model = torch.load(model_fn, map_location=self.device)
        self.model.eval()
        with open(data_fn, 'rb') as handle:
            self.procs = pickle.load(handle)
        for proc in self.procs['Categorize']:
            self.procs['Categorize'][proc][np.nan] = 0 # we can't pickle np.nan

    def test_dl(self, test_items, bs=64):
        "Applies `procs` to `test_items` and wraps the result in a `TabularDataset` of batch size `bs`"
        dl = apply_procs(test_items, self.procs)
        return TabularDataset(dl, self.procs, bs=bs, device=self.device)

    def predict(self, inps):
        "Predict a single batch `inps`, returning one decoded label per row"
        with torch.no_grad():
            outs = self.model(*inps)
        outs = np.argmax(outs.cpu().numpy(), axis=1)
        # Labels come from this instance's `procs`, not from a global learner
        outs = [self.procs['Outputs'][i] for i in outs]
        return outs

    def get_preds(self, dl=None):
        "Predict on multiple batches of data in `dl`; raises `TypeError` when `dl` is `None`"
        if dl is None:
            raise TypeError("`get_preds` needs an iterable of batches, got `dl=None`")
        outs = []
        for batch in dl:
            outs += self.predict(batch)
        return outs

In [None]:
show_doc(tabular_learner)

<h2 id="tabular_learner" class="doc_header"><code>class</code> <code>tabular_learner</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>tabular_learner</code>(**`data_fn`**, **`model_fn`**)

A `Learner`-like wrapper for tabular data

In [None]:
show_doc(tabular_learner.__init__)

<h4 id="tabular_learner.__init__" class="doc_header"><code>tabular_learner.__init__</code><a href="__main__.py#L4" class="source_link" style="float:right">[source]</a></h4>

> <code>tabular_learner.__init__</code>(**`data_fn`**, **`model_fn`**)

Accepts a `data_fn` and a `model_fn` corresponding to the named picle exports

In [None]:
#slow
learn = tabular_learner('procs.pkl', 'model.pkl')

In [None]:
show_doc(tabular_learner.test_dl)

<h4 id="tabular_learner.test_dl" class="doc_header"><code>tabular_learner.test_dl</code><a href="__main__.py#L14" class="source_link" style="float:right">[source]</a></h4>

> <code>tabular_learner.test_dl</code>(**`test_items`**, **`bs`**=*`64`*)

Applies `procs` to `test_items`

In [None]:
#slow
df = pd.read_csv('/home/ml1/.fastai/data/adult_sample/adult.csv')
dl = learn.test_dl(df.iloc[:5].to_numpy())

In [None]:
#slow
dl[0]

[tensor([[ 5,  8,  3,  0,  6,  5,  2],
         [ 5, 13,  1,  5,  2,  5,  2],
         [ 5, 12,  1,  0,  5,  3,  1],
         [ 6, 15,  3, 11,  1,  2,  2],
         [ 7,  6,  3,  9,  6,  3,  1]], device='cuda:0'),
 tensor([[ 7.6343e-01,  1.0132e+05,  1.2000e+01],
         [ 3.9687e-01,  2.3675e+05,  1.4000e+01],
         [-4.3010e-02,  9.6185e+04,  1.0000e+01],
         [-4.3010e-02,  1.1285e+05,  1.5000e+01],
         [ 2.5024e-01,  8.2297e+04,  1.0000e+01]], device='cuda:0')]

In [None]:
show_doc(tabular_learner.predict)

<h4 id="tabular_learner.predict" class="doc_header"><code>tabular_learner.predict</code><a href="__main__.py#L19" class="source_link" style="float:right">[source]</a></h4>

> <code>tabular_learner.predict</code>(**`inps`**)

Predict a single tensor

In [None]:
#slow
learn.predict(dl[0])

['<50k', '<50k', '<50k', '<50k', '<50k']

In [None]:
show_doc(tabular_learner.get_preds)

<h4 id="tabular_learner.get_preds" class="doc_header"><code>tabular_learner.get_preds</code><a href="__main__.py#L27" class="source_link" style="float:right">[source]</a></h4>

> <code>tabular_learner.get_preds</code>(**`dl`**=*`None`*)

Predict on multiple batches of data in `dl`

In [None]:
#slow
learn.get_preds(dl=dl)

['<50k', '<50k', '<50k', '<50k', '<50k']