In [None]:
# default_exp tabular

# tabular

> Methods for the tabular models, including data preparation and model prediction

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import pickle

import numpy as np
import onnxruntime as ort

For an example we'll use the first five rows of the `ADULT_SAMPLE` dataset, which I have converted to a `NumPy` array below:

In [None]:
#slow
import pandas as pd
# NOTE(review): absolute, machine-specific path -- point this at your local copy of `adult.csv`.
df = pd.read_csv('/home/ml1/.fastai/data/adult_sample/adult.csv')
# Keep only the first five rows and convert them to a NumPy array (object dtype, see the output of `df[0]` below).
df = df.head().to_numpy()

For procs we will use the same ones from training a model:

* Note: after loading the pickle we have to add `np.nan` as a key in each `Categorize` mapping (pointing to index `0`) so that missing values are encoded properly. This is done automatically later by `tabular_learner`.

In [None]:
#slow
import pickle
# Load the pre-processing dictionary exported after training.
# NOTE(review): `pickle.load` can execute arbitrary code -- only open files you trust.
with open('procs.pkl', 'rb') as handle:
    procs = pickle.load(handle)
    # Give every `Categorize` mapping an explicit `np.nan` key pointing at index 0,
    # so missing categorical values can be looked up during encoding.
    for proc in procs['Categorize']:
        procs['Categorize'][proc][np.nan] = 0 # we can't pickle np.nan

In [None]:
#export
def FillMissing(arr, procs):
    """Fills in missing data in `conts` and potentially generates new indicator columns

    `procs['Inputs']['conts']` maps column index -> column name. For every such
    column that also has an entry in `procs['FillMissing']['na_dict']`, NaN
    cells are replaced by the stored fill value. When
    `procs['FillMissing']['add_col']` is truthy, one boolean column per filled
    column is appended on the right of `arr` (in the iteration order of
    `conts`): `True` where the original value was present, `False` where it
    was NaN.

    Returns the processed array. The fill is written into `arr` in place; when
    indicator columns are appended the returned array is a new, wider array.
    """
    fill_cfg = procs['FillMissing']
    na_dict, add_col = fill_cfg['na_dict'], fill_cfg['add_col']
    for idx, name in procs['Inputs']['conts'].items():
        if name not in na_dict:
            continue
        column = arr[:, idx]
        # NaN is the only value that is not equal to itself, so this mask is
        # False exactly where data is missing. It must be taken *before* the
        # fill below, otherwise the indicator would always be True.
        present = column == column
        arr[~present, idx] = na_dict[name]
        if add_col:
            # NOTE(review): polarity kept from the original code (True == value
            # present). fastai's `<name>_na` columns presumably use True == missing;
            # confirm against the exported `procs['Categorize']` vocab.
            arr = np.append(arr, np.expand_dims(present, 1), 1)
    return arr

In [None]:
show_doc(FillMissing)

<h4 id="FillMissing" class="doc_header"><code>FillMissing</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>FillMissing</code>(**`arr`**, **`procs`**)

Fills in missing data in `conts` and potentially generates a new categorical column

`arr` is expected to be a `NumPy` array, while `procs` should be the pre-processing dictionary exported after training

In [None]:
#slow
df[0]

array([49, ' Private', 101320, ' Assoc-acdm', 12.0, ' Married-civ-spouse',
       nan, ' Wife', ' White', ' Female', 0, 1902, 40, ' United-States',
       '>=50k'], dtype=object)

In [None]:
#slow
df = FillMissing(df, procs)

In [None]:
#slow
df[0]

array([49, ' Private', 101320, ' Assoc-acdm', 12.0, ' Married-civ-spouse',
       nan, ' Wife', ' White', ' Female', 0, 1902, 40, ' United-States',
       '>=50k', True, True, True], dtype=object)

Boolean indicator columns were appended at the end of the array for the numerical values that may be missing (`True` means the value was present).

In [None]:
#export
def Categorize(arr, procs):
    """Encodes categorical data in `arr` based on `procs`

    `procs['Inputs']['cats']` maps column index -> column name, and
    `procs['Categorize'][name]` maps each raw category value to its integer
    code. Every listed column of `arr` is overwritten in place with the codes.

    Values that are not found in the mapping are encoded as `0`, the same
    index this file assigns to `np.nan` when loading `procs`. This covers
    categories unseen at training time and NaN objects that are not the
    `np.nan` singleton, both of which would otherwise raise `KeyError`.

    Returns `arr`.
    """
    for idx, name in procs['Inputs']['cats'].items():
        vocab = procs['Categorize'][name]
        arr[:, idx] = [vocab.get(value, 0) for value in arr[:, idx]]
    return arr

In [None]:
show_doc(Categorize)

<h4 id="Categorize" class="doc_header"><code>Categorize</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>Categorize</code>(**`arr`**, **`procs`**)

Encodes categorical data in `arr` based on `procs`

`arr` is expected to be a `NumPy` array, while `procs` should be the pre-processing dictionary exported after training

In [None]:
#slow
df[0]

array([49, ' Private', 101320, ' Assoc-acdm', 12.0, ' Married-civ-spouse',
       nan, ' Wife', ' White', ' Female', 0, 1902, 40, ' United-States',
       '>=50k', True, True, True], dtype=object)

In [None]:
#slow
df = Categorize(df, procs)

In [None]:
#slow
df[0]

array([49, 5, 101320, 8, 12.0, 3, 0, 6, 5, ' Female', 0, 1902, 40,
       ' United-States', '>=50k', 2, True, True], dtype=object)

Our categorical variables are now all converted to integers. Any left as strings are not used by the model and are ignored at inference time.

In [None]:
#export
def Normalize(arr, procs):
    """Normalizes continuous data in `arr` based on `procs`

    `procs['Inputs']['conts']` maps column index -> column name, and
    `procs['Normalize'][name]` holds the `'mean'` and `'std'` for that column.
    Each listed column is replaced in place by `(value - mean) / std`.

    Returns `arr` (unchanged when there are no continuous columns).
    """
    for idx, name in procs['Inputs']['conts'].items():
        stats = procs['Normalize'][name]
        arr[:, idx] = (arr[:, idx] - stats['mean']) / stats['std']
    # Return only after *every* continuous column has been processed.
    return arr

In [None]:
show_doc(Normalize)

<h4 id="Normalize" class="doc_header"><code>Normalize</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>Normalize</code>(**`arr`**, **`procs`**)

Normalizes continous data in `arr` based on `procs`

`arr` is expected to be a `NumPy` array, while `procs` should be the pre-processing dictionary exported after training

In [None]:
#slow
df[0]

array([49, 5, 101320, 8, 12.0, 3, 0, 6, 5, ' Female', 0, 1902, 40,
       ' United-States', '>=50k', 2, True, True], dtype=object)

In [None]:
#slow
df = Normalize(df, procs)

In [None]:
#slow
df[0]

array([0.7634343827572744, 5, 101320, 8, 12.0, 3, 0, 6, 5, ' Female', 0,
       1902, 40, ' United-States', '>=50k', 2, True, True], dtype=object)

Our continuous variables have now been normalized for the model

In [None]:
#export
def apply_procs(arr, procs):
    "Run the test-time pre-processing pipeline over a `NumPy` array and return the result"
    # Order matters: fill first, then encode categoricals, then normalize.
    for step in (FillMissing, Categorize, Normalize):
        arr = step(arr, procs)
    return arr

In [None]:
show_doc(apply_procs)

<h4 id="apply_procs" class="doc_header"><code>apply_procs</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>apply_procs</code>(**`arr`**, **`procs`**)

Apply test-time pre-processing on `NumPy` array input

The pre-processing steps must be applied in this specific order: `FillMissing` can append new `is_na` columns to the array, and `Categorize` then has to encode them along with the other categorical columns.

In [None]:
#slow
df = pd.read_csv('/home/ml1/.fastai/data/adult_sample/adult.csv')
df = df.head().to_numpy()

In [None]:
#slow
df[0]

array([49, ' Private', 101320, ' Assoc-acdm', 12.0, ' Married-civ-spouse',
       nan, ' Wife', ' White', ' Female', 0, 1902, 40, ' United-States',
       '>=50k'], dtype=object)

In [None]:
#slow
df = apply_procs(df, procs)

In [None]:
#slow
df[0]

array([0.7634343827572744, 5, 101320, 8, 12.0, 3, 0, 6, 5, ' Female', 0,
       1902, 40, ' United-States', '>=50k', 2, True, True], dtype=object)

In [None]:
#export
class TabularDataset():
    "A tabular `NumPy` dataset based on `procs` with batch size `bs`"
    def __init__(self, arr, procs, bs=64):
        "Stores array, grabs the indices for `cats` and `conts`, and generates batches"
        if bs < 1:
            raise ValueError(f"`bs` must be a positive integer, got {bs}")
        self.arr = arr
        # Materialise the column indices once instead of on every `__getitem__`.
        self.cat_idxs = list(procs['Inputs']['cats'].keys())
        self.cont_idxs = list(procs['Inputs']['conts'].keys())
        self.bs = bs
        self.make_batches()

    def __getitem__(self, x):
        "Grabs batch `x` and returns `[cats, conts]` as `int64` and `float32` arrays"
        batch = self.batches[x]
        cats = batch[:, self.cat_idxs].astype(np.int64)
        conts = batch[:, self.cont_idxs].astype(np.float32)
        return [cats, conts]

    def make_batches(self):
        "Splits data into consecutive batches of `bs` rows; the last batch holds the remainder and may be smaller"
        # Slicing in strides of `bs` never produces an empty trailing batch, so
        # the number of batches always matches `__len__` (zero for empty input).
        self.batches = [self.arr[start:start + self.bs]
                        for start in range(0, len(self.arr), self.bs)]

    def __len__(self):
        "Number of batches, counting a final partial batch"
        return len(self.batches)

In [None]:
show_doc(TabularDataset)

<h2 id="TabularDataset" class="doc_header"><code>class</code> <code>TabularDataset</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>TabularDataset</code>(**`arr`**, **`procs`**, **`bs`**=*`64`*)

A tabular `NumPy` dataset based on `procs` with batch size `bs`

In [None]:
show_doc(TabularDataset.__init__)

<h4 id="TabularDataset.__init__" class="doc_header"><code>TabularDataset.__init__</code><a href="__main__.py#L4" class="source_link" style="float:right">[source]</a></h4>

> <code>TabularDataset.__init__</code>(**`arr`**, **`procs`**, **`bs`**=*`64`*)

Stores array, grabs the indicies for `cats` and `conts`, and generates batches

In [None]:
show_doc(TabularDataset.make_batches)

<h4 id="TabularDataset.make_batches" class="doc_header"><code>TabularDataset.make_batches</code><a href="__main__.py#L19" class="source_link" style="float:right">[source]</a></h4>

> <code>TabularDataset.make_batches</code>()

Splits data into equal sized batches, excluding the final partial

In [None]:
#slow
df = pd.read_csv('/home/ml1/.fastai/data/adult_sample/adult.csv')
df = df.head().to_numpy()
df = apply_procs(df, procs)
dset = TabularDataset(df, procs)

In [None]:
dset[0]

[array([[ 5,  8,  3,  0,  6,  5,  2],
        [ 5, 13,  1,  5,  2,  5,  2],
        [ 5, 12,  1,  0,  5,  3,  1],
        [ 6, 15,  3, 11,  1,  2,  2],
        [ 7,  6,  3,  9,  6,  3,  1]]),
 array([[ 7.6343441e-01,  1.0132000e+05,  1.2000000e+01],
        [ 3.9686874e-01,  2.3674600e+05,  1.4000000e+01],
        [-4.3010049e-02,  9.6185000e+04,  1.0000000e+01],
        [-4.3010049e-02,  1.1284700e+05,  1.5000000e+01],
        [ 2.5024247e-01,  8.2297000e+04,  1.0000000e+01]], dtype=float32)]

In [None]:
#export
class tabular_learner():
    "A `Learner`-like wrapper for tabular data"
    def __init__(self, fn):
        "Accepts a `fn` pointing to exported `procs` and ONNX filename (loads `fn.onnx` and `fn.pkl`)"
        self.ort_session = ort.InferenceSession(fn+'.onnx')
        # Try the CUDA provider first and fall back to CPU if selecting it raises.
        try:
            self.ort_session.set_providers(['CUDAExecutionProvider'])
        except Exception:
            self.ort_session.set_providers(['CPUExecutionProvider'])
        # NOTE(review): `pickle.load` can execute arbitrary code -- only load trusted `procs` files.
        with open(f'{fn}.pkl', 'rb') as handle:
            self.procs = pickle.load(handle)
        # Give every `Categorize` mapping an explicit NaN key pointing at index 0.
        for proc in self.procs['Categorize']:
            self.procs['Categorize'][proc][np.nan] = 0 # we can't pickle np.nan

    def test_dl(self, test_items, bs=64):
        "Applies `procs` to `test_items` and wraps the result in a `TabularDataset` with batch size `bs`"
        dl = apply_procs(test_items, self.procs)
        return TabularDataset(dl, self.procs, bs=bs)

    def predict(self, inps):
        "Predict one batch `inps` (arrays in the order of the ONNX session inputs); returns a list of decoded labels"
        names = [inp.name for inp in self.ort_session.get_inputs()]
        feeds = dict(zip(names, inps))
        scores = self.ort_session.run(None, feeds)[0]
        # Decode with this instance's `procs`, not a module-level `learn` variable.
        return [self.procs['Outputs'][i] for i in np.argmax(scores, axis=1)]

    def get_preds(self, dl=None):
        "Predict on multiple batches of data in `dl`; returns one flat list of decoded labels"
        if dl is None:
            raise TypeError("`dl` must be an iterable of batches, e.g. the result of `test_dl`")
        preds = []
        for batch in dl:
            preds += self.predict(batch)
        return preds

In [None]:
show_doc(tabular_learner)

<h2 id="tabular_learner" class="doc_header"><code>class</code> <code>tabular_learner</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>tabular_learner</code>(**`fn`**)

A `Learner`-like wrapper for tabular data

In [None]:
show_doc(tabular_learner.__init__)

<h4 id="tabular_learner.__init__" class="doc_header"><code>tabular_learner.__init__</code><a href="__main__.py#L4" class="source_link" style="float:right">[source]</a></h4>

> <code>tabular_learner.__init__</code>(**`fn`**)

Accepts a `fn` pointing to exported `procs` and ONNX filename

In [None]:
#slow
learn = tabular_learner('procs')

In [None]:
show_doc(tabular_learner.test_dl)

<h4 id="tabular_learner.test_dl" class="doc_header"><code>tabular_learner.test_dl</code><a href="__main__.py#L18" class="source_link" style="float:right">[source]</a></h4>

> <code>tabular_learner.test_dl</code>(**`test_items`**, **`bs`**=*`64`*)

Applies `procs` to `test_items`

In [None]:
#slow
df = pd.read_csv('/home/ml1/.fastai/data/adult_sample/adult.csv')
dl = learn.test_dl(df.iloc[:5].to_numpy())

In [None]:
#slow
dl[0]

[array([[ 5,  8,  3,  0,  6,  5,  2],
        [ 5, 13,  1,  5,  2,  5,  2],
        [ 5, 12,  1,  0,  5,  3,  1],
        [ 6, 15,  3, 11,  1,  2,  2],
        [ 7,  6,  3,  9,  6,  3,  1]]),
 array([[ 7.6343441e-01,  1.0132000e+05,  1.2000000e+01],
        [ 3.9686874e-01,  2.3674600e+05,  1.4000000e+01],
        [-4.3010049e-02,  9.6185000e+04,  1.0000000e+01],
        [-4.3010049e-02,  1.1284700e+05,  1.5000000e+01],
        [ 2.5024247e-01,  8.2297000e+04,  1.0000000e+01]], dtype=float32)]

In [None]:
show_doc(tabular_learner.predict)

<h4 id="tabular_learner.predict" class="doc_header"><code>tabular_learner.predict</code><a href="__main__.py#L23" class="source_link" style="float:right">[source]</a></h4>

> <code>tabular_learner.predict</code>(**`inps`**)

Predict a single numpy item

In [None]:
#slow
learn.predict(dl[0])

['<50k', '<50k', '<50k', '<50k', '<50k']

In [None]:
show_doc(tabular_learner.get_preds)

<h4 id="tabular_learner.get_preds" class="doc_header"><code>tabular_learner.get_preds</code><a href="__main__.py#L32" class="source_link" style="float:right">[source]</a></h4>

> <code>tabular_learner.get_preds</code>(**`dl`**=*`None`*)

Predict on multiple batches of data in `dl`

In [None]:
#slow
learn.get_preds(dl=dl)

['<50k', '<50k', '<50k', '<50k', '<50k']