In [None]:
# default_exp numpy

# 00_Numpy

> Building an example `Dataset` and `DataLoader` with `NumPy`

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
from fastai2.tabular.all import *

For our data we'll first use `TabularPandas` for pre-processing. One option is to use `TabularPandas` for pre-processing only; another is to integrate `NumPy` directly into it.

In [None]:
# Locate the ADULT_SAMPLE data via `untar_data` and read its `adult.csv` into a DataFrame
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')

In [None]:
# Column roles and pre-processing steps handed to `TabularPandas`
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
y_names = 'salary'
# Fixed seed so the train/valid split is the same on every Restart & Run All
splits = RandomSplitter(seed=42)(range_of(df))

We'll still build our regular `TabularPandas`, as we haven't done any `NumPy` modifications yet

In [None]:
# Build the `TabularPandas` from `df` with the procs, column names and splits defined above
to = TabularPandas(df, procs=procs, cat_names=cat_names, cont_names=cont_names,
                   y_names=y_names, splits=splits)

In [None]:
#export
class NumpyDataset():
    "A `Numpy` dataset object from `TabularPandas`"
    def __init__(self, to:TabularPandas, bs=1):
        """Copy the categorical, continuous and target columns of `to` into NumPy arrays.

        `bs` is the number of rows `__getitem__` returns per call; `NumpyDataLoader`
        overwrites it with its own batch size.
        """
        # `np.long` was removed in NumPy 1.24 and is platform-dependent where it exists;
        # `np.int64` gives a fixed 64-bit integer dtype everywhere
        self.cats = to.cats.to_numpy().astype(np.int64)
        self.conts = to.conts.to_numpy().astype(np.float32)
        self.ys = to.ys.to_numpy()
        # Always defined, so indexing works even before a loader sets it
        self.bs = bs

    def __getitem__(self, idx):
        "Return `bs` consecutive rows of (cats, conts, ys), starting at the first index in `idx`"
        # `idx` is a sequence of indices; only its first element is used as the slice start
        start = idx[0]
        stop = start + self.bs
        return self.cats[start:stop], self.conts[start:stop], self.ys[start:stop]

    def __len__(self):
        "Number of rows in the dataset"
        return len(self.cats)

In [None]:
# Wrap the processed `TabularPandas` in a `NumpyDataset`
ds = NumpyDataset(to)

In [None]:
# `__getitem__` reads `bs` to decide how many rows to slice; `NumpyDataLoader` normally sets it
ds.bs = 3

In [None]:
# Indexing with a one-element list returns `bs` rows from each array, starting at that index
a,b,c = ds[[0]]
# Check all three arrays (cats, conts, ys), not only the first
test_eq([len(a), len(b), len(c)], [3, 3, 3])

In [None]:
#export
class NumpyDataLoader(DataLoader):
    "A `DataLoader` for a `NumpyDataset`"
    def __init__(self, dataset, bs=1, **kwargs):
        "Wrap `dataset`, forwarding `bs` and `kwargs` to `DataLoader` and recording `bs` on the dataset"
        super().__init__(dataset, bs=bs, **kwargs)
        # `NumpyDataset.__getitem__` slices `bs` rows itself, so it has to know the batch size
        self.dataset.bs = bs

    def create_item(self, s):
        "Pass the sampled index through untouched; rows are fetched per batch in `create_batch`"
        return s

    def create_batch(self, b):
        "Index the dataset with the batch of indices `b` and move each array to `self.device` as a tensor"
        # For a `NumpyDataset` this yields the (cats, conts, ys) slices starting at `b[0]`
        cat, cont, y = self.dataset[b]
        return tuple(tensor(o).to(self.device) for o in (cat, cont, y))

In [None]:
# A loader with a tiny batch size, for a quick sanity check
dl = NumpyDataLoader(ds, bs=3)

In [None]:
# Pull a single batch to confirm that iteration works
batch = next(iter(dl))

In [None]:
# Expected batch count is ceil(len(ds)/bs); `len(ds)//3+1` over-counts by one
# whenever len(ds) is an exact multiple of the batch size
test_eq(len(dl), (len(ds) + 2)//3)

In [None]:
#export
@patch
def shuffle_fn(x:NumpyDataLoader):
    "Reorder the wrapped dataset's arrays in place with one shared random permutation"
    ds = x.dataset
    perm = np.random.permutation(len(ds))
    # The same permutation is applied to every array so rows stay aligned
    for name in ('cats', 'conts', 'ys'):
        setattr(ds, name, getattr(ds, name)[perm])

In [None]:
#export
@patch
def get_idxs(x:NumpyDataLoader):
    "Return the indices to iterate over, shuffling the wrapped dataset first when `shuffle` is set"
    endless = Inf.count if x.indexed else Inf.nones
    # With a known size, use one index per dataset row instead of an endless stream
    idxs = endless if x.n is None else list(range(len(x.dataset)))
    # Shuffling reorders the dataset itself; the indices stay in ascending order
    if x.shuffle: x.shuffle_fn()
    return idxs

To check that the `NumPy` loaders really are faster, we'll time a full pass over each one against the equivalent fastai loader

In [None]:
# Separate datasets for the train and valid splits of `to`
train_ds = NumpyDataset(to.train)
valid_ds = NumpyDataset(to.valid)

In [None]:
# The train loader shuffles and drops the last partial batch; the valid loader uses the loader defaults
train_dl = NumpyDataLoader(train_ds, bs=64, shuffle=True, drop_last=True)
valid_dl = NumpyDataLoader(valid_ds, bs=64)

In [None]:
# Standard fastai loaders over the same `TabularPandas`, used as the timing baseline
dls = to.dataloaders(bs=64)

In [None]:
%%timeit
# NumPy-backed loader: one full pass over `train_dl`
for _ in train_dl: pass

31.2 ms ± 35.8 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
%%timeit
# fastai baseline: one full pass over `dls[0]`, compared against `train_dl`
for _ in dls[0]: pass

1.02 s ± 784 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%%timeit
# NumPy-backed loader: one full pass over `valid_dl`
for _ in valid_dl: pass

7.35 ms ± 12.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
%%timeit
# fastai baseline: one full pass over `dls[1]`, compared against `valid_dl`
for _ in dls[1]: pass

250 ms ± 1.24 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# export
class NumpyDataLoaders(DataLoaders):
    "Train and valid `NumpyDataLoader`s built from the splits of a `TabularPandas`"
    def __init__(self, to, bs=64, val_bs=None, shuffle_train=True, device='cpu', **kwargs):
        """Create both loaders from `to.train` and `to.valid`.

        `val_bs` falls back to `bs` when not given. Extra `kwargs` go to each
        `NumpyDataLoader`, except `path`, which is handed to `DataLoaders`.
        """
        # `DataLoaders.__init__` only takes `path` and `device` as keywords, so forwarding
        # loader kwargs to it would raise; split `path` off and keep the rest for the loaders
        path = kwargs.pop('path', '.')
        if val_bs is None: val_bs = bs
        # The train loader drops the last partial batch; the valid loader never shuffles
        train = NumpyDataLoader(NumpyDataset(to.train), bs=bs, shuffle=shuffle_train,
                                device=device, drop_last=True, **kwargs)
        valid = NumpyDataLoader(NumpyDataset(to.valid), bs=val_bs, shuffle=False,
                                device=device, **kwargs)
        super().__init__(train, valid, path=path, device=device)