In [None]:
# default_exp numpy

# 00_Numpy

> Building an example `Dataset` and `DataLoader` with `NumPy`

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
from fastai2.tabular.all import *

For our data we'll first use `TabularPandas` for pre-processing. One option is to use `TabularPandas` for pre-processing only; another is to integrate `NumPy` directly into it

In [None]:
# Fetch the Adult sample dataset via `untar_data` and read its CSV into a DataFrame
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')

In [None]:
# Column roles and preprocessing steps that are handed to `TabularPandas`
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
y_names = 'salary'
# NOTE(review): no seed is passed, so the split presumably differs between runs — confirm
splits = RandomSplitter()(range_of(df))

We'll still build our regular `TabularPandas`, as we haven't done any `NumPy` modifications yet

In [None]:
# The tabular object every NumPy dataset in this notebook is built from (`to`, `to.train`, `to.valid`)
to = TabularPandas(df, procs=procs, cat_names=cat_names, cont_names=cont_names,
                   y_names=y_names, splits=splits)

In [None]:
#export
class NumpyDataset():
    "A `Numpy` dataset object from `TabularPandas`"
    def __init__(self, to:TabularPandas):
        # `np.long` is a deprecated alias (removed in NumPy 1.24, C `long` in 2.x); pin the width explicitly
        self.cats = to.cats.to_numpy().astype(np.int64)
        self.conts = to.conts.to_numpy().astype(np.float32)
        self.ys = to.ys.to_numpy()
        # Default batch size so the dataset can be indexed before a loader (or the caller) sets `bs`
        self.bs = 1

    def __getitem__(self, idx):
        "Return `self.bs` consecutive rows of `(cats, conts, ys)`, starting at the first entry of `idx`"
        # Only `idx[0]` is used: a batch is a contiguous slice, not a gather over every index
        start = idx[0]
        stop = start + self.bs
        return self.cats[start:stop], self.conts[start:stop], self.ys[start:stop]

    def __len__(self): return len(self.cats)

In [None]:
ds = NumpyDataset(to)

In [None]:
ds.bs = 3

In [None]:
# Indexing takes a list of indices; with `bs = 3` it yields the three rows starting at the first index
a,b,c = ds[[0]]
test_eq(len(a), 3)

In [None]:
#export
class NumpyDataLoader(DataLoader):
    "A `DataLoader` whose batches are sliced straight out of a `NumpyDataset`"
    def __init__(self, dataset, bs=1, **kwargs):
        "Wrap `dataset` and tell it which batch size to slice with"
        super().__init__(dataset, bs=bs, **kwargs)
        # The dataset does the slicing itself, so it has to know the batch size
        self.dataset.bs = bs

    def create_item(self, s):
        "Pass the sampled item `s` through unchanged"
        return s

    def create_batch(self, b):
        "Index the dataset with `b` and return `(cat, cont, y)` as tensors on `self.device`"
        cat, cont, y = self.dataset[b]
        return tuple(tensor(arr).to(self.device) for arr in (cat, cont, y))

In [None]:
dl = NumpyDataLoader(ds, bs=3)

In [None]:
batch = next(iter(dl))

In [None]:
# Ceiling division: the extra partial batch only exists when `len(ds)` is not a multiple of the batch size
test_eq(len(dl), (len(ds)+2)//3)

In [None]:
#export
@patch
def shuffle_fn(x:NumpyDataLoader):
    "Reorder `cats`, `conts` and `ys` of the wrapped dataset with one shared random permutation"
    ds = x.dataset
    perm = np.random.permutation(len(ds))
    # The same permutation is applied to all three arrays so rows stay aligned
    for attr in ('cats', 'conts', 'ys'):
        setattr(ds, attr, getattr(ds, attr)[perm])

In [None]:
#export
@patch
def get_idxs(x:NumpyDataLoader):
    "Return the indices to draw from, reshuffling the underlying dataset when `x.shuffle` is set"
    if x.n is not None:
        # Sized dataset: plain sequential indices over the whole dataset
        idxs = list(range(len(x.dataset)))
    elif x.indexed:
        idxs = Inf.count
    else:
        idxs = Inf.nones
    # Shuffling moves the data, not the indices, so `idxs` stays sequential
    if x.shuffle: x.shuffle_fn()
    return idxs

To confirm that the `NumPy` loaders really are faster, we'll time a full pass over each one against the equivalent fastai `DataLoader`

In [None]:
train_ds = NumpyDataset(to.train)
valid_ds = NumpyDataset(to.valid)

In [None]:
# Training loader: shuffled, last partial batch dropped. Validation loader: default settings
train_dl = NumpyDataLoader(train_ds, bs=64, shuffle=True, drop_last=True)
valid_dl = NumpyDataLoader(valid_ds, bs=64)

In [None]:
# fastai's own loaders at the same batch size, used as the baseline in the timing cells
dls = to.dataloaders(bs=64)

In [None]:
%%timeit
# Numpy
for _ in train_dl: pass

31.2 ms ± 35.8 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
%%timeit
# fastai
for _ in dls[0]: pass

1.02 s ± 784 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%%timeit
# Numpy
for _ in valid_dl: pass

7.35 ms ± 12.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
%%timeit
# fastai
for _ in dls[1]: pass

250 ms ± 1.24 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# export
class NumpyDataLoaders(DataLoaders):
    "Train/valid `NumpyDataLoader`s built from the `train` and `valid` subsets of `to`"
    def __init__(self, to, bs=64, val_bs=None, shuffle_train=True, device='cpu', **kwargs):
        "`val_bs` falls back to `bs`; `path` (if given) goes to `DataLoaders`, every other kwarg to both loaders"
        # `DataLoaders.__init__` takes `path`/`device` only, while the loaders take the
        # per-loader options; forwarding one `kwargs` dict to both makes one side raise
        dls_kwargs = {'path': kwargs.pop('path')} if 'path' in kwargs else {}
        if val_bs is None: val_bs = bs
        train_ds = NumpyDataset(to.train)
        valid_ds = NumpyDataset(to.valid)
        # Training drops the last partial batch; validation keeps every row and is never shuffled
        train = NumpyDataLoader(train_ds, bs=bs, shuffle=shuffle_train, device=device, drop_last=True, **kwargs)
        valid = NumpyDataLoader(valid_ds, bs=val_bs, shuffle=False, device=device, **kwargs)
        super().__init__(train, valid, device=device, **dls_kwargs)

In [None]:
df_np = df.to_numpy()

In [None]:
col_names = df.columns

In [None]:
# Two-way lookup between a column's position and its name
idx_2_col = dict(enumerate(col_names))
col_2_idx = {name: pos for pos, name in idx_2_col.items()}

In [None]:
# Translate each group of column names into column positions (`y_names` is a single name, hence the list)
cat_idxs, cont_idxs, y_idxs = (
    [col_2_idx[n] for n in names] for names in (cat_names, cont_names, [y_names])
)


In [None]:
class NumpyFillMissing(TabularProc):
    "Work-in-progress NumPy take on `FillMissing`: records a fill value per continuous column that has NaNs"
    def __init__(self, fill_strategy=FillStrategy.median, add_col=True, fill_vals=None):
        # A fresh `defaultdict` per instance avoids sharing a mutable default between instances
        if fill_vals is None: fill_vals = defaultdict(int)
        store_attr(self, 'fill_strategy,add_col,fill_vals')
    def setups(self, dsets):
        "Set `self.na_dict` to `{column index: fill value}` for each entry of `cont_idxs` whose column has a NaN"
        # NOTE(review): still reads the notebook-level `df_np`/`cont_idxs`; `dsets` is not consulted yet
        self.na_dict = {}
        for idx in cont_idxs:
            # The strategy is called as `(column, fill)` and uses Series methods, so wrap the raw object column
            col = pd.to_numeric(pd.Series(df_np[:,idx]))
            if col.isna().any(): self.na_dict[idx] = self.fill_strategy(col, self.fill_vals[idx])

['age', 'fnlwgt', 'education-num']

In [None]:
fs = FillStrategy.median
missing = [np.isnan(np.sum(df_np[:,idx])) for idx in cont_idxs]; missing

[False, False, True]

In [None]:
# Per continuous column index: does the column contain a NaN? (a NaN anywhere makes the sum NaN)
missing = {}
for idx in cont_idxs:
    col_sum = np.sum(df_np[:,idx])
    missing[idx] = np.isnan(col_sum)

In [None]:
missing

{0: False, 2: False, 4: True}

In [None]:
missing

{0: False, 2: False, 4: True}

In [None]:
fs

<function fastai2.tabular.core.FillStrategy.median(c, fill)>

In [None]:
missing.keys()

dict_keys([0, 2, 4])

In [None]:
# The strategy calls Series methods (e.g. `.median()`), so wrap the raw column in a numeric Series,
# and only compute a fill value for the columns flagged in `missing`
fill_vals = defaultdict(int)
{n: fs(pd.to_numeric(pd.Series(df_np[:,n])), fill_vals[n]) for n,has_na in missing.items() if has_na}

In [None]:
# Per-row NaN flag over the continuous columns; `df_np` is an object array, so cast to float before `np.isnan`
np.isnan(df_np[:,cont_idxs].astype(np.float64)).any(axis=1)

In [None]:
df_np[:,cont_idxs]

array([[49, 101320, 12.0],
       [44, 236746, 14.0],
       [38, 96185, nan],
       ...,
       [53, 157069, 12.0],
       [32, 217296, 9.0],
       [26, 182308, 10.0]], dtype=object)

In [None]:
class TabularNumpy(CollBase, GetAttr, FilteredBase):
    "First sketch of a NumPy-backed tabular container: it only hands `df` to the base initialiser"
    _default = 'procs'
    with_cont = True
    def __init__(self, df): super().__init__(df)

In [None]:
t_np = TabularNumpy(df_np)

In [None]:
len(t_np)

32561

In [None]:
class TabularNumpy(Tabular):
    "Sketch of a `Tabular` that stores the selected columns as a NumPy array plus index<->name maps"
    _default, with_cont='procs',True
    def __init__(self, df, procs=None, cat_names=None, cont_names=None, y_names=None, y_block=None, splits=None,
                 do_setup=True, device=None, inplace=False, reduce_memory=True):
        # NOTE(review): `Tabular.__init__` is not called and most parameters are unused so far
        # Names may arrive as `None`, a single string (e.g. `y_names='salary'`) or a list
        def _names(o): return [] if o is None else [o] if isinstance(o, str) else list(o)
        cols = _names(cat_names) + _names(cont_names) + _names(y_names)
        self.df = df[cols].to_numpy()
        # Index the columns of the *selected* array, not of the original frame, so that
        # `self.df[:, self.col_2_idx[name]]` really is column `name`
        self.idx_2_col = dict(enumerate(cols))
        self.col_2_idx = {name: i for i, name in self.idx_2_col.items()}
        

In [None]:
class Tabular(CollBase, GetAttr, FilteredBase):
    "A `DataFrame` wrapper that knows which cols are cont/cat/y, and returns rows in `__getitem__`"
    # NOTE(review): several zero-argument methods below are read as plain values inside this class
    # (e.g. `self.x_names + self.y_names` in `all_col_names`), so they are presumably turned into
    # properties somewhere outside this block — confirm
    _default,with_cont='procs',True
    def __init__(self, df, procs=None, cat_names=None, cont_names=None, y_names=None, y_block=None, splits=None,
                 do_setup=True, device=None, inplace=False, reduce_memory=True):
        if inplace and splits is not None and pd.options.mode.chained_assignment is not None:
            warn("Using inplace with splits will trigger a pandas error. Set `pd.options.mode.chained_assignment=None` to avoid it.")
        # Work on a copy unless the caller explicitly asked for in-place processing
        if not inplace: df = df.copy()
        # Concatenate the split index lists and reorder the rows accordingly, so the first split's rows come first
        if splits is not None: df = df.iloc[sum(splits, [])]
        self.dataloaders = delegates(self._dl_type.__init__)(self.dataloaders)
        super().__init__(df)

        self.y_names,self.device = L(y_names),device
        if y_block is None and self.y_names:
            # Make ys categorical if they're not numeric
            ys = df[self.y_names]
            if len(ys.select_dtypes(include='number').columns)!=len(ys.columns): y_block = CategoryBlock()
            else: y_block = RegressionBlock()
        if y_block is not None and do_setup:
            if callable(y_block): y_block = y_block()
            # The target block's type transforms are appended after the user-supplied procs
            procs = L(procs) + y_block.type_tfms
        self.cat_names,self.cont_names,self.procs = L(cat_names),L(cont_names),Pipeline(procs)
        # Row position where the first split ends (all rows when there are no splits); `subset` slices on it
        self.split = len(df) if splits is None else len(splits[0])
        if reduce_memory:
            if len(self.cat_names) > 0: self.reduce_cats()
            if len(self.cont_names) > 0: self.reduce_conts()
        if do_setup: self.setup()

    # Build another instance around `df` with the same procs/names/device, skipping setup and memory reduction
    def new(self, df):
        return type(self)(df, do_setup=False, reduce_memory=False, y_block=TransformBlock(),
                          **attrdict(self, 'procs','cat_names','cont_names','y_names', 'device'))

    # Subset 0 is rows [0, split); any other index is rows [split, len)
    def subset(self, i): return self.new(self.items[slice(0,self.split) if i==0 else slice(self.split,len(self))])
    # Copies the wrapped items but returns this same object, not a new wrapper
    def copy(self): self.items = self.items.copy(); return self
    def decode(self): return self.procs.decode(self)
    def decode_row(self, row): return self.new(pd.DataFrame(row).T).decode().items.iloc[0]
    # Categorical columns are converted through `self.train` only, continuous columns on the whole object
    def reduce_cats(self): self.train[self.cat_names] = self.train[self.cat_names].astype('category')
    def reduce_conts(self): self[self.cont_names] = self[self.cont_names].astype(np.float32)
    def show(self, max_n=10, **kwargs): display_df(self.new(self.all_cols[:max_n]).decode().items)
    def setup(self): self.procs.setup(self)
    def process(self): self.procs(self)
    def loc(self): return self.items.loc
    def iloc(self): return _TabIloc(self)
    def targ(self): return self.items[self.y_names]
    def x_names (self): return self.cat_names + self.cont_names
    def n_subsets(self): return 2
    def y(self): return self[self.y_names[0]]
    def new_empty(self): return self.new(pd.DataFrame({}, columns=self.items.columns))
    def to_device(self, d=None):
        self.device = d
        return self

    # The y names are included only when every one of them is present in the items' columns
    def all_col_names (self):
        ys = [n for n in self.y_names if n in self.items.columns]
        return self.x_names + self.y_names if len(ys) == len(self.y_names) else self.x_names