# data

> Index samplers, a base `Dataset` class, and a minimal `DataLoader` for iterating over a dataset in batches.

In [None]:
#| default_exp data

In [None]:
#| export
from typing import (
    Iterable,
    Callable,
    Iterator,
    Optional,
    List
)
import minima as mi
from minima import Tensor
from minima import init
import fastcore.all as fc
import random

In [None]:
#| export
class Sampler:
    """
    Produce the indices `0 .. n-1` of a dataset, optionally in random order.

    Only the length of `ds` is recorded at construction time; every call to
    `iter()` builds a fresh list of indices.

    Args:
        ds (Iterable[int]): Collection whose length determines how many indices are produced.
        shuffle (bool): When True, the indices are shuffled with `random.shuffle` on each iteration.

    Example:
        >>> x = range(10)
        >>> sampler = Sampler(x, shuffle=True)
    """

    def __init__(self, ds: Iterable[int], shuffle: bool = False):
        # The data itself is not kept, only how many items it holds.
        self.n = len(ds)
        self.shuffle = shuffle

    def __iter__(self) -> Iterator[int]:
        indices = [*range(self.n)]
        if self.shuffle:
            random.shuffle(indices)
        return iter(indices)

In [None]:
#| export
class BatchSampler:
    """
    Group the indices produced by a sampler into batches.

    Args:
        sampler (Sampler): Source of indices; it is re-iterated each time the batch sampler is iterated.
        bs (int): Number of indices per batch.
        drop_last (bool): Whether to drop the last batch if it is smaller than the batch size.

    Example:
        >>> x = range(10)
        >>> sampler = Sampler(x, shuffle=True)
        >>> batch_sampler = BatchSampler(sampler, bs=3)
    """

    def __init__(self, sampler: Sampler, bs: int, drop_last: bool = False):
        self.sampler, self.bs, self.drop_last = sampler, bs, drop_last

    def __iter__(self):
        # fastcore's `chunked` does the actual grouping of the index stream.
        index_stream = iter(self.sampler)
        batches = fc.chunked(index_stream, self.bs, drop_last=self.drop_last)
        yield from batches


In [None]:
#| export
class Dataset:
    r"""Base class for datasets that map keys to data samples.

    Subclasses are expected to override:
    `__getitem__`, to fetch the data sample stored under a given key.
    `__len__`, to report the size of the dataset.

    Args:
        transforms (Optional[List]): Callables that `apply_transforms` runs in
            order; `None` (the default) means no transformation.
    """

    def __init__(self, transforms: Optional[List]=None):
        self.transforms = transforms

    def __getitem__(self, index) -> object:
        """
        Fetch the item stored at `index`. Must be overridden by subclasses.

        Args:
            index: Key of the item to fetch.

        Returns:
            object: The data sample at the given index.

        Raises:
            NotImplementedError: Always, in this base class.

        Example:
            >>> dataset[0]
            (1, 0)
        """
        raise NotImplementedError

    def __len__(self) -> int:
        """
        Report the size of the dataset. Must be overridden by subclasses.

        Returns:
            int: Length of the dataset.

        Raises:
            NotImplementedError: Always, in this base class.

        Example:
            >>> len(dataset)
            5
        """
        raise NotImplementedError

    def apply_transforms(self, x):
        """
        Pass `x` through every configured transform, in order.

        Args:
            x: The value to transform.

        Returns:
            The output of the last transform, or `x` unchanged when no
            transforms were configured.
        """
        if self.transforms is None:
            return x
        for transform in self.transforms:
            x = transform(x)
        return x

In [None]:
#|export

class DataLoader:
    """
    Iterate over a dataset one batch at a time.

    Args:
        dataset (Dataset): The dataset to load from. It is indexed with a whole
            batch of indices at once, so it must accept a list of indices.
        batch_size (int): Number of samples per batch. Only used when
            `batch_sampler` is not given.
        shuffle (bool): Whether the default sampler shuffles the indices. Only
            used when `sampler` is not given.
        sampler (Optional[Sampler]): Source of indices. Defaults to
            `Sampler(dataset, shuffle)`.
        batch_sampler (Optional[BatchSampler]): Source of index batches.
            Defaults to `BatchSampler(sampler, batch_size, drop_last)`.
        num_workers (int): Stored but not used yet.
        collate_fn (Optional[Callable]): If given, applied to each batch
            fetched from the dataset before it is yielded.
        drop_last (bool): Whether the default batch sampler drops a final batch
            smaller than `batch_size`.

    Example:
        >>> dataloader = DataLoader(dataset, batch_size)
    """

    def __init__(self,
                 dataset: Dataset,
                 batch_size: int = 1,
                 shuffle: bool = True,
                 sampler: Optional[Sampler] = None,
                 batch_sampler: Optional[BatchSampler] = None,
                 num_workers: int = 0,
                 collate_fn: Optional[Callable] = None,
                 drop_last: bool = False):

        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Compare against None rather than testing truthiness: a caller-supplied
        # sampler that happens to be falsy (e.g. an empty list of indices, or an
        # object whose __len__ returns 0) must not be replaced by the default.
        if sampler is None:
            sampler = Sampler(dataset, shuffle)
        self.sampler = sampler
        if batch_sampler is None:
            batch_sampler = BatchSampler(self.sampler, batch_size, drop_last)
        self.batch_sampler = batch_sampler
        self.num_workers = num_workers # --> TODO: implement a multiprocessing DataLoader :3
        self.collate_fn = collate_fn
        self.drop_last = drop_last

    def __iter__(self):
        """
        Get an iterator over the DataLoader.

        Yields:
            Whatever `dataset[batch_idxs]` returns for each batch of indices,
            passed through `collate_fn` when one was provided.

        Example:
            >>> for batch in dataloader:
            >>>     # Process the batch
        """
        # Read once so the choice is fixed for the whole pass.
        collate = self.collate_fn
        for batch_idxs in self.batch_sampler:
            batch = self.dataset[batch_idxs]
            yield batch if collate is None else collate(batch)

In [None]:
# Toy data for the demo: 100 samples with 10 features each.
X = init.rand(100, 10)
# One label per row of X (the sample output further down shows boolean values).
Y = init.randb(X.shape[0])
# Show both shapes as the cell output.
X.shape, Y.shape

((100, 10), (100,))

In [None]:
class MiDataset(Dataset):
    """
    Dataset that pairs a container of inputs with a container of targets.

    Args:
        x: Inputs; must expose `.shape` and support indexing.
        y: Targets; indexed with the same key as `x`.
        transforms: Optional list of callables stored on the base class and
            run by `apply_transforms`. They are not applied automatically in
            `__getitem__`.
    """

    def __init__(self, x, y, transforms=None):
        # Initialise the base class so `self.transforms` exists; without this
        # the inherited `apply_transforms` fails with AttributeError.
        super().__init__(transforms)
        self.x = x
        self.y = y

    def __len__(self) -> int:
        """Return the number of samples, taken from the first dimension of `x`."""
        return self.x.shape[0]

    def __getitem__(self, i: int):
        """
        Return the `(input, target)` pair at `i`.

        `i` is forwarded unchanged to both containers, so any key they accept
        (for example a slice) works as well as an int.
        """
        return self.x[i], self.y[i]

In [None]:
# Pair the features and labels so they can be indexed together.
ds = MiDataset(X,Y)

In [None]:
# Goes through MiDataset.__len__, i.e. the first dimension of X.
len(ds)

100

In [None]:
# The slice is passed straight to x[i] and y[i], giving the first ten samples and their labels.
ds[:10]

(minima.Tensor(
 [[0.44827  0.972257 0.162093 0.651357 0.674643 0.058556 0.77044  0.114458 0.036035 0.724403]
  [0.279256 0.762198 0.098522 0.023235 0.061441 0.500864 0.882805 0.665565 0.655441 0.955076]
  [0.053199 0.073746 0.613605 0.955496 0.487865 0.366099 0.220528 0.171526 0.740024 0.818154]
  [0.759455 0.615499 0.838791 0.175651 0.496732 0.947632 0.906288 0.388928 0.570359 0.364755]
  [0.50476  0.035453 0.802756 0.834585 0.86217  0.033867 0.002718 0.381732 0.323061 0.690836]
  [0.714487 0.905631 0.675356 0.940675 0.476971 0.945839 0.143995 0.195337 0.354678 0.979314]
  [0.90923  0.103803 0.428037 0.422163 0.358182 0.225472 0.398135 0.49203  0.21131  0.908285]
  [0.905441 0.111027 0.678214 0.414372 0.153489 0.818518 0.380152 0.780927 0.337182 0.018802]
  [0.245062 0.033404 0.424691 0.865934 0.583683 0.369279 0.939267 0.837757 0.353129 0.2387  ]
  [0.392505 0.558509 0.37986  0.237742 0.799824 0.041586 0.015288 0.975757 0.467785 0.111904]]),
 minima.Tensor(
 [ True  True False  True

## Export

In [None]:
# Run nbdev's export step for this notebook.
import nbdev

nbdev.nbdev_export()