# More PyTorch concepts

## Init, helpers, utils, ...

In [None]:
import matplotlib.pyplot as plt
from pprint import pprint
import numpy as np

from IPython.core.debugger import set_trace

%matplotlib inline

In [None]:
from ppt.utils import attr

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision

# Dataset
`Dataset` gives you information about the number of samples (implement `__len__`) and gives you the sample at a given index (implement `__getitem__`).
It's a nice and simple abstraction to work with data.

In [None]:
from ppt.utils import DogsCatsDataset

In [None]:
train_ds = DogsCatsDataset("../data/raw", "sample/train")

In [None]:
!tree -d ../data/raw/dogscats/

In [None]:
train_ds

In [None]:
# the __len__ method
len(train_ds)

In [None]:
# the __getitem__ method
train_ds[0]

In [None]:
train_ds[14][0]

Optionally, datasets offer some convenience functions and attributes.
This is not enforced by the interface.

In [None]:
train_ds.classes

In [None]:
train_ds.class_to_idx

In [None]:
train_ds.imgs

In [None]:
# Walk over the dataset: each item unpacks into an image and its integer label id.
for img, label_id in train_ds:
    # Show the numeric id next to the class name it indexes in `classes`.
    print(label_id, train_ds.classes[label_id])
    # Render the sample in the cell output.
    display(img)

# Transforms

Common image transformations that can be composed/chained.

In [None]:
from torchvision import transforms

# Side length (in pixels) of the square crop taken below.
IMG_SIZE = 224
# Per-channel mean/std intended for `Normalize`
# (presumably the ImageNet statistics -- TODO confirm).
_mean = [0.485, 0.456, 0.406]
_std = [0.229, 0.224, 0.225]


# Transform pipeline, applied in the order listed. Only the random crop is
# active; the remaining steps are commented out, presumably so they can be
# switched on one at a time while exploring their effect.
trans = transforms.Compose([
    transforms.RandomCrop(IMG_SIZE),
    # transforms.RandomHorizontalFlip(),
    # transforms.ColorJitter(.3, .3, .3),
    # transforms.ToTensor(),
    # transforms.Normalize(_mean, _std),
])

# Apply the pipeline to the image of sample 13; the result is the cell output.
trans(train_ds[13][0])

Ref:
- https://pytorch.org/docs/stable/torchvision/transforms.html
- https://pytorch.org/tutorials/beginner/data_loading_tutorial.html
- https://github.com/mdbloice/Augmentor
- https://github.com/aleju/imgaug

Shout-out:
- High-performance image augmentation with pillow-simd: https://github.com/uploadcare/pillow-simd and http://python-pillow.org/pillow-perf/
- [Improving Deep Learning Performance with AutoAugment](https://ai.googleblog.com/2018/06/improving-deep-learning-performance.html) ([paper](https://arxiv.org/abs/1805.09501) | [pytorch implementation](https://github.com/DeepVoltaire/AutoAugment))

# Dataloader
The `DataLoader` class offers batch loading of datasets with multi-processing and different sample strategies.

In [None]:
from torch.utils.data import DataLoader

DataLoader?

In [None]:
# Batch the dataset: 2 samples per batch, shuffled, loaded by 4 worker processes.
train_dl = DataLoader(train_ds, batch_size=2, shuffle=True, num_workers=4)

# Pull a single batch by hand instead of looping over the whole loader.
train_iter = iter(train_dl)
# NOTE(review): `train_ds` was created without a transform at this point, so
# the samples are presumably raw images and default batching may fail on them
# -- confirm whether that failure is the intended demonstration.
X, y = next(train_iter)

In [None]:
# Recreate the dataset, this time passing `trans` as its transform, and wrap it
# in a loader with the same batching settings as before.
train_ds = DogsCatsDataset("../data/raw", "sample/train", transform=trans)
train_dl = DataLoader(train_ds, batch_size=2, shuffle=True, num_workers=4)

In [None]:
train_iter = iter(train_dl)
X, y = next(train_iter)

In [None]:
# Inspect the shapes of the batched inputs and labels.
print("X:", X.shape)
print("y:", y.shape)

# Sampler
A `Sampler` defines **how** to sample from the dataset.

Examples:
- `SequentialSampler`
- `RandomSampler`
- `SubsetRandomSampler`
- `WeightedRandomSampler`

Write your own by simply implementing `__iter__` to iterate over the indices of the dataset.

Ref:
- https://pytorch.org/docs/stable/data.html#torch.utils.data.sampler.Sampler

In [None]:
torch.utils.data.sampler.SubsetRandomSampler??

# Recap
- Transforms
- Dataset
- DataLoader
- Sampler

**simple but extensible interfaces**

# Exercise: write your own dataset
Extend the `DogsCatsDataset` such that you can specify the size of the dataset, i.e. the number of samples.

In [None]:
from torch.utils.data import Dataset


class MyDataSet(Dataset):
    """Skeleton dataset for the exercise; it currently holds no samples.

    Fill in the methods below so that the dataset serves real samples and
    exposes only the requested number of them.
    """

    def __init__(self):
        super().__init__()

    def __len__(self):
        """Return the number of samples (0 until the exercise is implemented)."""
        return 0

    def __getitem__(self, idx):
        """Return the sample at ``idx``.

        Raises:
            IndexError: always, for now -- an empty dataset has no valid index.
                Raising (rather than returning ``None``) is what lets
                ``for sample in dataset`` terminate: without an ``__iter__``
                method, iteration calls ``__getitem__`` with 0, 1, 2, ...
                until ``IndexError`` is raised.
        """
        raise IndexError(
            f"index {idx} is out of range for a dataset of size {len(self)}"
        )