# Data

## Dataset

In [1]:
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from torchvision.transforms import *
from PIL import Image 
import pathlib

class ShapeDataset(Dataset):
    """Dataset of shape images stored as ``<root_dir>/<class name>/.../*.jpg``.

    Each item is an ``(image, class_index)`` pair. The class name is the first
    directory component below ``root_dir`` and is mapped to an index through
    ``class_to_idx`` (``circle`` -> 0, ``poly`` -> 1, ``rect`` -> 2).
    """

    def __init__(self, root_dir, transform=transforms.Compose([Resize(256), RandomCrop(224), ToTensor()])):
        """
        :param root_dir: Directory whose immediate sub-directories are named after the classes.
        :param transform: Callable applied to the opened PIL image. When falsy
                          (e.g. ``None``) the PIL image is returned unchanged.
        """
        self.root_dir = root_dir
        self.transform = transform
        self.__init()

    def __init(self):
        """Collect the JPG files below ``root_dir`` and build the class-name -> index map."""
        # Recursive glob; files under Jupyter's checkpoint folders are skipped.
        # The list keeps whatever order the glob yields.
        self.jpg_files = [f for f in pathlib.Path(self.root_dir).glob('**/*.jpg')
                          if '.ipynb_checkpoints' not in f.parts]
        self.class_to_idx = {clazz: i for i, clazz in enumerate(['circle', 'poly', 'rect'])}

    def __len__(self):
        """Return the number of indexed JPG files."""
        return len(self.jpg_files)

    def __getitem__(self, idx):
        """Return ``(image, class_index)`` for the file at position ``idx``.

        :raises KeyError: If the file's top-level directory under ``root_dir``
                          is not one of the known class names.
        """
        img_path = self.jpg_files[idx]

        image = Image.open(img_path)
        if self.transform:
            image = self.transform(image)

        # Take the class directory relative to root_dir rather than from a fixed
        # position in the full path, so the lookup does not depend on how many
        # components root_dir itself has (absolute paths, '../data/shapes', ...).
        class_name = img_path.relative_to(self.root_dir).parts[0]
        clazz = self.class_to_idx[class_name]

        return image, clazz

In [2]:
# Index the training images; no transform is passed, so ShapeDataset's default
# pipeline (Resize(256) -> RandomCrop(224) -> ToTensor()) is used.
dataset = ShapeDataset('./shapes/train')

In [3]:
# Number of .jpg files the dataset indexed.
len(dataset)

30

In [4]:
# Paths collected by the recursive glob, in the order the glob returned them.
dataset.jpg_files

[PosixPath('shapes/train/rect/0026.jpg'),
 PosixPath('shapes/train/rect/0029.jpg'),
 PosixPath('shapes/train/rect/0024.jpg'),
 PosixPath('shapes/train/rect/0023.jpg'),
 PosixPath('shapes/train/rect/0021.jpg'),
 PosixPath('shapes/train/rect/0028.jpg'),
 PosixPath('shapes/train/rect/0025.jpg'),
 PosixPath('shapes/train/rect/0022.jpg'),
 PosixPath('shapes/train/rect/0027.jpg'),
 PosixPath('shapes/train/rect/0020.jpg'),
 PosixPath('shapes/train/poly/0012.jpg'),
 PosixPath('shapes/train/poly/0016.jpg'),
 PosixPath('shapes/train/poly/0011.jpg'),
 PosixPath('shapes/train/poly/0018.jpg'),
 PosixPath('shapes/train/poly/0015.jpg'),
 PosixPath('shapes/train/poly/0017.jpg'),
 PosixPath('shapes/train/poly/0010.jpg'),
 PosixPath('shapes/train/poly/0014.jpg'),
 PosixPath('shapes/train/poly/0013.jpg'),
 PosixPath('shapes/train/poly/0019.jpg'),
 PosixPath('shapes/train/circle/0008.jpg'),
 PosixPath('shapes/train/circle/0007.jpg'),
 PosixPath('shapes/train/circle/0006.jpg'),
 PosixPath('shapes/train/cir

In [5]:
for i in range(len(dataset)):
    # Fetch the sample once: every dataset[i] re-opens the file and re-applies
    # the random crop, so indexing per printed field would describe several
    # different samples (and do the work five times).
    sample = dataset[i]
    image, label = sample
    print(i, type(sample),
          type(image),
          type(label),
          label,
          image.shape)

0 <class 'tuple'> <class 'torch.Tensor'> <class 'int'> 2 torch.Size([3, 224, 224])
1 <class 'tuple'> <class 'torch.Tensor'> <class 'int'> 2 torch.Size([3, 224, 224])
2 <class 'tuple'> <class 'torch.Tensor'> <class 'int'> 2 torch.Size([3, 224, 224])
3 <class 'tuple'> <class 'torch.Tensor'> <class 'int'> 2 torch.Size([3, 224, 224])
4 <class 'tuple'> <class 'torch.Tensor'> <class 'int'> 2 torch.Size([3, 224, 224])
5 <class 'tuple'> <class 'torch.Tensor'> <class 'int'> 2 torch.Size([3, 224, 224])
6 <class 'tuple'> <class 'torch.Tensor'> <class 'int'> 2 torch.Size([3, 224, 224])
7 <class 'tuple'> <class 'torch.Tensor'> <class 'int'> 2 torch.Size([3, 224, 224])
8 <class 'tuple'> <class 'torch.Tensor'> <class 'int'> 2 torch.Size([3, 224, 224])
9 <class 'tuple'> <class 'torch.Tensor'> <class 'int'> 2 torch.Size([3, 224, 224])
10 <class 'tuple'> <class 'torch.Tensor'> <class 'int'> 1 torch.Size([3, 224, 224])
11 <class 'tuple'> <class 'torch.Tensor'> <class 'int'> 1 torch.Size([3, 224, 224])
12

## ImageFolder

In [6]:
from torchvision import datasets

# Same preprocessing steps as ShapeDataset's default transform.
transform = transforms.Compose([
    Resize(256),
    RandomCrop(224),
    ToTensor(),
])
image_folder = datasets.ImageFolder(root='./shapes/train', transform=transform)

In [7]:
# Class names ImageFolder exposes for this directory.
image_folder.classes

['circle', 'poly', 'rect']

In [8]:
# Show the index assigned to each class name.
for class_name in image_folder.classes:
    class_index = image_folder.class_to_idx[class_name]
    print(f'{class_name} {class_index}')

circle 0
poly 1
rect 2


In [9]:
# Number of samples in the ImageFolder dataset.
len(image_folder)

30

In [10]:
for i in range(len(image_folder)):
    # Fetch the sample once: each image_folder[i] reloads the image and applies
    # a fresh random crop, so indexing per printed field would mix several
    # different samples in one row (and do the work five times).
    sample = image_folder[i]
    image, label = sample
    print(i, type(sample),
          type(image),
          type(label),
          label,
          image.shape)

0 <class 'tuple'> <class 'torch.Tensor'> <class 'int'> 0 torch.Size([3, 224, 224])
1 <class 'tuple'> <class 'torch.Tensor'> <class 'int'> 0 torch.Size([3, 224, 224])
2 <class 'tuple'> <class 'torch.Tensor'> <class 'int'> 0 torch.Size([3, 224, 224])
3 <class 'tuple'> <class 'torch.Tensor'> <class 'int'> 0 torch.Size([3, 224, 224])
4 <class 'tuple'> <class 'torch.Tensor'> <class 'int'> 0 torch.Size([3, 224, 224])
5 <class 'tuple'> <class 'torch.Tensor'> <class 'int'> 0 torch.Size([3, 224, 224])
6 <class 'tuple'> <class 'torch.Tensor'> <class 'int'> 0 torch.Size([3, 224, 224])
7 <class 'tuple'> <class 'torch.Tensor'> <class 'int'> 0 torch.Size([3, 224, 224])
8 <class 'tuple'> <class 'torch.Tensor'> <class 'int'> 0 torch.Size([3, 224, 224])
9 <class 'tuple'> <class 'torch.Tensor'> <class 'int'> 0 torch.Size([3, 224, 224])
10 <class 'tuple'> <class 'torch.Tensor'> <class 'int'> 1 torch.Size([3, 224, 224])
11 <class 'tuple'> <class 'torch.Tensor'> <class 'int'> 1 torch.Size([3, 224, 224])
12

## DataLoader

### Using a custom dataset

In [11]:
# Batches of two samples from the custom dataset, unshuffled (dataset index order),
# loaded with 4 workers.
dataloader = DataLoader(dataset, batch_size=2, shuffle=False, num_workers=4)

In [12]:
# Each batch holds two elements; show the container type and the type of the first.
for batch_idx, batch in enumerate(dataloader):
    first_element = batch[0]
    print(batch_idx, type(batch), len(batch), type(first_element))

0 <class 'list'> 2 <class 'torch.Tensor'>
1 <class 'list'> 2 <class 'torch.Tensor'>
2 <class 'list'> 2 <class 'torch.Tensor'>
3 <class 'list'> 2 <class 'torch.Tensor'>
4 <class 'list'> 2 <class 'torch.Tensor'>
5 <class 'list'> 2 <class 'torch.Tensor'>
6 <class 'list'> 2 <class 'torch.Tensor'>
7 <class 'list'> 2 <class 'torch.Tensor'>
8 <class 'list'> 2 <class 'torch.Tensor'>
9 <class 'list'> 2 <class 'torch.Tensor'>
10 <class 'list'> 2 <class 'torch.Tensor'>
11 <class 'list'> 2 <class 'torch.Tensor'>
12 <class 'list'> 2 <class 'torch.Tensor'>
13 <class 'list'> 2 <class 'torch.Tensor'>
14 <class 'list'> 2 <class 'torch.Tensor'>


In [13]:
# Unpack each batch and report the types and shapes of both halves.
for images, targets in dataloader:
    print(f'{type(images)} {type(targets)} : {images.shape} {targets.shape}')

<class 'torch.Tensor'> <class 'torch.Tensor'> : torch.Size([2, 3, 224, 224]) torch.Size([2])
<class 'torch.Tensor'> <class 'torch.Tensor'> : torch.Size([2, 3, 224, 224]) torch.Size([2])
<class 'torch.Tensor'> <class 'torch.Tensor'> : torch.Size([2, 3, 224, 224]) torch.Size([2])
<class 'torch.Tensor'> <class 'torch.Tensor'> : torch.Size([2, 3, 224, 224]) torch.Size([2])
<class 'torch.Tensor'> <class 'torch.Tensor'> : torch.Size([2, 3, 224, 224]) torch.Size([2])
<class 'torch.Tensor'> <class 'torch.Tensor'> : torch.Size([2, 3, 224, 224]) torch.Size([2])
<class 'torch.Tensor'> <class 'torch.Tensor'> : torch.Size([2, 3, 224, 224]) torch.Size([2])
<class 'torch.Tensor'> <class 'torch.Tensor'> : torch.Size([2, 3, 224, 224]) torch.Size([2])
<class 'torch.Tensor'> <class 'torch.Tensor'> : torch.Size([2, 3, 224, 224]) torch.Size([2])
<class 'torch.Tensor'> <class 'torch.Tensor'> : torch.Size([2, 3, 224, 224]) torch.Size([2])
<class 'torch.Tensor'> <class 'torch.Tensor'> : torch.Size([2, 3, 224,

In [14]:
# Only the label half of each batch is of interest here.
for _images, targets in dataloader:
    print(targets)

tensor([2, 2])
tensor([2, 2])
tensor([2, 2])
tensor([2, 2])
tensor([2, 2])
tensor([1, 1])
tensor([1, 1])
tensor([1, 1])
tensor([1, 1])
tensor([1, 1])
tensor([0, 0])
tensor([0, 0])
tensor([0, 0])
tensor([0, 0])
tensor([0, 0])


### Using ImageFolder

In [15]:
# Rebind `dataloader` to the ImageFolder dataset, with the same batching settings
# as for the custom dataset (batches of two, unshuffled, 4 workers).
dataloader = DataLoader(image_folder, batch_size=2, shuffle=False, num_workers=4)

In [16]:
# Each batch holds two elements; show the container type and the type of the first.
for batch_idx, batch in enumerate(dataloader):
    first_element = batch[0]
    print(batch_idx, type(batch), len(batch), type(first_element))

0 <class 'list'> 2 <class 'torch.Tensor'>
1 <class 'list'> 2 <class 'torch.Tensor'>
2 <class 'list'> 2 <class 'torch.Tensor'>
3 <class 'list'> 2 <class 'torch.Tensor'>
4 <class 'list'> 2 <class 'torch.Tensor'>
5 <class 'list'> 2 <class 'torch.Tensor'>
6 <class 'list'> 2 <class 'torch.Tensor'>
7 <class 'list'> 2 <class 'torch.Tensor'>
8 <class 'list'> 2 <class 'torch.Tensor'>
9 <class 'list'> 2 <class 'torch.Tensor'>
10 <class 'list'> 2 <class 'torch.Tensor'>
11 <class 'list'> 2 <class 'torch.Tensor'>
12 <class 'list'> 2 <class 'torch.Tensor'>
13 <class 'list'> 2 <class 'torch.Tensor'>
14 <class 'list'> 2 <class 'torch.Tensor'>


In [17]:
# Unpack each batch and report the types and shapes of both halves.
for images, targets in dataloader:
    print(f'{type(images)} {type(targets)} : {images.shape} {targets.shape}')

<class 'torch.Tensor'> <class 'torch.Tensor'> : torch.Size([2, 3, 224, 224]) torch.Size([2])
<class 'torch.Tensor'> <class 'torch.Tensor'> : torch.Size([2, 3, 224, 224]) torch.Size([2])
<class 'torch.Tensor'> <class 'torch.Tensor'> : torch.Size([2, 3, 224, 224]) torch.Size([2])
<class 'torch.Tensor'> <class 'torch.Tensor'> : torch.Size([2, 3, 224, 224]) torch.Size([2])
<class 'torch.Tensor'> <class 'torch.Tensor'> : torch.Size([2, 3, 224, 224]) torch.Size([2])
<class 'torch.Tensor'> <class 'torch.Tensor'> : torch.Size([2, 3, 224, 224]) torch.Size([2])
<class 'torch.Tensor'> <class 'torch.Tensor'> : torch.Size([2, 3, 224, 224]) torch.Size([2])
<class 'torch.Tensor'> <class 'torch.Tensor'> : torch.Size([2, 3, 224, 224]) torch.Size([2])
<class 'torch.Tensor'> <class 'torch.Tensor'> : torch.Size([2, 3, 224, 224]) torch.Size([2])
<class 'torch.Tensor'> <class 'torch.Tensor'> : torch.Size([2, 3, 224, 224]) torch.Size([2])
<class 'torch.Tensor'> <class 'torch.Tensor'> : torch.Size([2, 3, 224,

In [18]:
# Only the label half of each batch is of interest here.
for _images, targets in dataloader:
    print(targets)

tensor([0, 0])
tensor([0, 0])
tensor([0, 0])
tensor([0, 0])
tensor([0, 0])
tensor([1, 1])
tensor([1, 1])
tensor([1, 1])
tensor([1, 1])
tensor([1, 1])
tensor([2, 2])
tensor([2, 2])
tensor([2, 2])
tensor([2, 2])
tensor([2, 2])


### Loading tensors to device

In [19]:
import torch

# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'device = {device}')

# Move both halves of every batch to the selected device before using them.
for inputs, labels in dataloader:
    inputs = inputs.to(device)
    labels = labels.to(device)
    print(labels)

device = cuda
tensor([0, 0], device='cuda:0')
tensor([0, 0], device='cuda:0')
tensor([0, 0], device='cuda:0')
tensor([0, 0], device='cuda:0')
tensor([0, 0], device='cuda:0')
tensor([1, 1], device='cuda:0')
tensor([1, 1], device='cuda:0')
tensor([1, 1], device='cuda:0')
tensor([1, 1], device='cuda:0')
tensor([1, 1], device='cuda:0')
tensor([2, 2], device='cuda:0')
tensor([2, 2], device='cuda:0')
tensor([2, 2], device='cuda:0')
tensor([2, 2], device='cuda:0')
tensor([2, 2], device='cuda:0')


## Datasets

Here are a few datasets accessible through the PyTorch API that you may load.

### MNIST

A handwritten digit [database](http://yann.lecun.com/exdb/mnist/).

In [None]:
from torchvision import datasets

data_root = './output/mnist'
num_workers = 4
batch_size = 64

# ToTensor() is required for batching: without a transform the dataset yields
# PIL images, which the DataLoader's default collate function cannot stack.
dataset = datasets.MNIST(root=data_root, download=True, transform=transforms.ToTensor())
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)

### Fashion MNIST

A [database](https://github.com/zalandoresearch/fashion-mnist) of clothing article images.

In [None]:
# NOTE(review): the directory name is spelled 'fasionmnist'; left unchanged so an
# existing download in that location is still found.
data_root = './output/fasionmnist'
# ToTensor() is required for batching: without a transform the dataset yields
# PIL images, which the DataLoader's default collate function cannot stack.
dataset = datasets.FashionMNIST(root=data_root, download=True, transform=transforms.ToTensor())
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)

### KMNIST

Kuzushiji [database](https://github.com/rois-codh/kmnist) of classic Japanese characters.

In [None]:
data_root = './output/kmnist'
# ToTensor() is required for batching: without a transform the dataset yields
# PIL images, which the DataLoader's default collate function cannot stack.
dataset = datasets.KMNIST(root=data_root, download=True, transform=transforms.ToTensor())
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)

### EMNIST

A [database](https://www.nist.gov/node/1298471/emnist-dataset) of handwritten characters and digits.

In [None]:
data_root = './output/emnist'
# ToTensor() is required for batching: without a transform the dataset yields
# PIL images, which the DataLoader's default collate function cannot stack.
dataset = datasets.EMNIST(root=data_root, split='byclass', download=True, transform=transforms.ToTensor())
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)

### QMNIST

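A handwritten digit [database](https://github.com/facebookresearch/qmnist): an extended reconstruction of `MNIST`, rebuilt from the original NIST Special Database 19 (the same source that `EMNIST` is derived from).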

In [None]:
data_root = './output/qmnist'
# ToTensor() is required for batching: without a transform the dataset yields
# PIL images, which the DataLoader's default collate function cannot stack.
dataset = datasets.QMNIST(root=data_root, download=True, transform=transforms.ToTensor())
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)

### FakeData

A fake dataset that returns randomly generated images.

In [None]:
# ToTensor() is required for batching: without a transform FakeData yields
# PIL images, which the DataLoader's default collate function cannot stack.
dataset = datasets.FakeData(size=1000,
                            image_size=(3, 224, 224),
                            num_classes=10, transform=transforms.ToTensor(),
                            target_transform=None, random_offset=0)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)

### LSUN

A [dataset](https://github.com/fyu/lsun) of 10 scene categories and 20 object categories.

In [None]:
# NOTE(review): no download flag is passed here, so the LSUN files are presumably
# expected to already exist under data_root - confirm before running.
data_root = './output/lsun'
# Without a transform the dataset yields PIL images, which the DataLoader's
# default collate function cannot stack. Resize + CenterCrop give every sample
# the same shape (LSUN images are not guaranteed to share one size - TODO confirm),
# and ToTensor() converts the result to a tensor.
lsun_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
])
dataset = datasets.LSUN(root=data_root, classes='train', transform=lsun_transform)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)