In [None]:
import torch
import matplotlib.pyplot as plt
import numpy as np

import random

In this note you will learn about Dataloaders. This is what we will use to manage our datasets when training a model.

# Toy example - Data Loader from scratch

For now we will consider a toy example that creates data for Linear Regression. We will create a `DataModule` class that will call our dataloader for train and validation with the methods `train_dataloader` and `val_dataloader`.

In [None]:
class DataModule:
    """Base class that hands out train and validation dataloaders.

    Subclasses implement `get_dataloader`; the two convenience methods below
    only decide which split is requested.
    """

    def __init__(self, batch_size=64):
        # number of samples per batch
        self.batch_size = batch_size

    def get_dataloader(self, train):
        """Return the dataloader for the split selected by the boolean `train`.

        Must be overridden by subclasses; the base class only raises.
        """
        raise NotImplementedError()

    def train_dataloader(self):
        """Return the training dataloader (`train=True`)."""
        return self.get_dataloader(train=True)

    def val_dataloader(self):
        """Return the validation dataloader (`train=False`)."""
        return self.get_dataloader(train=False)

Now we want to create a new class `ExampleData` that inherits from `DataModule`. We want this class to return a dataset for Linear Regression. So, we will provide the hyperparameters `w`, `b` and `sigma`. Based on these, we will produce a set of random values $X$ and $y = Xw + b$ with additional Gaussian noise.

1. Create `self.X` by sampling from a normal distribution with zero mean and variance 1. Create `self.y` with $y = Xw + b + \textrm{noise}$
2. Implement the dataloader method. This method takes as input the boolean variable `train`. When set to `True` we should return the training dataset. When set to `False` we should return the validation dataset.

In [None]:
# Synthetic linear-regression dataset (exercise scaffold): the lines marked
# "Exercise" are intentionally left blank and must be completed by the reader.
class ExampleData(DataModule):

    # w, b: parameters of the linear model used to generate y
    # sigma: controls the scale of the additive Gaussian noise
    # n_train_samples / n_val_samples: sizes of the two splits
    # *args, **kwargs: forwarded to DataModule.__init__ (e.g. batch_size)
    def __init__(self, w, b, sigma=0.1, n_train_samples=500, n_val_samples=100, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Saving hyperparameters
        self.n_train_samples = n_train_samples # number of train samples
        self.n_val_samples = n_val_samples # number of validation samples
        self.n_total = n_train_samples + n_val_samples # total number of samples

        # One noise value per sample, stored as a column of shape (n_total, 1).
        # NOTE(review): the standard-normal draw is scaled by sigma**2, so the
        # noise standard deviation is sigma**2 rather than sigma - confirm
        # this is the intended meaning of `sigma`.
        noise = torch.randn(self.n_total, 1) * sigma**2

        # Exercise - Create self.X and self.y
        # Hint: noise is a column of shape (n_total, 1); make sure X @ w is a
        # column of the same shape before adding b and noise.

    # Generator over mini-batches of the selected split: yields one
    # (X_batch, y_batch) pair per step; the last batch may hold fewer than
    # batch_size samples.
    def get_dataloader(self, train):
        if train:
            # Exercise - set indices to the first n_train_samples indices, shuffled
            indices =
            pass
        else:
            # Exercise - set indices to the range from self.n_train_samples to the end
            indices =

        # return batches of data
        for i in range(0, len(indices), self.batch_size):
            # slice of at most batch_size indices, converted to a tensor for indexing
            batch_indices = torch.tensor(indices[i: i+self.batch_size])
            yield self.X[batch_indices], self.y[batch_indices]

We will now test your class with $w = [1,2,3]$ and $b = 5$

In [None]:
# Ground-truth model w = [1, 2, 3], b = 5, served in batches of 32 samples.
data = ExampleData(
    w=torch.tensor([1.0, 2.0, 3.0]),
    b=5.0,
    batch_size=32,
)

You can get the next batch from the dataloader as

In [None]:
# Build a fresh training dataloader and take only its first batch.
train_loader = data.train_dataloader()
X, y = next(iter(train_loader))

# Summary of the whole batch.
print('One batch of data')
print('Shape of X: ', X.shape, 'Shape of y: ', y.shape)

# First sample of that batch.
print('First item returned by our dataloader')
print('X:', X[0], '\ny:', y[0])

Try this with the validation dataloader

In [None]:
# Exercise


# DataLoader for FashionMNIST

The implementation of the DataLoader above was purely didactic. PyTorch has a `DataLoader` class that we will use instead. We will test this with the FashionMNIST dataset. Below you can see how to load the train data.

In [None]:
import torchvision
from torchvision import datasets, transforms

# Preprocessing applied to every image: convert it to a tensor, then normalize
# its single channel with mean 0.5 and std 0.5.
# Do not worry too much about this part.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]
)

# Training split of FashionMNIST, stored under ./data.
train_data = datasets.FashionMNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)

Now we can pass the `train_data` to the `DataLoader`. We define the `batch_size` and set `shuffle` to `True`, so that each time that we loop over the dataloader the samples will be shuffled.

In [None]:
# Wrap the training set in PyTorch's DataLoader: batches of 8 samples,
# reshuffled each time we loop over the dataloader.
dataloader = torch.utils.data.DataLoader(
    train_data,
    batch_size=8,
    shuffle=True,
)

We can iterate over the dataloader in the same way. It will return `X` and `y` just as our implementation above. Try it and look at the shapes of X and y.

In [None]:
# Exercise


Our data now consists of images. The FashionMNIST dataset contains 28×28 greyscale images of different clothes. In the shape of `X`, 8 is the number of samples (images) in our batch, 1 is the number of color channels, and 28×28 is the size of each image. Plot the first image of the dataset with `matplotlib.pyplot.imshow` with `cmap='Greys_r'`

In [None]:
# Exercise


The labels on our dataset are just numbers, what do they actually mean? See [this](https://github.com/zalandoresearch/fashion-mnist) for the label coding. Create a function that gives you the text labels when given the numeric indices in y.

In [None]:
def text_classes(indices):
  # Exercise - return the text labels for the numeric class indices in `indices`
  # (the label coding is given in the FashionMNIST repository linked in the text above)



# Try the function on the labels `y` of a batch loaded in an earlier cell
text_classes(y)

We are finally ready to create our `FashionMNIST` class.

In [None]:
class FashionMNIST(DataModule):

    def __init__(self, root, *args, **kwargs):
        super().__init__(*args, **kwargs)

        transform = transforms.Compose([transforms.ToTensor(),
                                        transforms.Normalize((0.5,), (0.5,))])

        # Exercise - Load the FashionMNIST train and validation dataset (do not create the dataloader here)
        self.train =
        self.val =

    def text_classes(self, indices):
        # Exercise - just copy your text_classes function


    def get_dataloader(self, train):
        # Exercise return train or validation dataloader according to the boolean variable train.
        # Remember that you can now use the DataLoader from PyTorch


    # ----------------------------------------------------------------------- #
    # do not change below
    # ----------------------------------------------------------------------- #
    def visualize(self, X, y, nrows=1, ncols=5):
        # this is already provided for visualization. it resembles the plot you created above
        labels = self.text_classes(y)
        self.show_images(X.squeeze(1), nrows, ncols, titles=labels)

    def show_images(self, imgs, num_rows, num_cols, titles, scale=1.5):
        # this is already provided for visualization
        figsize = (num_cols * scale, num_rows * scale)
        _, axes = plt.subplots(num_rows, num_cols, figsize=figsize)
        axes = axes.flatten()
        for i, (ax, img) in enumerate(zip(axes, imgs)):
            img = img.squeeze().numpy()
            ax.imshow(img, cmap='Greys_r')
            ax.axes.get_xaxis().set_visible(False)
            ax.axes.get_yaxis().set_visible(False)
            ax.set_title(titles[i])
        return axes

Now let's look at our new dataloader. If everything is correct, your code should pass all the asserts.

In [None]:
data = FashionMNIST(root='./data', batch_size=64)

# Training split: one batch of 64 single-channel 28x28 images and 64 labels.
X, y = next(iter(data.get_dataloader(train=True)))
assert tuple(X.shape) == (64, 1, 28, 28)
assert tuple(y.shape) == (64,)

# Validation split: same batch layout expected.
X, y = next(iter(data.get_dataloader(train=False)))
assert tuple(X.shape) == (64, 1, 28, 28)
assert tuple(y.shape) == (64,)

Use the `visualize` method to plot some images

In [None]:
# Exercise


Our goal is to classify each of these images with the correct label from the 10 possible classes. In the next notebook we will build our model to do so.