## Data Loading Components 
PyTorch's data loading pipeline is built from three components:



1.   **Dataset** : Collection of data samples (index --> sample)
2.   **Sampler** : Specifies the data fetching order (iterates over data indices)
3.   **Dataloader** : Handles the data loading logic (collation into batches, single- or multi-process loading)

More details about the DataLoader are available [here](https://pytorch.org/docs/stable/data.html).





In [0]:
import torch
import torchvision
from sklearn.datasets import load_wine
from torch.utils.data import Dataset, DataLoader
import math

### If the dataset is a CSV file

In [0]:
#Dataset for CSV

class CsvDataset(Dataset):
    """Map-style dataset serving ``(features, target)`` pairs.

    The data comes from ``load_wine()``; features and targets are both kept
    in memory as ``float32`` tensors.

    Raises:
        ValueError: if the feature and target tensors differ in row count.
    """

    def __init__(self):
        # Pull the raw arrays out of the loaded bunch once.
        wine = load_wine()
        features = wine['data']
        targets = wine['target']

        self.x_data = torch.tensor(features, dtype=torch.float32)
        # NOTE(review): targets are stored as float32; confirm this matches
        # the loss function used downstream.
        self.y_data = torch.tensor(targets, dtype=torch.float32)

        # Every feature row must have exactly one matching target.
        n_feature_rows = self.x_data.size(0)
        n_target_rows = self.y_data.size(0)
        if n_feature_rows != n_target_rows:
            raise ValueError('x and y data shape mismatch')

        self.n_samples = len(features)

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair at ``index`` (enables ``dataset[i]``)."""
        sample = self.x_data[index]
        target = self.y_data[index]
        return sample, target

    def __len__(self):
        """Return the number of samples (enables ``len(dataset)``)."""
        return self.n_samples

# create dataset object
# NOTE: `train_dataset` is rebound by the later PdDataset and MNIST cells,
# so only the most recently executed of those cells determines its value.
train_dataset = CsvDataset()

### If the data is in a pandas DataFrame

In [0]:
import pandas as pd

# Pandas dataset 

class PdDataset(Dataset):
    """Map-style dataset whose features pass through a pandas DataFrame.

    The data comes from ``load_wine()``: the feature matrix is wrapped in a
    DataFrame (columns named after the feature names) and then converted to
    a ``float32`` tensor; the targets are converted to ``float32`` directly.

    Raises:
        ValueError: if the feature and target tensors differ in row count.
    """

    def __init__(self):
        wine = load_wine()
        # Build the frame first, as one would with real tabular data.
        frame = pd.DataFrame(data=wine['data'], columns=wine['feature_names'])

        self.x_data = torch.tensor(frame.values, dtype=torch.float32)
        # NOTE(review): targets are stored as float32; confirm this matches
        # the loss function used downstream.
        self.y_data = torch.tensor(wine['target'], dtype=torch.float32)

        # Every feature row must have exactly one matching target.
        n_feature_rows = self.x_data.size(0)
        n_target_rows = self.y_data.size(0)
        if n_feature_rows != n_target_rows:
            raise ValueError('x and y data shape mismatch')

        self.n_samples = len(frame)

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair at ``index`` (enables ``dataset[i]``)."""
        sample = self.x_data[index]
        target = self.y_data[index]
        return sample, target

    def __len__(self):
        """Return the number of samples (enables ``len(dataset)``)."""
        return self.n_samples

# create dataset object
# NOTE: this rebinds `train_dataset` (previously the CsvDataset instance);
# the MNIST cell further down rebinds it again.
train_dataset = PdDataset()

### If using a built-in torchvision dataset:
The full list of available datasets is [here](https://pytorch.org/docs/stable/torchvision/datasets.html).

In [4]:
# Built-in torchvision dataset: the MNIST training split.
# Files are stored under ./data (downloaded if requested and missing), and
# ToTensor() is applied to each image as it is fetched.
train_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    transform=torchvision.transforms.ToTensor(),
    download=True,
)

Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw
Processing...
Done!


In [0]:
# Wrap the dataset in a DataLoader: batches of 5 samples, order reshuffled
# every epoch, loading performed by 2 worker processes.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=5,
    shuffle=True,
    num_workers=2,
)

In [6]:
# make it iterable and pull a single batch
dataiter = iter(train_loader)
# Use the built-in next(): the Python-2-style `.next()` method is not part of
# the iterator protocol and is missing from DataLoader iterators in newer
# PyTorch releases, where calling it raises AttributeError.
trn = next(dataiter)
# Each batch is a (features, labels) pair.
features_tensor, labels_tensor = trn
print(features_tensor, labels_tensor)
# len() of a tensor is the size of its first dimension, i.e. the batch size.
print('Tensor : {}x{}'.format(len(features_tensor), len(labels_tensor)))
print('Shapes : {} {}'.format(features_tensor.shape, labels_tensor.shape))

tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
    

In [0]:

# demo training loop
# num_iter = ceil(total_samples / batch_size)
num_epochs = 2
# Read the batch size back from the loader instead of repeating the literal,
# so it cannot drift from the value the DataLoader was built with.
batch_sz = train_loader.batch_size  # One of the important hyper parameters.

# len(train_loader) is the number of batches per epoch, which is what the
# progress message needs as its step total.
num_iterations = len(train_loader)

for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        # Demo only: the batch is not used; a real loop would run the
        # forward pass, loss, backward pass and optimizer step here.
        if (i+1) % 10 == 0:
            print('Epoch: {}/{}, Step {}/{}'.format(epoch+1, num_epochs, i+1, num_iterations))
