## Data Loading Components 
PyTorch data loading is built from three components:



1.   **Dataset** : Collection of data samples (index --> sample)
2.   **Sampler** : Specifies the data fetching order (iterates over data indices)
3.   **Dataloader** : Handle data loading logic (collection of batches, single or multi-process loading.)

More details about data loading are available [here](https://pytorch.org/docs/stable/data.html)





In [0]:
import torch
import torchvision
from sklearn.datasets import load_wine
from torch.utils.data import Dataset, DataLoader
import math

### If the dataset is a CSV file

In [0]:
#Dataset for CSV

class CsvDataset(Dataset):
    """Map-style dataset over the wine data returned by ``load_wine()``.

    Features and targets are both held in memory as ``torch.float32`` tensors.
    Indexing yields a ``(features, target)`` pair and ``len()`` reports the
    number of samples.

    NOTE(review): despite the class name, no CSV file is read here; the data
    comes straight from ``load_wine()``.
    """

    def __init__(self):
        """Load the wine data and convert it to tensors.

        Raises:
            ValueError: if the feature and target tensors disagree on the
                number of rows.
        """
        wine = load_wine()

        features = torch.tensor(wine['data'], dtype=torch.float32)
        targets = torch.tensor(wine['target'], dtype=torch.float32)

        # Every feature row needs exactly one target.
        if features.shape[0] != targets.shape[0]:
            raise ValueError('x and y data shape mismatch')

        self.x_data = features
        self.y_data = targets
        self.n_samples = len(wine['data'])

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        sample = self.x_data[index]
        label = self.y_data[index]
        return sample, label

    def __len__(self):
        """Return the number of samples, so ``len(dataset)`` works."""
        return self.n_samples

# Create the dataset object. The constructor takes no arguments; it loads the
# wine data itself via load_wine().
train_dataset = CsvDataset()

### If the data is in a pandas DataFrame

In [0]:
import pandas as pd

# Pandas dataset 

class PdDataset(Dataset):
    """Map-style dataset whose features pass through a pandas DataFrame.

    The wine data from ``load_wine()`` is first placed in a DataFrame (one
    column per feature name) and then converted to a ``torch.float32`` tensor.
    Targets are converted to ``torch.float32`` directly from the raw data.
    Indexing yields a ``(features, target)`` pair.
    """

    def __init__(self):
        """Build the DataFrame and derive the feature and target tensors.

        Raises:
            ValueError: if the feature and target tensors disagree on the
                number of rows.
        """
        wine = load_wine()
        frame = pd.DataFrame(wine['data'], columns=wine['feature_names'])

        features = torch.tensor(frame.values, dtype=torch.float32)
        targets = torch.tensor(wine['target'], dtype=torch.float32)

        # Every feature row needs exactly one target.
        if features.shape[0] != targets.shape[0]:
            raise ValueError('x and y data shape mismatch')

        self.x_data = features
        self.y_data = targets
        self.n_samples = frame.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        sample = self.x_data[index]
        label = self.y_data[index]
        return sample, label

    def __len__(self):
        """Return the number of samples, so ``len(dataset)`` works."""
        return self.n_samples

# Create the dataset object. This rebinds `train_dataset`, so it replaces any
# dataset previously stored under that name.
train_dataset = PdDataset()

### If using one of the toy torchvision datasets
A long list of available datasets is [here](https://pytorch.org/docs/stable/torchvision/datasets.html)

In [0]:
# torchvision datasets
# Training split of MNIST, rooted at ./data, with ToTensor() applied as the
# transform; download=True is passed so the files can be fetched if needed.
# NOTE(review): this rebinds `train_dataset`. If this cell is run, the
# DataLoader cells below operate on MNIST, yet the saved outputs further down
# (13 features per sample, 36 steps per epoch) look like they came from one of
# the wine datasets -- confirm which dataset is intended for a full re-run.

train_dataset = torchvision.datasets.MNIST(root='./data', 
                                           train=True, 
                                           transform=torchvision.transforms.ToTensor(),  
                                           download=True)

In [0]:
# handle whole dataset with DataLoader
# Wraps whichever dataset `train_dataset` currently refers to: batches of 5
# samples, shuffling enabled, and 2 worker processes for loading.
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=5,
                          shuffle=True,
                          num_workers=2)

In [4]:
# make it iterable
# Pull a single batch out of the loader to inspect what it yields.
dataiter = iter(train_loader)
# Use the built-in next(): the iterator's Python-2-style .next() method is not
# available on DataLoader iterators in current PyTorch releases, while next()
# works on every version.
trn = next(dataiter)
# Each batch is a (features, labels) pair.
features_tensor, labels_tensor = trn
print(features_tensor, labels_tensor)
print('Tensor : {}x{}'.format(len(features_tensor), len(labels_tensor)))
print('Shapes : {} {}'.format(features_tensor.shape, labels_tensor.shape))

tensor([[1.4390e+01, 1.8700e+00, 2.4500e+00, 1.4600e+01, 9.6000e+01, 2.5000e+00,
         2.5200e+00, 3.0000e-01, 1.9800e+00, 5.2500e+00, 1.0200e+00, 3.5800e+00,
         1.2900e+03],
        [1.3280e+01, 1.6400e+00, 2.8400e+00, 1.5500e+01, 1.1000e+02, 2.6000e+00,
         2.6800e+00, 3.4000e-01, 1.3600e+00, 4.6000e+00, 1.0900e+00, 2.7800e+00,
         8.8000e+02],
        [1.4100e+01, 2.1600e+00, 2.3000e+00, 1.8000e+01, 1.0500e+02, 2.9500e+00,
         3.3200e+00, 2.2000e-01, 2.3800e+00, 5.7500e+00, 1.2500e+00, 3.1700e+00,
         1.5100e+03],
        [1.3860e+01, 1.3500e+00, 2.2700e+00, 1.6000e+01, 9.8000e+01, 2.9800e+00,
         3.1500e+00, 2.2000e-01, 1.8500e+00, 7.2200e+00, 1.0100e+00, 3.5500e+00,
         1.0450e+03],
        [1.2850e+01, 3.2700e+00, 2.5800e+00, 2.2000e+01, 1.0600e+02, 1.6500e+00,
         6.0000e-01, 6.0000e-01, 9.6000e-01, 5.5800e+00, 8.7000e-01, 2.1100e+00,
         5.7000e+02]]) tensor([0., 0., 0., 0., 2.])
Tensor : 5x5
Shapes : torch.Size([5, 13]) torch.Si

In [5]:

# demo training loop
# One epoch is one full pass over train_loader; the number of steps per epoch
# is the number of batches the loader yields.
num_epochs = 2
# Read the batch size back from the loader instead of repeating the literal,
# so it cannot drift from the DataLoader's actual configuration.
batch_sz = train_loader.batch_size # One of the important hyper parameter.

# len(train_loader) is the number of batches per epoch. Asking the loader
# directly keeps the step total correct even if `train_dataset` has been
# rebound since the loader was built.
num_iterations = len(train_loader)

for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        # No model is trained in this demo; only progress is reported,
        # once every 10 steps.
        if (i+1) % 10 == 0:
            print('Epoch: {}/{}, Step {}/{}'.format(epoch+1, num_epochs, i+1, num_iterations))


Epoch: 1/2, Step 10/36
Epoch: 1/2, Step 20/36
Epoch: 1/2, Step 30/36
Epoch: 2/2, Step 10/36
Epoch: 2/2, Step 20/36
Epoch: 2/2, Step 30/36
