## Data Loading Components 
Data loading in PyTorch is built from three components:



1.   **Dataset**: A collection of data samples (maps an index to a sample).
2.   **Sampler**: Specifies the order in which data is fetched (iterates over data indices).
3.   **DataLoader**: Handles the data loading logic (assembling batches, single- or multi-process loading).

More details about the DataLoader are available [here](https://pytorch.org/docs/stable/data.html).





In [0]:
import math
import os

import torch
import torchvision
from sklearn.datasets import load_wine
from torch.utils.data import Dataset, DataLoader

### If the dataset is a CSV file

In [0]:
#Dataset for CSV

class CsvDataset(Dataset):
    """Dataset over the arrays returned by ``load_wine()``.

    Features and targets are both stored as float32 tensors. Indexing yields
    a ``(features, target)`` pair and ``len()`` gives the number of samples.

    Raises:
        ValueError: if features and targets differ in their first dimension.
    """

    def __init__(self):
        # Fetch the raw arrays once and keep local handles to both parts.
        wine = load_wine()
        features, targets = wine['data'], wine['target']

        self.x_data = torch.tensor(features, dtype=torch.float32)
        self.y_data = torch.tensor(targets, dtype=torch.float32)
        print('x shape {}'.format(self.x_data.shape))

        # Every feature row needs exactly one target.
        n_rows, n_targets = self.x_data.shape[0], self.y_data.shape[0]
        if n_rows != n_targets:
            raise ValueError('x and y data shape mismatch')
        self.n_samples = len(features)

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        sample, target = self.x_data[index], self.y_data[index]
        return sample, target

    def __len__(self):
        """Return the number of samples recorded at construction time."""
        return self.n_samples

# create dataset object (the constructor loads the data and prints the feature tensor shape)
train_dataset = CsvDataset()


x shape torch.Size([178, 13])


### If the data is in a Pandas DataFrame

In [0]:
import pandas as pd

# Pandas dataset 

class PdDataset(Dataset):
    """Dataset built from a pandas DataFrame of the ``load_wine()`` features.

    The frame's values and the target array are both stored as float32
    tensors. Indexing yields a ``(features, target)`` pair and ``len()``
    gives the number of rows in the frame.

    Raises:
        ValueError: if features and targets differ in their first dimension.
    """

    def __init__(self):
        # Wrap the raw feature matrix in a DataFrame with named columns.
        wine = load_wine()
        frame = pd.DataFrame(data=wine['data'], columns=wine['feature_names'])

        self.x_data = torch.tensor(frame.values, dtype=torch.float32)
        self.y_data = torch.tensor(wine['target'], dtype=torch.float32)

        # Every feature row needs exactly one target.
        n_rows = self.x_data.shape[0]
        if n_rows != self.y_data.shape[0]:
            raise ValueError('x and y data shape mismatch')
        self.n_samples = frame.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        sample, target = self.x_data[index], self.y_data[index]
        return sample, target

    def __len__(self):
        """Return the number of samples recorded at construction time."""
        return self.n_samples

# create dataset object (rebinds `train_dataset`, replacing the dataset assigned by the earlier cell)
train_dataset = PdDataset()

### If using a toy torchvision dataset:
A long list of available datasets is [here](https://pytorch.org/docs/stable/torchvision/datasets.html).

In [0]:
# torchvision dataset: MNIST training split, stored under ./data.
# download=True lets torchvision fetch the files; ToTensor() is passed as the transform.
train_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=torchvision.transforms.ToTensor(),
)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw
Processing...



Done!




In [0]:
# Wrap the dataset in a DataLoader: shuffled batches of 5, loaded by 2 worker processes.
train_loader = DataLoader(
    train_dataset,
    batch_size=5,
    shuffle=True,
    num_workers=2,
)

In [0]:
# Pull a single batch out of the loader to inspect what it yields.
dataiter = iter(train_loader)
# Use the built-in next(): it works with any iterator, whereas the iterator's
# own .next() method is not available in current PyTorch releases.
trn = next(dataiter)
features_tensor, labels_tensor = trn
print(features_tensor, labels_tensor)
# Number of entries along the first (batch) dimension of each tensor.
print('Tensor : {}x{}'.format(len(features_tensor), len(labels_tensor)))
print('Shapes : {} {}'.format(features_tensor.shape, labels_tensor.shape))

tensor([[1.3170e+01, 2.5900e+00, 2.3700e+00, 2.0000e+01, 1.2000e+02, 1.6500e+00,
         6.8000e-01, 5.3000e-01, 1.4600e+00, 9.3000e+00, 6.0000e-01, 1.6200e+00,
         8.4000e+02],
        [1.3880e+01, 5.0400e+00, 2.2300e+00, 2.0000e+01, 8.0000e+01, 9.8000e-01,
         3.4000e-01, 4.0000e-01, 6.8000e-01, 4.9000e+00, 5.8000e-01, 1.3300e+00,
         4.1500e+02],
        [1.2040e+01, 4.3000e+00, 2.3800e+00, 2.2000e+01, 8.0000e+01, 2.1000e+00,
         1.7500e+00, 4.2000e-01, 1.3500e+00, 2.6000e+00, 7.9000e-01, 2.5700e+00,
         5.8000e+02],
        [1.4390e+01, 1.8700e+00, 2.4500e+00, 1.4600e+01, 9.6000e+01, 2.5000e+00,
         2.5200e+00, 3.0000e-01, 1.9800e+00, 5.2500e+00, 1.0200e+00, 3.5800e+00,
         1.2900e+03],
        [1.1650e+01, 1.6700e+00, 2.6200e+00, 2.6000e+01, 8.8000e+01, 1.9200e+00,
         1.6100e+00, 4.0000e-01, 1.3400e+00, 2.6000e+00, 1.3600e+00, 3.2100e+00,
         5.6200e+02]]) tensor([2., 2., 1., 0., 1.])
Tensor : 5x5
Shapes : torch.Size([5, 13]) torch.Si

In [0]:

# demo training loop: iterates over the loader and only logs progress
# num_iter = total_sample / batch_size (rounded up)
num_epochs = 2
log_every = 10  # print a progress line every `log_every` steps

# Read the batch size from the loader instead of repeating the literal, so the
# reported step total stays correct if the loader is rebuilt with another size.
batch_sz = train_loader.batch_size  # One of the important hyper parameter.

# len(train_loader) is the number of batches produced per epoch.
num_iterations = len(train_loader)

for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        # a real training step (forward / loss / backward / optimizer) would go here
        if (i+1) % log_every == 0:
            print('Epoch: {}/{}, Step {}/{}'.format(epoch+1, num_epochs, i+1, num_iterations))


## Custom dataset:

With the following directory structure:

```
 dataset --|
            train --|
                    |- Real --|
                              | ...
                    |- Fake --|
                              | ...
            test -- |
                    |- Real --|
                              | ...
                    |- Fake --|
                              | ...
```


In [0]:
class CustomDataset(Dataset):
    """Image dataset read from a ``<root>/<split>/<class>/<image>`` folder tree.

    Every sub-directory of the chosen split is treated as one class; class
    indices follow the sorted order of the directory names. Only files whose
    extension is .jpg, .jpeg or .png (case-insensitive) are used.

    Args:
        root: Directory containing the "train" and "test" folders.
        train: Read from "train" when True, otherwise from "test".
        transform: Optional callable applied to each loaded image.
    """

    # Lower-case file extensions that count as images.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

    def __init__(self, root, train=True, transform=None):
        self.path = os.path.join(root, "train" if train else "test")
        self.transform = transform

        # Keep only real directories so a stray file next to the class folders
        # does not make os.listdir() fail below.
        dirs = sorted(name for name in os.listdir(self.path)
                      if os.path.isdir(os.path.join(self.path, name)))
        self.dirs_full_path = [os.path.join(self.path, name) for name in dirs]

        self.images = []
        self.labels = []
        for label, class_dir in enumerate(self.dirs_full_path):
            # Sorted so the sample order is reproducible; os.listdir() order is arbitrary.
            image_names = [name for name in sorted(os.listdir(class_dir))
                           if os.path.splitext(name)[1].lower() in self.IMAGE_EXTENSIONS]
            self.images += [os.path.join(class_dir, name) for name in image_names]
            # One label per *kept* image. Labelling every directory entry would
            # leave images and labels out of step whenever a non-image file is present.
            self.labels += [label] * len(image_names)

        print('img_full_pathname {}'.format(self.images))
        print('labels {}'.format(self.labels))

        self.n_samples = 0
        # sanity checks
        if len(self.images) == len(self.labels):
            print('Data intigrity is Good')
            self.n_samples = len(self.images)
        else:
            print('Data intigrity failed')

    def __getitem__(self, index):
        """Return ``(image, label)`` for ``index``; the label is a float32 tensor."""
        # NOTE(review): `io` is not imported in any visible cell; `imread` suggests an
        # image I/O module such as skimage.io -- confirm and import it at the top.
        image = io.imread(self.images[index])
        y_data = torch.tensor(self.labels[index], dtype=torch.float32)

        if self.transform:
            image = self.transform(image)

        return image, y_data

    def __len__(self):
        """Return the number of usable samples (0 if the integrity check failed)."""
        return self.n_samples



In [0]:
# Load Data
# Use the CustomDataset class defined in this notebook (no `FFTDataset` is defined
# here), and reach ToTensor through the already-imported `torchvision` package.
train_dataset = CustomDataset(root='/content/dataset',
                              train=True,
                              transform=torchvision.transforms.ToTensor())

test_dataset = CustomDataset(root='/content/dataset',
                             train=False,
                             transform=torchvision.transforms.ToTensor())


# DataLoader: shuffle only the training split; the test split is read in order.
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=4,
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=4,
                                          shuffle=False)