### Data Loader and Batches

For every neural network (e.g. an MLP), the very first step is to load the available data and split it into batches.  
In machine-learning terms this is referred to as data preprocessing and splitting.

#### Scenario-1: Data is in CSV format  
`For multiclass classification and regression problems/data`

In [None]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

In [None]:

# Step-1 loading data from the csv file. and splitting it
def load_data(pth, target='DAX', drop_cols=('date',), test_size=0.1, random_state=0):
    """Read a CSV file and return a train/test split as float32 tensors.

    Args:
        pth: Path of the CSV file, passed to ``pd.read_csv``.
        target: Name of the column used as the label ``y``.
        drop_cols: Column name(s) removed before the features are taken
            (the feature-engineering step: dropping unused columns).
        test_size: Fraction of rows held out for the test split.
        random_state: Seed handed to ``train_test_split`` so the split is
            reproducible.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)`` of ``torch`` tensors,
        all cast to float32.
    """
    data = pd.read_csv(pth)

    # Accept a single column name as well as a sequence of names.
    unwanted = [drop_cols] if isinstance(drop_cols, str) else list(drop_cols)
    data = data.drop(columns=unwanted)

    y = data[target].values  # target column
    x = data.drop(columns=[target]).values  # every remaining column is a feature

    parts = train_test_split(x, y, test_size=test_size, random_state=random_state)

    # numpy arrays -> float32 tensors; train_test_split already returns the
    # pieces in the order x_train, x_test, y_train, y_test.
    x_train, x_test, y_train, y_test = (
        T.tensor(part.astype(np.float32)) for part in parts
    )
    return x_train, x_test, y_train, y_test

In [None]:
# Step-2 Create batches
def to_batches(x_train, x_test, y_train, y_test, batch_size):
    """Shuffle both splits and stack them into fixed-size batches.

    Each split is permuted with ``np.random.permutation``, trimmed to a whole
    number of batches (leftover rows are dropped, e.g. 11 rows with a batch
    size of 3 give 3 batches) and reshaped to
    ``(n_batches, batch_size, n_features)`` for the features and
    ``(n_batches, batch_size, 1)`` for the targets.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)`` of batched tensors.
    """

    def _shuffle_and_stack(features, targets):
        # Features and targets share one permutation so rows stay aligned.
        n_rows = features.shape[0]
        n_full = n_rows // batch_size
        order = np.random.permutation(n_rows)
        keep = batch_size * n_full

        stacked_x = features[order][:keep].reshape(n_full, batch_size, features.shape[1])
        stacked_y = targets[order][:keep].reshape(n_full, batch_size, 1)
        return stacked_x, stacked_y

    # Train is processed before test, so the random draws happen in that order.
    train_x, train_y = _shuffle_and_stack(x_train, y_train)
    test_x, test_y = _shuffle_and_stack(x_test, y_test)

    return train_x, test_x, train_y, test_y

# The split tensors have to exist before they can be batched, so load them first.
# `csv_path` is a placeholder: point it at the CSV file that load_data should read.
csv_path = ''
x_train, x_test, y_train, y_test = load_data(csv_path)
x_train_batches, x_test_batches, y_train_batches, y_test_batches = to_batches(x_train, x_test, y_train, y_test, 10)

So, the code above is one way to create data batches for our neural networks.  
But we hard-coded a lot of things.  
There is another way to do it:  
using a custom `Dataset` together with PyTorch's `DataLoader` to create the batches.

In [None]:
## Creating Custom Dataloader for pytorch DataLoader to create Batches.
import torch as T
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):   # Dataset in CustomDataset is from torch.utils.data
    """Map-style dataset over the rows of a CSV file.

    Args:
        data_path: Path of the CSV file, passed to ``pd.read_csv``.
        target_col: Name of the label column *after* header clean-up
            (quotes and spaces removed). Every other column is a feature.

    Indexing returns ``(features_row, label)`` taken from the underlying
    numpy arrays; ``len()`` returns the number of rows.
    """

    def __init__(self, data_path, target_col='target'):
        frame = pd.read_csv(data_path)
        # Strip double quotes and spaces from the header names.
        frame.columns = [name.replace('"', '').replace(' ', '') for name in frame.columns]
        # Do the F.E like dropping, adding columns
        self.x = frame.drop(target_col, axis=1).values
        self.y = frame[target_col].values
        self.n_samples = self.x.shape[0]

    def __len__(self):
        return self.n_samples

    def __getitem__(self, index):
        return self.x[index], self.y[index]

pth = ''  # placeholder: path of the CSV file to load
dataset = CustomDataset(data_path=pth)
data = DataLoader(dataset=dataset, batch_size=10, shuffle=True)
# A DataLoader is an iterable, not a sequence, so `data[0:20]` raises TypeError.
# Materialise one shuffled pass into a list of batches and slice that instead.
batches = list(data)
trainset = batches[0:20]
testset = batches[20:]  # Depends on no.of batches
# every entry of trainset / testset is one (features, labels) batch

### Image data.  
What if we got data in the form of images.

If you need to load an image dataset, it's more convenient to use the `ImageFolder` class from the `torchvision.datasets` module.

To do so, you need to structure your data as follows:

```
root
|_class1
    |_xxx.png
|_class2
    |_xxx.png
```

that means that each class has its own directory.

With this structure, the name of each class is taken from the name of its folder!

In [None]:
from torchvision import datasets, transforms
root_dir = r'C:\Users\rnr31\Documents\GitHub\Data_Science_2022\03.Deep Learning\04. DataLoader\image_data'

# Training pipeline: resize, random augmentation (rotation, resized crop,
# horizontal flip), conversion to a tensor, then per-channel normalisation
# with mean 0.5 and std 0.5.
train_transforms = transforms.Compose([
    transforms.Resize(255),
    transforms.RandomRotation(30),
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Test pipeline: no random augmentation, only resize + centre crop before the
# same tensor conversion and normalisation.
test_transforms = transforms.Compose([
    transforms.Resize(255),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Pass transforms in here, then run the next cell to see how the transforms look
train_data = datasets.ImageFolder(root=f'{root_dir}/train', transform=train_transforms)
test_data = datasets.ImageFolder(root=f'{root_dir}/test', transform=test_transforms)

In [None]:
from torch.utils.data import DataLoader
# Wrap each image dataset in a loader that yields shuffled batches of two.
# Batch size depends on the count of total images
train_loader, test_loader = (
    DataLoader(dataset=split, batch_size=2, shuffle=True)
    for split in (train_data, test_data)
)

In [None]:
def imshow(image, ax=None, title=None, normalize=False, mean=None, std=None):
    """Imshow for Tensor.

    Args:
        image: Channel-first image tensor; it is converted with ``.numpy()``
            and transposed to channel-last before plotting.
        ax: Axes to draw on. A new figure/axes pair is created when omitted.
        title: Optional title set on the axes (previously accepted but ignored).
        normalize: When true, undo a per-channel normalisation with
            ``image * std + mean`` and clip the result to ``[0, 1]``.
        mean: Per-channel mean used for the un-normalisation. Defaults to
            ``[0.485, 0.456, 0.406]``.
        std: Per-channel std used for the un-normalisation. Defaults to
            ``[0.229, 0.224, 0.225]``.

    Returns:
        The axes the image was drawn on.

    Note:
        ``mean``/``std`` must match the values given to the ``Normalize``
        transform that produced ``image``; pass them explicitly when the data
        was not normalised with the default statistics above.
    """
    if ax is None:
        _, ax = plt.subplots()

    # (C, H, W) -> (H, W, C), the layout expected by ``ax.imshow``.
    image = image.numpy().transpose((1, 2, 0))

    if normalize:
        mean = np.array([0.485, 0.456, 0.406]) if mean is None else np.asarray(mean)
        std = np.array([0.229, 0.224, 0.225]) if std is None else np.asarray(std)
        image = std * image + mean
        image = np.clip(image, 0, 1)

    ax.imshow(image)
    if title is not None:
        ax.set_title(title)

    # Hide the frame, tick marks and tick labels so only the picture is shown.
    for side in ('top', 'right', 'left', 'bottom'):
        ax.spines[side].set_visible(False)
    ax.tick_params(axis='both', length=0)
    ax.set_xticklabels('')
    ax.set_yticklabels('')

    return ax

In [None]:
#YOUR CODE HERE

# Run this to test your data loaders
images, labels = next(iter(train_loader))
# Index 9 is out of range for the small batches produced by train_loader
# (batch_size=2), so show the first sample of the batch instead.
sample_idx = 0
imshow(images[sample_idx], normalize=False)
labels[sample_idx]

# Loading Dataset from Pytorch  
Refer to MNIST.

### Image Data in CSV format
What if the paths and labels of the images are available in CSV format?

In [2]:
# For example if our CSV file looks like this
import pandas as pd
# Two example rows, each holding an image path and its integer label.
train = pd.DataFrame(
    [
        ("my_dataset/image1.png", 0),
        ("my_dataset/image2.png", 1),
    ],
    columns=["path", "label"],
)
train

Unnamed: 0,path,label
0,my_dataset/image1.png,0
1,my_dataset/image2.png,1


We can create a custom `Dataset` from the CSV file and pass it to `DataLoader` to create batches.

In [None]:
from PIL import Image
class CustomDataset(Dataset):
    """Image dataset described by a CSV file with ``path`` and ``label`` columns.

    Args:
        csv_file: Path of the CSV file, passed to ``pd.read_csv``.
        transform: Optional callable applied to each loaded PIL image before
            it is returned.
    """

    def __init__(self, csv_file, transform=None):
        df = pd.read_csv(csv_file)
        self.paths = df["path"].values
        self.labels = df["label"].values
        # Store the transform: __getitem__ reads self.transform, and without
        # this assignment every lookup raised AttributeError.
        self.transform = transform

    def __getitem__(self, index):
        # Supports dataset[index]: returns the index-th sample as an
        # (image, label) pair. Batching is done later by the DataLoader.
        img = Image.open(self.paths[index]).convert("RGB")
        if self.transform is not None:
            img = self.transform(img)
        return img, self.labels[index]

    def __len__(self):
        # to retrieve the total samples by doing len(dataset)
        return len(self.paths)