<a href="https://colab.research.google.com/github/pankajr141/experiments/blob/master/Reasoning/deeplearning/custom_dataloader_in_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Objective
The input data will be stored in multiple pickle files. 

<pre>
x_0.pkl  - contains the X (input) samples of the 1st pickle pair
y_0.pkl  - contains the Y (output) samples of the 1st pickle pair
</pre>

Each file holds one array of shape (N, T, F) <br>
<pre>
N, T, F 
N - number of samples in the file
T - number of timesteps per sample
F - number of features per timestep
</pre>

<p>
We will create two dataloaders, one for train and one for test. Each dataloader reads from the X and Y pickle files in such a way that it behaves exactly as if the whole dataset were in memory.
This means that when pulling a batch of size 8, each sample can come from a different pickle file.

This approach lets a very large dataset be used like a normal in-memory dataset, provided there is enough disk space. The trade-off is that each pass over the data takes significantly longer, depending on the I/O speed of the underlying hardware.
</p>

Only the index map, which maps each dataset index to a pickle file and a position inside that file, is kept in memory.

### Dataset
Let's create K pickle files for X and K for Y, each containing N records.

In [45]:
K = 53     # number of X/Y pickle file pairs to generate
N = 2000   # number of records stored in each pickle file
Tx = 11    # size of the time dimension of each X sample
Ty = 3     # size of the time dimension of each Y sample
F = 6      # number of features per timestep (shared by X and Y)

In [46]:
import os
import pickle
import shutil
import numpy as np

# Create a clean directory where the pickle files will be stored.
# The removal is guarded so the cell also works on a fresh kernel/checkout,
# where the directory does not exist yet (an unguarded rmtree raises
# FileNotFoundError and breaks "Restart & Run All").
dataset_dir = "dataset"
if os.path.isdir(dataset_dir):
  shutil.rmtree(dataset_dir)
os.makedirs(dataset_dir)

# Write K file pairs: x_<i>.pkl holds an (N, Tx, F) array and y_<i>.pkl holds
# the matching (N, Ty, F) array. Values are random integers in [1, 100).
for i in range(K):
  data_x = np.random.randint(1, 100, size=(N, Tx, F))
  data_y = np.random.randint(1, 100, size=(N, Ty, F))

  for prefix, data in (("x", data_x), ("y", data_y)):
    pickle_file = os.path.join(dataset_dir, f"{prefix}_{i}.pkl")
    with open(pickle_file, 'wb') as f:
      pickle.dump(data, f)

Now that the dataset is saved in dataset_dir, let's check the files.

In [47]:
# Collect the X pickle files only; the Y files are not needed here because the
# dataset class derives each Y path from its X path.
# The filter looks at the file name (not the whole path) so that a directory
# whose name happens to contain "x_" cannot let Y files slip through.
picklefiles = [
    os.path.join(dataset_dir, name)
    for name in os.listdir(dataset_dir)
    if name.startswith("x_") and name.endswith(".pkl")
]
print(picklefiles)

['dataset/x_47.pkl', 'dataset/x_5.pkl', 'dataset/x_22.pkl', 'dataset/x_20.pkl', 'dataset/x_42.pkl', 'dataset/x_39.pkl', 'dataset/x_46.pkl', 'dataset/x_25.pkl', 'dataset/x_34.pkl', 'dataset/x_29.pkl', 'dataset/x_11.pkl', 'dataset/x_23.pkl', 'dataset/x_18.pkl', 'dataset/x_50.pkl', 'dataset/x_24.pkl', 'dataset/x_45.pkl', 'dataset/x_10.pkl', 'dataset/x_7.pkl', 'dataset/x_2.pkl', 'dataset/x_33.pkl', 'dataset/x_26.pkl', 'dataset/x_19.pkl', 'dataset/x_40.pkl', 'dataset/x_13.pkl', 'dataset/x_27.pkl', 'dataset/x_43.pkl', 'dataset/x_30.pkl', 'dataset/x_3.pkl', 'dataset/x_21.pkl', 'dataset/x_41.pkl', 'dataset/x_8.pkl', 'dataset/x_49.pkl', 'dataset/x_51.pkl', 'dataset/x_14.pkl', 'dataset/x_28.pkl', 'dataset/x_6.pkl', 'dataset/x_31.pkl', 'dataset/x_35.pkl', 'dataset/x_9.pkl', 'dataset/x_48.pkl', 'dataset/x_0.pkl', 'dataset/x_1.pkl', 'dataset/x_44.pkl', 'dataset/x_32.pkl', 'dataset/x_4.pkl', 'dataset/x_15.pkl', 'dataset/x_52.pkl', 'dataset/x_38.pkl', 'dataset/x_16.pkl', 'dataset/x_12.pkl', 'dataset/

### Custom Pytorch Dataset

In [48]:
from torch.utils.data import Dataset, DataLoader
class CustomMultiPickleDataset(Dataset):
    """Map-style dataset that exposes many pickle files as one flat dataset.

    Every X pickle file (``x_<i>.pkl``) must have a sibling Y pickle file
    (``y_<i>.pkl``) in the same directory. Each file is split in place: the
    first ``int(n * (1 - testsize))`` rows belong to the train split and the
    remaining rows belong to the test split.

    Only an index map (dataset index -> [file index, row inside that file]) is
    kept in memory. Samples are read from disk on every ``__getitem__`` call,
    which loads the whole X and Y pickle to return a single row, so throughput
    is bound by disk I/O.

    NOTE(security): files are read with ``pickle.load``, which can execute
    arbitrary code. Only point this class at pickle files you trust.
    """

    def __init__(self, pickle_files, train=True, testsize=0.1):
        """
        params:
          pickle_files: List containing all X pickle filepaths the dataset spans
          train: Whether dataset is pointing to train or test dataset
          testsize: fraction of every pickle file reserved for the test split
        """
        self.pickle_files = pickle_files
        self.train = train
        self.testsize = testsize

        print(f"Total pickle files : {len(self.pickle_files)}")

        # Only the map for the requested split is built.
        # Eg map_trainidx = {1: [2, 4]} means dataset index 1 is stored at
        # row 4 of the pickle file at position 2 in pickle_files.
        if train:
            self._generate_train_indexes()
        else:
            self._generate_test_indexes()

    @staticmethod
    def _load_pickle(filepath):
        """Load one pickle file, closing the file handle afterwards."""
        with open(filepath, 'rb') as f:
            return pickle.load(f)

    def _split_sizes(self, n_samples):
        """Return (train rows, test rows) for a file holding n_samples rows."""
        n_samples_train = int(n_samples * (1 - self.testsize))
        return n_samples_train, n_samples - n_samples_train

    def _generate_train_indexes(self):
        """Build map_trainidx over the leading train rows of every file."""
        self.map_trainidx = {}
        train_cntr = 0
        for i, pickle_file in enumerate(self.pickle_files):
            n_rows = self._load_pickle(pickle_file).shape[0]
            n_samples_train, _ = self._split_sizes(n_rows)
            for offset in range(n_samples_train):
                # pickle file index and location in that pickle file
                self.map_trainidx[train_cntr + offset] = [i, offset]
            train_cntr += n_samples_train
        self.total_samples_train = len(self.map_trainidx)
        print(f"Total train - samples {self.total_samples_train}")

    def _generate_test_indexes(self):
        """Build map_testidx over the trailing test rows of every file."""
        self.map_testidx = {}
        test_cntr = 0
        for i, pickle_file in enumerate(self.pickle_files):
            n_rows = self._load_pickle(pickle_file).shape[0]
            n_samples_train, n_samples_test = self._split_sizes(n_rows)
            for offset in range(n_samples_test):
                # Test rows start right after the train rows of the same file,
                # so the row offset is n_samples_train. Offsetting by
                # n_samples_test instead would point back into training rows.
                self.map_testidx[test_cntr + offset] = [i, n_samples_train + offset]
            test_cntr += n_samples_test
        self.total_samples_test = len(self.map_testidx)
        print(f"Total validation - samples {self.total_samples_test}")

    def __len__(self):
        """Number of samples in the split this instance points to."""
        if self.train:
            return self.total_samples_train
        return self.total_samples_test

    def __getitem__(self, idx):
        """Return the (x, y) sample stored at dataset index idx."""
        idx_map = self.map_trainidx if self.train else self.map_testidx
        file_idx, rel_idx = idx_map[idx]

        # For each index, find the corresponding X and Y pickle files. Only the
        # file name is rewritten, so an 'x_' inside a directory name is left
        # untouched.
        x_filepath = self.pickle_files[file_idx]
        x_dir, x_name = os.path.split(x_filepath)
        y_filepath = os.path.join(x_dir, x_name.replace('x_', 'y_', 1))

        # Load the pickle files and return the sample
        x = self._load_pickle(x_filepath)[rel_idx]
        y = self._load_pickle(y_filepath)[rel_idx]
        return x, y

#### Defining Params

In [49]:
batch_size = 8   # number of samples per batch yielded by each DataLoader
n_parallel = 4   # passed to the DataLoaders as num_workers
testsize = 0.2   # passed to CustomMultiPickleDataset as testsize

#### Loading training dataset

In [50]:
# Train split: building the dataset scans every pickle file once to create the
# index map; batches are then drawn in shuffled order across all files.
dataloader_train = DataLoader(
    dataset=CustomMultiPickleDataset(picklefiles, train=True, testsize=testsize),
    batch_size=batch_size,
    shuffle=True,
    num_workers=n_parallel,
    pin_memory=False,
)

Total pickle files : 53
Total train - samples 84800


In [53]:
Epochs = 4
for i in range(Epochs):
    for j, (x, y) in enumerate(dataloader_train):
        # A real training step would consume the (x, y) batch here.
        print(j, x.shape, y.shape)
        break  # demo only: stop after the first batch
    break  # demo only: stop after the first epoch



0 torch.Size([8, 11, 6]) torch.Size([8, 3, 6])


#### Loading test dataset

In [55]:
# Test split: same pickle files, but train=False selects the held-out rows of
# every file.
dataloader_test = DataLoader(
    dataset=CustomMultiPickleDataset(picklefiles, train=False, testsize=testsize),
    batch_size=batch_size,
    shuffle=True,
    num_workers=n_parallel,
    pin_memory=False,
)

Total pickle files : 53
Total validation - samples 21200




In [56]:
for j, (x, y) in enumerate(dataloader_test):
    # Inference on the (x, y) test batch would go here.
    print(j, x.shape, y.shape)
    break  # demo only: stop after the first batch

0 torch.Size([8, 11, 6]) torch.Size([8, 3, 6])
