In [None]:
# Problems with our previous approach:
# - No standard interface for accessing the data
# - No easy way to apply transformations
# - No built-in support for shuffling and sampling
# - No batch management or parallelization

In [None]:
from sklearn.datasets import make_classification
import torch

In [None]:
# Step 1: Generate a small synthetic binary-classification dataset with sklearn
X, y = make_classification(
    n_samples=10,      # rows in the dataset
    n_features=2,      # total feature columns
    n_informative=2,   # features that carry class signal
    n_redundant=0,     # no redundant features
    n_classes=2,       # number of target classes
    random_state=42,   # fixed seed for reproducibility
)

In [None]:
# Inspect the generated feature matrix (still a NumPy array at this point)
X

array([[ 1.06833894, -0.97007347],
       [-1.14021544, -0.83879234],
       [-2.8953973 ,  1.97686236],
       [-0.72063436, -0.96059253],
       [-1.96287438, -0.99225135],
       [-0.9382051 , -0.54304815],
       [ 1.72725924, -1.18582677],
       [ 1.77736657,  1.51157598],
       [ 1.89969252,  0.83444483],
       [-0.58723065, -1.97171753]])

In [None]:
# (n_samples, n_features) of the feature matrix
X.shape

(10, 2)

In [None]:
# Inspect the class label generated for each sample
y

array([1, 0, 0, 0, 0, 1, 1, 1, 1, 0])

In [None]:
# One label per sample, so this is a 1-D array of length n_samples
y.shape

(10,)

In [None]:
# Convert the data to PyTorch tensors.
# torch.as_tensor (instead of torch.tensor) keeps this cell safe to re-run:
# X and y are rebound here, so on a second execution they are already tensors
# of the requested dtype and are returned unchanged, whereas torch.tensor
# would emit a copy-construct UserWarning.
# Note: as_tensor may share memory with the source NumPy array when no dtype
# conversion is needed; the arrays are not used again after this rebinding.
X = torch.as_tensor(X, dtype = torch.float32)  # features as 32-bit floats
y = torch.as_tensor(y, dtype = torch.long)     # class indices as 64-bit ints


In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
class CustomDataset(Dataset):
  """Map-style dataset that pairs each feature row with its label.

  Args:
    features: Indexable container exposing `.shape`, with samples along the
      first dimension (e.g. a 2-D tensor).
    labels: Indexable, sized container holding one target per sample.

  Raises:
    ValueError: If `features` and `labels` hold a different number of samples.
  """

  def __init__(self, features, labels):
    # Fail fast on misaligned inputs; otherwise the mismatch only surfaces as
    # an IndexError (or silently ignored labels) part-way through iteration.
    num_samples = features.shape[0]
    if num_samples != len(labels):
      raise ValueError(
        f"features and labels must have the same number of samples, "
        f"got {num_samples} and {len(labels)}"
      )

    self.features = features
    self.labels = labels

  def __len__(self):
    """Return the number of samples (size of the first feature dimension)."""
    return self.features.shape[0]

  def __getitem__(self, index):
    """Return the `(features, label)` pair stored at `index`."""
    return self.features[index], self.labels[index]


In [None]:
# Wrap the feature and label tensors in the CustomDataset defined above
dataset = CustomDataset(X, y)

In [None]:
# Number of samples, as reported by CustomDataset.__len__
len(dataset)

10

In [None]:
# First (features, label) pair, fetched through CustomDataset.__getitem__
dataset[0]

(tensor([ 1.0683, -0.9701]), tensor(1))

In [None]:
# Second (features, label) pair
dataset[1]

(tensor([-1.1402, -0.8388]), tensor(0))

In [None]:
# Batches of 2 samples, reshuffled on every pass over the loader.
# A dedicated seeded generator makes the shuffled order reproducible after
# "Restart Kernel -> Run All"; its state advances with each pass, so successive
# epochs are still shuffled differently from one another.
dataloader = DataLoader(
    dataset,
    batch_size = 2,
    shuffle = True,
    generator = torch.Generator().manual_seed(42),
)

In [None]:
# Walk through one epoch: show each mini-batch's features and labels,
# followed by a 50-character divider line
for batch_features, batch_labels in dataloader:
  print(batch_features, batch_labels, "-" * 50, sep = "\n")


tensor([[ 1.7273, -1.1858],
        [ 1.0683, -0.9701]])
tensor([1, 1])
--------------------------------------------------
tensor([[ 1.8997,  0.8344],
        [-1.9629, -0.9923]])
tensor([1, 0])
--------------------------------------------------
tensor([[-2.8954,  1.9769],
        [-0.7206, -0.9606]])
tensor([0, 0])
--------------------------------------------------
tensor([[-1.1402, -0.8388],
        [-0.9382, -0.5430]])
tensor([0, 1])
--------------------------------------------------
tensor([[ 1.7774,  1.5116],
        [-0.5872, -1.9717]])
tensor([1, 0])
--------------------------------------------------


In [None]:
# Parallelization
# num_workers sets how many worker processes load the data; increasing it lets batches be formed in parallel

In [None]:
# Samplers decide the order in which samples are drawn from the dataset. The two defaults are
# SequentialSampler (used when shuffle = False) and RandomSampler (used when shuffle = True).
# When we have an imbalanced dataset (e.g. class 0 is 1 percent of the data and class 1 is 99 percent), we need to
# build a custom sampler so that every batch still gets its share (about 1 percent) of samples from class 0


In [None]:
# collate function: used to combine the data from a number of rows, i.e. it specifies how a list of samples is merged into one batch

In [None]:
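# Important DataLoader parameters:
# 1. dataset: the Dataset object to draw samples from
# 2. batch_size: number of samples per batch
# 3. shuffle: if True, the data is reshuffled at every epoch
# 4. num_workers: number of worker processes used to load batches in parallel
# 5. pin_memory: if True, batches are copied into pinned (page-locked) host memory, which speeds up transfer to the GPU
# 6. drop_last: with 32 rows and batch_size = 10 there are 3 full batches plus a last batch of 2; drop_last = True discards that incomplete last batch
# 7. collate_fn: a callable that processes a list of samples into a batch
# 8. sampler: defines the strategy for drawing samples (cannot be combined with shuffle = True)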