# Basic Implementation of `Dataset` and `DataLoader`

# Import libraries

In [1]:
import pandas as pd
import numpy as np

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

In [4]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Create Dataset

In [7]:
# Generate a tiny synthetic classification problem: 10 samples, each with
# 2 informative features and no redundant ones.
# A fixed random_state keeps the generated data identical across
# Restart & Run All (same seed as the train/test split).
X, y = make_classification(
    n_samples=10,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    random_state=17,
)

X, y

(array([[-1.00757894, -0.84268432],
        [ 1.69839901,  0.78453756],
        [-2.32706971,  0.38155984],
        [-1.082216  ,  0.15516432],
        [-1.47250465, -0.9652876 ],
        [ 1.17779162, -0.23623751],
        [ 0.95861811,  2.26886267],
        [-1.92995075, -0.5948704 ],
        [ 0.99917262,  0.58879007],
        [ 0.45917189,  1.90741499]]),
 array([0, 1, 0, 0, 0, 1, 0, 0, 1, 1]))

# Train/Test Split

In [9]:
# Hold out 20% of the samples for testing; the seed fixes which rows land
# in each partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=17,
)
(X_train.shape, X_test.shape)

((8, 2), (2, 2))

# Convert `NumPy` Arrays to `Tensor`s

In [10]:
# Feature arrays -> float32 tensors
X_train_tensor = torch.from_numpy(X_train).to(torch.float32)
X_test_tensor = torch.from_numpy(X_test).to(torch.float32)

# Label arrays -> float32 tensors
y_train_tensor = torch.from_numpy(y_train).to(torch.float32)
y_test_tensor = torch.from_numpy(y_test).to(torch.float32)

# Custom `Dataset` Class

In [13]:
class MyCustomDatasetClass(Dataset):
  """Map-style dataset that pairs each feature entry with its label.

  Args:
    features: Sized, indexable collection of samples.
    labels: Sized, indexable collection of targets, one per sample.
    transform: Optional callable applied to a sample's features every time
      the sample is fetched. Defaults to ``None``, in which case features
      are returned exactly as stored.

  Raises:
    ValueError: If ``features`` and ``labels`` differ in length.
  """

  def __init__(self, features, labels, transform=None):
    # Fail fast on misaligned inputs. __len__ is driven by the labels, so a
    # mismatch would otherwise only show up later as an IndexError or as
    # silently ignored samples.
    if len(features) != len(labels):
      raise ValueError(
        "features and labels must have the same length, "
        f"got {len(features)} and {len(labels)}"
      )

    # Store dataset information
    self.features = features
    self.labels = labels
    self.transform = transform

  def __len__(self) -> int:
    """Return the number of samples in the dataset."""
    return len(self.labels)

  def __getitem__(self, index) -> tuple:
    """Return the ``(feature, label)`` pair stored at ``index``.

    If a ``transform`` was supplied at construction time it is applied to
    the feature here, at fetch time (useful for e.g. image or text data).
    The label is returned as stored.
    """
    index_feature = self.features[index]
    index_label = self.labels[index]

    if self.transform is not None:
      index_feature = self.transform(index_feature)

    return index_feature, index_label

In [14]:
# Wrap the training tensors in the custom dataset
dataset = MyCustomDatasetClass(
    features=X_train_tensor,
    labels=y_train_tensor,
)
dataset

<__main__.MyCustomDatasetClass at 0x7e9b2c439dc0>

In [15]:
# Number of samples in the dataset (calls MyCustomDatasetClass.__len__)
len(dataset)

8

In [19]:
# Sanity check: the feature served by the dataset at index 2 should match
# the same row of the underlying training tensor.
display(dataset[2][0], X_train_tensor[2])

tensor([-1.4725, -0.9653])

tensor([-1.4725, -0.9653])

# `DataLoader`

In [20]:
# Serve the dataset in mini-batches of 2 samples, in shuffled order.
# A seeded generator drives the shuffle so the batch order is reproducible
# after Restart & Run All.
dataloader = DataLoader(
    dataset=dataset,
    batch_size=2,
    shuffle=True,
    generator=torch.Generator().manual_seed(17),
)

In [23]:
# Walk through one pass over the loader, showing each batch of features
# followed by its labels and a separator line.
for feature, label in dataloader:
  display(feature, label)
  print('-' * 40)

tensor([[ 1.6984,  0.7845],
        [-1.4725, -0.9653]])

tensor([1., 0.])

----------------------------------------


tensor([[0.9586, 2.2689],
        [0.4592, 1.9074]])

tensor([0., 1.])

----------------------------------------


tensor([[-1.0076, -0.8427],
        [-1.0822,  0.1552]])

tensor([0., 0.])

----------------------------------------


tensor([[ 0.9992,  0.5888],
        [ 1.1778, -0.2362]])

tensor([1., 1.])

----------------------------------------
