# Dataset & Dataloader

## `torch.utils.data.Dataset` : Dataset's role

### 1. Read & Split

1. Read dataset
2. Shuffle before splitting into train/valid/test sets
3. Split dataset into train/valid/test set


### 2. Preprocessing

1. Remove unnecessary rows
    - e.g., rows with a high null ratio
    
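2. Standardization/Scaling (statistics computed from the training set)
    - Standard scaling
    - Min/Max scaling
    - Fit the scaler on the training set only; apply the same fitted transform to the valid/test sets (never re-fit on them)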
 
## `torch.utils.data.DataLoader` : Dataloader's role
 
### 3. Iterator

1. Shuffle for each epoch
2. Get tensor chunk with mini-batch size
3. Yield mini-batch for each iteration

* * * 

# Load dataset

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the breast cancer dataset through scikit-learn's loader. The returned
# object's `.data`, `.feature_names` and `.target` attributes are what the
# next cell uses to build the DataFrame.
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [3]:
# Feature table with the target appended as the last column, named 'class'.
# `class` is a Python keyword, so the column name is passed to `assign`
# through dict unpacking rather than as a bare keyword argument.
df = pd.DataFrame(
    cancer.data,
    columns=cancer.feature_names,
).assign(**{'class': cancer.target})

# Show the last rows as a quick sanity check of the assembled frame.
df.tail()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,class
564,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,0.05623,...,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115,0
565,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,0.05533,...,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637,0
566,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,0.159,0.05648,...,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782,0
567,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,0.07016,...,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124,0
568,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,0.1587,0.05884,...,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039,1


# Dataset & DataLoader

### `torch.utils.data.Dataset`

```
class CustomDataset(Dataset):
    
    def __init__(self, *args, **kwargs):
        ...
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]
```



### `torch.utils.data.DataLoader`

```
dataloader = DataLoader(
    dataset=CustomDataset(x[0], y[0]),
    batch_size=batch_size,
    shuffle=True,
)
```

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [5]:
from torch.utils.data import Dataset, DataLoader

In [6]:
class CustomDataset(Dataset):
    """Map-style dataset that pairs each sample with its label by position.

    Args:
        data: Sized, indexable container of samples (for a tensor, the first
            dimension enumerates the samples).
        labels: Sized, indexable container of labels aligned with ``data``.

    Raises:
        ValueError: If ``data`` and ``labels`` do not have the same length.
    """

    def __init__(self, data, labels):
        # Fail fast on misaligned inputs: otherwise a shorter `labels` only
        # surfaces as an IndexError in the middle of an epoch, and a longer
        # one is silently ignored.
        if len(data) != len(labels):
            raise ValueError(
                f"data and labels must have the same length, "
                f"got {len(data)} and {len(labels)}"
            )

        super().__init__()

        self.data = data
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at position ``idx``."""
        return self.data[idx], self.labels[idx]

In [7]:
# Turn the whole frame (feature columns + 'class') into one float32 tensor.
data = torch.from_numpy(df.to_numpy()).to(torch.float32)

# Inputs are the first 10 columns; the target is the last column, sliced
# with `-1:` so that it keeps a trailing dimension of size 1.
x, y = data[:, :10], data[:, -1:]

In [8]:
# Train / validation / test proportions of the full dataset.
ratios = [.6, .2, .2]

data_size = data.size(0)

# Train and validation sizes are truncated to whole rows; the test set takes
# whatever remains so that the three counts always add up to `data_size`.
train_count, valid_count = (int(data_size * ratio) for ratio in ratios[:2])
test_count = data_size - (train_count + valid_count)

counts = [train_count, valid_count, test_count]

In [10]:
# Split data

# Fixed seed on a local generator: the split is identical on every
# Restart & Run All, without touching the global RNG used elsewhere.
SPLIT_SEED = 42
split_rng = torch.Generator().manual_seed(SPLIT_SEED)

# random permutation of the row indices
indices = torch.randperm(data_size, generator=split_rng)

# Shuffle the full table once and slice features / labels from the result,
# using the same column slices as the cell that first defines x and y.
# Reading from `data` (which is never rebound) rather than re-assigning x / y
# from themselves keeps this cell safe to re-run: x and y end up as tuples
# below, and feeding those tuples back into index_select would fail.
shuffled = torch.index_select(data, dim=0, index=indices)

x = shuffled[:, :10].split(counts, dim=0)
y = shuffled[:, -1:].split(counts, dim=0)


# x[0], y[0] training set
# x[1], y[1] validation set
# x[2], y[2] test set

In [11]:
batch_size = 128

# One loader per split, in the order train / validation / test.
# Allow shuffling only for training set: validation and test batches are
# served in a fixed order.
train_loader, valid_loader, test_loader = (
    DataLoader(
        dataset=CustomDataset(split_x, split_y),
        batch_size=batch_size,
        shuffle=is_training_split,
    )
    for split_x, split_y, is_training_split in zip(x, y, (True, False, False))
)

### Get data from the dataloader by iterating

```
for x_i, y_i in train_loader:
    print(x_i)
    print(y_i)
```

In [None]:
# Pull a single mini-batch from the training loader and display it.
# train_loader is built with shuffle=True, so the rows shown differ
# from run to run.
next(iter(train_loader))

# Reference

- Fast Campus NLP Online Course

In [12]:
# Reference

- Fast Campus NLP Online Course

[tensor([[9.5670e+00, 1.5910e+01, 6.0210e+01,  ..., 1.6670e-02, 1.5510e-01,
          6.4030e-02],
         [9.8760e+00, 1.9400e+01, 6.3950e+01,  ..., 3.0290e-02, 1.9450e-01,
          6.3220e-02],
         [1.1220e+01, 1.9860e+01, 7.1940e+01,  ..., 7.5830e-03, 1.9400e-01,
          6.0280e-02],
         ...,
         [1.1900e+01, 1.4650e+01, 7.8110e+01,  ..., 3.0030e-02, 1.9950e-01,
          7.8390e-02],
         [1.3740e+01, 1.7910e+01, 8.8120e+01,  ..., 1.3290e-02, 1.4730e-01,
          5.5800e-02],
         [9.0000e+00, 1.4400e+01, 5.6360e+01,  ..., 3.4720e-03, 1.7880e-01,
          6.8330e-02]]),
 tensor([[1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [0.],
         [0.],
         [0.],
         [1.],
         [1.],
         [1.],
         [0.],
         [1.],
         [1.],
         [0.],
         [1.],
         [0.],
         [1.],
         [1.],
         [0.],
         [0.],
         [1.],
         [1.],
         [1.],
         [1.],
         [0.],


# Reference

- Fast Campus NLP Online Course