In [1]:
from datasets import Dataset, DatasetDict
import pandas as pd

# Dataframe

## From folder table

In [2]:
from pathlib import Path

In [3]:
# Root folder of the dataset; the cells below read the label from a file's
# parent folder name and the split from its grandparent folder name.
path = Path('/home/adriano/datasets/anomaly_detection/cans')
# Keep regular files only (a directory whose name contains a dot also matches
# '*.*') and sort them so the row order is reproducible from run to run.
files = sorted(p for p in path.rglob('*.*') if p.is_file())

In [4]:
# One row per file. The label is the name of the file's parent folder and the
# split is the name of the folder above that; both are derived while 'path'
# still holds Path objects, and 'path' is converted to plain strings last.
df = (
    pd.DataFrame({'path': files})
    .assign(
        label=lambda frame: frame['path'].map(lambda p: p.parent.name),
        split=lambda frame: frame['path'].map(lambda p: p.parent.parent.name),
        path=lambda frame: frame['path'].astype(str),
    )
)
df.tail()

Unnamed: 0,path,label,split
62,/home/adriano/datasets/anomaly_detection/cans/...,good,train
63,/home/adriano/datasets/anomaly_detection/cans/...,good,train
64,/home/adriano/datasets/anomaly_detection/cans/...,good,train
65,/home/adriano/datasets/anomaly_detection/cans/...,good,train
66,/home/adriano/datasets/anomaly_detection/cans/...,good,train


## HF dataset

In [14]:
# single dataset: every row of df goes into one Dataset, and 'split' stays a
# regular column next to 'path' and 'label'
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['path', 'label', 'split'],
    num_rows: 67
})

In [15]:
# multiple datasets: one Dataset per value of the 'split' column, keyed by the
# split name. Each group's index is reset before conversion (presumably so the
# original row index of df is not carried into the Dataset).
datasets = DatasetDict(
    {
        split_name: Dataset.from_pandas(split_frame.reset_index(drop=True))
        for split_name, split_frame in df.groupby('split')
    }
)
datasets

DatasetDict({
    test: Dataset({
        features: ['path', 'label', 'split'],
        num_rows: 52
    })
    train: Dataset({
        features: ['path', 'label', 'split'],
        num_rows: 15
    })
})

In [16]:
# first row of the train split: only path/label/split metadata at this point
datasets['train'][0]

{'path': '/home/adriano/datasets/anomaly_detection/cans/train/good/00063.jpg',
 'label': 'good',
 'split': 'train'}

### Getting the imgs

In [21]:
from PIL import Image
def read_img(examples):
    """Add an 'img' column to a batch by opening every file listed in 'path'.

    Args:
        examples: batch mapping column names to lists; must contain 'path'.

    Returns:
        The same `examples` object, with 'img' set to one `Image.open`
        result per path, in the same order as 'path'.
    """
    opened = []
    for img_path in examples['path']:
        opened.append(Image.open(img_path))
    examples['img'] = opened
    return examples

In [23]:
# Run read_img over batches of rows in every split; the result gains an 'img'
# column and replaces the previous `datasets` object.
datasets = datasets.map(read_img, batched=True)
datasets

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    test: Dataset({
        features: ['path', 'label', 'split', 'img'],
        num_rows: 52
    })
    train: Dataset({
        features: ['path', 'label', 'split', 'img'],
        num_rows: 15
    })
})

In [24]:
# same row as before, which now also carries an 'img' entry
datasets['train'][0]

{'path': '/home/adriano/datasets/anomaly_detection/cans/train/good/00063.jpg',
 'label': 'good',
 'split': 'train',
 'img': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x480 at 0x7F1EA791A610>}

### Map with input size != output size

In [27]:
from torchvision.transforms import FiveCrop

# Module-level crop op used by five_crop below: torchvision's FiveCrop with
# size 300 (four corner crops plus the center crop, per the torchvision docs).
crop = FiveCrop(300)

def five_crop(examples):
    """Replace every image of a batch by its crops, repeating labels to match.

    The output batch has more rows than the input batch: one row per crop
    returned by the module-level `crop` transform.

    Args:
        examples: batch mapping column names to lists; must contain 'img'
            and 'label' lists of equal length.

    Returns:
        The same `examples` object with 'img' and 'label' overwritten by the
        expanded lists. Any other column is left at its original length, so
        the caller has to drop those columns (e.g. via `remove_columns`).
    """
    crops = []
    labels = []
    for img, label in zip(examples['img'], examples['label']):
        img_crops = crop(img)
        crops.extend(img_crops)
        # Repeat the label once per crop actually produced rather than a
        # hard-coded 5, so 'img' and 'label' stay aligned if `crop` changes.
        labels.extend([label] * len(img_crops))
    examples['img'] = crops
    examples['label'] = labels
    return examples

In [28]:
# five_crop returns more rows than it receives and only expands 'img' and
# 'label', so the columns it does not expand ('path', 'split') are dropped here.
# Rows are processed in batches of 8.
datasets = datasets.map(five_crop, batched=True, remove_columns=['path', 'split'], batch_size=8)
datasets

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    test: Dataset({
        features: ['label', 'img'],
        num_rows: 260
    })
    train: Dataset({
        features: ['label', 'img'],
        num_rows: 75
    })
})

### img-level Augmentation

In [29]:
from torchvision.transforms import Compose, ColorJitter, ToTensor, Resize

# Per-image pipeline used by `transforms` below: resize, colour jitter, then
# conversion to a tensor.
tfms = Compose(
    [    Resize(224),  # int size: per torchvision docs the shorter edge becomes 224
         ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.25),  # colour augmentation, same strength on all four knobs
         ToTensor(),  # PIL image -> tensor
    ]
)

def transforms(examples):
    """Run the module-level `tfms` pipeline over every image of a batch.

    Args:
        examples: batch mapping column names to lists; must contain 'img'.

    Returns:
        The same `examples` object, with 'img' replaced by the list of
        transformed images (same order, same length).
    """
    examples['img'] = list(map(tfms, examples['img']))
    return examples

In [30]:
## on-the-fly transform: `transforms` is attached to the train split only,
## so rows of the test split are not affected by it
datasets['train'].set_transform(transforms)

In [31]:
# indexing the train split goes through `transforms`, so 'img' comes back as a tensor
datasets['train'][0]

{'label': 'good',
 'img': tensor([[[0.1490, 0.1490, 0.1529,  ..., 0.1294, 0.1294, 0.1373],
          [0.1529, 0.1529, 0.1569,  ..., 0.1294, 0.1294, 0.1333],
          [0.1529, 0.1529, 0.1569,  ..., 0.1294, 0.1294, 0.1333],
          ...,
          [0.1137, 0.1137, 0.1137,  ..., 0.1922, 0.2314, 0.2784],
          [0.1137, 0.1137, 0.1137,  ..., 0.1294, 0.1373, 0.1451],
          [0.1137, 0.1137, 0.1137,  ..., 0.1176, 0.1176, 0.1216]],
 
         [[0.1490, 0.1490, 0.1529,  ..., 0.1294, 0.1294, 0.1373],
          [0.1529, 0.1529, 0.1569,  ..., 0.1294, 0.1294, 0.1333],
          [0.1529, 0.1529, 0.1569,  ..., 0.1294, 0.1294, 0.1333],
          ...,
          [0.1137, 0.1137, 0.1137,  ..., 0.1922, 0.2314, 0.2784],
          [0.1137, 0.1137, 0.1137,  ..., 0.1294, 0.1373, 0.1451],
          [0.1137, 0.1137, 0.1137,  ..., 0.1176, 0.1176, 0.1216]],
 
         [[0.1490, 0.1490, 0.1529,  ..., 0.1294, 0.1294, 0.1373],
          [0.1529, 0.1529, 0.1569,  ..., 0.1294, 0.1294, 0.1333],
          [0.15

In [32]:
# no transform was set on the test split, so 'img' is still a PIL image here
datasets['test'][0]

{'label': 'good',
 'img': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=300x300 at 0x7F1E9F079B80>}

## From dataset to dataloader

In [33]:
from torch.utils.data import DataLoader

# Batches of 8 rows from the train split; no shuffle argument is passed, so
# the DataLoader's default ordering applies.
train_dl = DataLoader(datasets['train'], batch_size=8)

In [34]:
# Grab the first batch only. Unlike a `for ... break` loop, this raises
# StopIteration right here if the loader is empty, instead of leaving `batch`
# undefined (or stale from an earlier run) for the next cell.
batch = next(iter(train_dl))

In [35]:
# the collated batch: a list of labels plus the images stacked into one tensor
batch

{'label': ['good', 'good', 'good', 'good', 'good', 'good', 'good', 'good'],
 'img': tensor([[[[0.1490, 0.1490, 0.1529,  ..., 0.1176, 0.1255, 0.1294],
           [0.1529, 0.1529, 0.1569,  ..., 0.1255, 0.1255, 0.1255],
           [0.1529, 0.1529, 0.1569,  ..., 0.1255, 0.1176, 0.1255],
           ...,
           [0.0941, 0.0941, 0.0941,  ..., 0.2118, 0.2706, 0.3373],
           [0.0941, 0.0941, 0.0941,  ..., 0.1255, 0.1294, 0.1451],
           [0.0941, 0.0941, 0.0941,  ..., 0.0980, 0.1059, 0.1098]],
 
          [[0.1490, 0.1490, 0.1529,  ..., 0.1176, 0.1255, 0.1294],
           [0.1529, 0.1529, 0.1569,  ..., 0.1255, 0.1255, 0.1255],
           [0.1529, 0.1529, 0.1569,  ..., 0.1255, 0.1176, 0.1255],
           ...,
           [0.0941, 0.0941, 0.0941,  ..., 0.2118, 0.2706, 0.3373],
           [0.0941, 0.0941, 0.0941,  ..., 0.1255, 0.1294, 0.1451],
           [0.0941, 0.0941, 0.0941,  ..., 0.0980, 0.1059, 0.1098]],
 
          [[0.1490, 0.1490, 0.1529,  ..., 0.1176, 0.1255, 0.1294],
        