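In my previous [blog-post](/posts/2024-10-24-data-loading-daft/) I introduced Daft. In this post I share a few practical recipes: loading a HuggingFace dataset into Daft, decoding and transforming its images, and getting the result ready for PyTorch.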

## Loading From HuggingFace Datasets

HuggingFace Datasets is one of the biggest dataset providers out there, so integrating with it is of great importance. Luckily, it's easy!

In [9]:
!pip install -U daft

Collecting daft
  Downloading daft-0.1.2-py3-none-any.whl (11 kB)
Installing collected packages: daft
Successfully installed daft-0.1.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
from dataclasses import dataclass
import datasets
import daft

# Adjust Daft's execution settings up front: switch on the native executor
# and set the default morsel size to 256.
daft.set_execution_config(enable_native_executor=True, default_morsel_size=256)


@dataclass
class DaftHFDatasetWrapper:
    """Bundle a loaded HuggingFace dataset with Daft views of its splits.

    The HuggingFace object is kept next to the Daft DataFrames so that its
    metadata (for example the feature definitions) stays reachable after the
    conversion to Daft.
    """

    # Object returned by `datasets.load_dataset`. It is indexed by split name
    # ("train", "val") elsewhere in this file, i.e. it is a DatasetDict rather
    # than a single Dataset.
    _ds: datasets.DatasetDict
    # Daft DataFrame holding the training rows.
    df_train: daft.DataFrame
    # Daft DataFrame holding the validation rows.
    df_val: daft.DataFrame

def load_from_hf_dataset(
    dataset_name: str = "detection-datasets/fashionpedia",
    max_rows: int = 1000,
) -> DaftHFDatasetWrapper:
    """Load a HuggingFace dataset and expose its train/val splits as Daft frames.

    Args:
        dataset_name: Identifier passed to `datasets.load_dataset`. Defaults to
            the fashionpedia detection dataset.
        max_rows: Number of leading rows kept from each split. Defaults to
            1000 so the example stays small.

    Returns:
        A `DaftHFDatasetWrapper` holding the loaded HuggingFace object and one
        Daft DataFrame per split ("train" and "val").
    """
    ds = datasets.load_dataset(dataset_name)

    def _head_as_daft(split: str) -> daft.DataFrame:
        # Slice the split's underlying table first so only `max_rows` rows
        # are handed to `daft.from_arrow`.
        return daft.from_arrow(ds[split].data.table[:max_rows])

    return DaftHFDatasetWrapper(ds, _head_as_daft("train"), _head_as_daft("val"))

# Load the dataset once; the wrapper keeps the HuggingFace object and the
# Daft DataFrames side by side.
daft_ds = load_from_hf_dataset()
# custom to fashionpedia: the number of classes is read from the HuggingFace
# feature metadata of the nested `objects` -> `category` field (train split).
fashionpedia_num_classes = daft_ds._ds["train"].features["objects"].feature["category"].num_classes

# Convert the first two rows to pandas; as the last expression of the cell
# the result is rendered by the notebook.
daft_ds.df_train.limit(2).to_pandas()   # pretty-print

Unnamed: 0,image_id,image,width,height,objects
0,23,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,682,1024,"{'bbox_id': [150311, 150312, 150313, 150314], ..."
1,25,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,683,1024,"{'bbox_id': [158953, 158954, 158955, 158956, 1..."


With our DataFrames ready we can start to load the data and train a model.

To keep things simple we'll use an off-the-shelf model to do Object Detection.

In [5]:
#| code-fold: true
import lightning as L

from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2
from torchvision.transforms.functional import convert_image_dtype
from torch.optim import AdamW
import torch

class SimpleModel(L.LightningModule):
    """Lightning wrapper around torchvision's `fasterrcnn_resnet50_fpn_v2`.

    Args:
        num_classes: Forwarded to `fasterrcnn_resnet50_fpn_v2` as `num_classes`.
    """

    def __init__(self, num_classes: int):
        super().__init__()
        self.model = fasterrcnn_resnet50_fpn_v2(num_classes=num_classes)

    def forward(self, images, targets=None):
        """
        Forward method for training and inference.
        - During training, provide `targets` for loss computation.
        - During inference, `targets` should be None for predictions.
        """
        # Test against None explicitly: truth-testing a container would treat
        # an empty `targets` as "not supplied" and is ambiguous for tensors.
        if targets is not None:
            return self.model(images, targets)
        return self.model(images)

    def _compute_loss(self, batch):
        """Return the summed loss for one `(images, targets)` batch.

        Images are converted to float before being passed to the detector
        together with their targets; the entries of the returned loss dict
        are added up into a single value.
        """
        images, targets = batch
        images = [convert_image_dtype(img, dtype=torch.float) for img in images]
        loss_dict = self.model(images, targets)
        return sum(loss_dict.values())

    def training_step(self, batch, batch_idx):
        """
        Training step to compute loss.
        """
        loss = self._compute_loss(batch)

        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """
        Validation step for loss computation.

        torchvision's detection models return the loss dict only in training
        mode; in eval mode (which Lightning uses for validation) they return
        predictions, which have no `.values()`. The detector is therefore
        switched to training mode for the loss computation and its previous
        mode is restored afterwards, even if the computation raises.

        Caveat: while in training mode, mode-dependent layers (e.g. batch
        norm) behave as during training for this step.
        """
        was_training = self.model.training
        self.model.train()
        try:
            val_loss = self._compute_loss(batch)
        finally:
            self.model.train(was_training)

        self.log("val_loss", val_loss, prog_bar=True, logger=True)
        return val_loss

    def configure_optimizers(self):
        """
        Configures the AdamW optimizer (lr=1e-4) and a StepLR scheduler that
        multiplies the learning rate by 0.1 every 5 scheduler steps.
        """
        optimizer = AdamW(self.model.parameters(), lr=1e-4)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
        return [optimizer], [scheduler]

### Producing an Image from HF Datasets

To go from their "image" column to a Daft image we need to decode the image bytes. It's quite simple, and the recipe below achieves this!

In [6]:
#| fig-cap: "A Daft DataFrame, nifty as it shows images in notebooks!"
# Expression: pull the raw `bytes` field out of the HuggingFace image struct
# and decode it into a Daft image.
hf_img_to_daft_img = daft.col("image").struct.get("bytes").image.decode()

# Reuse the "image" column name so the decoded image replaces the struct.
daft_ds.df_train = daft_ds.df_train.with_column(
    "image", hf_img_to_daft_img
)
daft_ds.df_train.show(2)

image_id Int64,image Image[MIXED],width Int64,height Int64,"objects Struct[bbox_id: List[Int64], category: List[Int64], bbox: List[FixedSizeList[Float64; 4]], area: List[Int64]]"
23,,682,1024,"{bbox_id: [150311, 150312, 150313, 150314], category: [23, 23, 33, 10], bbox: [[445, 910, 505, 983], [239, 940, 284, 994], [298, 282, 386, 352], [210, 282, 448, 665]], area: [1422, 843, 373, 56375], }"
25,,683,1024,"{bbox_id: [158953, 158954, 158955, 158956, 158957, 158958, 158959, 158960, 158961, 158962], category: [2, 33, 31, 31, 13, 7, 22, 22, 23, 23], bbox: [[182, 220, 472, 647], [294, 221, 407, 257], [405, 297, 472, 647], [182, 264, 266, 621], [284, 135, 372, 169], [238, 537, 414, 606], [351, 732, 417, 922], [202, 749, 270, 930], [200, 921, 256, 979], [373, 903, 455, 966]], area: [87267, 1220, 16895, 18541, 1468, 9360, 8629, 8270, 2717, 3121], }"


In [7]:
# Look at the nested `objects` struct column on its own (first two rows).
daft_ds.df_train.select("objects").show(2)

"objects Struct[bbox_id: List[Int64], category: List[Int64], bbox: List[FixedSizeList[Float64; 4]], area: List[Int64]]"
"{bbox_id: [150311, 150312, 150313, 150314], category: [23, 23, 33, 10], bbox: [[445, 910, 505, 983], [239, 940, 284, 994], [298, 282, 386, 352], [210, 282, 448, 665]], area: [1422, 843, 373, 56375], }"
"{bbox_id: [158953, 158954, 158955, 158956, 158957, 158958, 158959, 158960, 158961, 158962], category: [2, 33, 31, 31, 13, 7, 22, 22, 23, 23], bbox: [[182, 220, 472, 647], [294, 221, 407, 257], [405, 297, 472, 647], [182, 264, 266, 621], [284, 135, 372, 169], [238, 537, 414, 606], [351, 732, 417, 922], [202, 749, 270, 930], [200, 921, 256, 979], [373, 903, 455, 966]], area: [87267, 1220, 16895, 18541, 1468, 9360, 8629, 8270, 2717, 3121], }"


### Transforming and working with images

Daft has some unique syntax for simple operations like `resize`.  
To do the standard transforms we have to move into the Python domain, and that's done by utilizing `np.array`s.

In [8]:
def apply_torch_transform(bboxes):
    """Build a new float32 `torch.Tensor` from `bboxes` and return it."""
    return torch.tensor(bboxes, dtype=torch.float32)

# Add a "bboxes" column: each row's `objects.bbox` list is run through
# `apply_torch_transform` and stored as a float32 tensor.
daft_ds.df_train = daft_ds.df_train.with_column(
    "bboxes",
    daft.col("objects").struct.get("bbox").apply(
        apply_torch_transform,
        return_dtype=daft.DataType.tensor(daft.DataType.float32()),
    ),
)
daft_ds.df_train.show(2)

image_id Int64,image Image[MIXED],width Int64,height Int64,"objects Struct[bbox_id: List[Int64], category: List[Int64], bbox: List[FixedSizeList[Float64; 4]], area: List[Int64]]",bboxes Tensor(Float32)
23,,682,1024,"{bbox_id: [150311, 150312, 150313, 150314], category: [23, 23, 33, 10], bbox: [[445, 910, 505, 983], [239, 940, 284, 994], [298, 282, 386, 352], [210, 282, 448, 665]], area: [1422, 843, 373, 56375], }","<Tensor shape=(4, 4)>"
25,,683,1024,"{bbox_id: [158953, 158954, 158955, 158956, 158957, 158958, 158959, 158960, 158961, 158962], category: [2, 33, 31, 31, 13, 7, 22, 22, 23, 23], bbox: [[182, 220, 472, 647], [294, 221, 407, 257], [405, 297, 472, 647], [182, 264, 266, 621], [284, 135, 372, 169], [238, 537, 414, 606], [351, 732, 417, 922], [202, 749, 270, 930], [200, 921, 256, 979], [373, 903, 455, 966]], area: [87267, 1220, 16895, 18541, 1468, 9360, 8629, 8270, 2717, 3121], }","<Tensor shape=(10, 4)>"


In [None]:
import numpy as np
from torchvision.transforms import v2 as T

# Expression that casts the "image" column to a uint8 tensor so it can be
# handed to Python-side transforms.
img_to_tensor = daft.col("image").cast(daft.DataType.tensor(dtype=daft.DataType.uint8()))

# Image preprocessing pipeline; the steps run in the order listed.
# NOTE(review): torchvision's v2 documentation marks `ToTensor` as deprecated
# in favour of `ToImage` + `ToDtype` — confirm against the installed version.
transforms = T.Compose([
    T.ToTensor(),
    # Random crop, resized to 224x224.
    T.RandomResizedCrop(size=(224, 224), antialias=True),
    # Horizontal flip with probability 0.5.
    T.RandomHorizontalFlip(p=0.5),
    # Normalisation with three mean/std values (one per channel).
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# NOTE(review): this UDF is not referenced anywhere in the visible source
# (`transform_images` uses `.apply` with lambdas instead). It returns a
# 2-tuple although `return_dtype` declares a single float32 tensor, and it
# runs the image pipeline on `bbox` as well — confirm whether it is still
# needed and what it is meant to return.
@daft.udf(return_dtype=daft.DataType.tensor(daft.DataType.float32()))
def transform(image, bbox):
    """Apply the module-level `transforms` pipeline to `image` and to `bbox`."""
    return transforms(image), transforms(bbox)


def transform_images(df: daft.DataFrame) -> daft.DataFrame:
    """Return `df` with "image" and "bboxes" replaced by `transforms` outputs.

    The "image" column is first cast through `img_to_tensor`; both resulting
    columns are declared as float32 tensors.
    """
    float32_tensor = daft.DataType.tensor(daft.DataType.float32())

    image_expr = img_to_tensor.apply(lambda x: transforms(x), return_dtype=float32_tensor)
    # NOTE(review): the image pipeline is applied to "bboxes" as well; the
    # recorded output shows bboxes with shape (3, 224, 224), so the boxes are
    # presumably not kept in sync with the random crop/flip of the image —
    # confirm whether a joint image/box transform is intended.
    bboxes_expr = daft.col("bboxes").apply(lambda x: transforms(x), return_dtype=float32_tensor)

    return df.with_columns({"image": image_expr, "bboxes": bboxes_expr})




In [7]:
# Replace the train frame with its transformed version.
daft_ds.df_train = transform_images(daft_ds.df_train)

In [None]:
import torchvision

# Materialise the first two rows and display them.
df = daft_ds.df_train.limit(2).collect()
df

image_id Int64,image Tensor(Float32),width Int64,height Int64,"objects Struct[bbox_id: List[Int64], category: List[Int64], bbox: List[FixedSizeList[Float64; 4]], area: List[Int64]]",bboxes Tensor(Float32)
23,"<Tensor shape=(3, 224, 224)>",682,1024,"{bbox_id: [150311, 150312, 150313, 150314], category: [23, 23, 33, 10], bbox: [[445, 910, 505, 983], [239, 940, 284, 994], [298, 282, 386, 352], [210, 282, 448, 665]], area: [1422, 843, 373, 56375], }","<Tensor shape=(3, 224, 224)>"
25,"<Tensor shape=(3, 224, 224)>",683,1024,"{bbox_id: [158953, 158954, 158955, 158956, 158957, 158958, 158959, 158960, 158961, 158962], category: [2, 33, 31, 31, 13, 7, 22, 22, 23, 23], bbox: [[182, 220, 472, 647], [294, 221, 407, 257], [405, 297, 472, 647], [182, 264, 266, 621], [284, 135, 372, 169], [238, 537, 414, 606], [351, 732, 417, 922], [202, 749, 270, 930], [200, 921, 256, 979], [373, 903, 455, 966]], area: [87267, 1220, 16895, 18541, 1468, 9360, 8629, 8270, 2717, 3121], }","<Tensor shape=(3, 224, 224)>"


In [None]:
# NOTE(review): `.loc` is pandas-style indexing, while `df` is assigned from
# `.collect()` in the visible source — presumably this cell was run against a
# pandas conversion of the frame; confirm. The recorded shape is 1-D, i.e.
# the "data" entry is a flat buffer rather than an image-shaped array.
torch.as_tensor(df.loc[0, "image"]["data"]).shape

torch.Size([2095104])

In [None]:
# Shape of the first row's bounding boxes.
torch.as_tensor(df.loc[0, "bboxes"]).shape

torch.Size([4, 4])

In [None]:
# NOTE(review): the recorded run fails with "Pass individual images, not
# batches". The image passed here is the flat `["data"]` buffer (recorded as
# 1-D), whereas `draw_bounding_boxes` presumably expects a single (C, H, W)
# image tensor — the buffer likely needs reshaping first; confirm against the
# torchvision documentation.
torchvision.utils.draw_bounding_boxes(torch.as_tensor(df.loc[0, "image"]["data"]), torch.as_tensor(df.loc[0, "bboxes"]))

ValueError: Pass individual images, not batches

### To PyTorch