# Test dataset generator
This notebook prepares raw tensor binary files from the Imagenette validation (`val`) split so that they can be used as a test set by the C runtime.

In [80]:
from fastai.vision.all import *
import torch
import os
import random

from export import serialize_fp32

# Maximum number of images per class to include in the test set
max_images_per_class = 20

We will generate raw tensor binary files for the C runtime as a `test` dataset. These files have the extension `.bin`. Each file is stored in a parent directory named after its encoded class label, from `0` to `9`.

## Generating test set
The test set is stored in a Hugging Face dataset repository, so you need to clone that repository first.

In [130]:
!git clone https://huggingface.co/datasets/ninjalabo/imagenette2-320

fatal: destination path 'imagenette2-320' already exists and is not an empty directory.


In [131]:
# Load the Imagenette dataset: items are resized with Resize(224) and batches
# are normalized with the ImageNet statistics.
path = untar_data(URLs.IMAGENETTE_320,data=Path.cwd()/'data')
dls = ImageDataLoaders.from_folder(path, valid='val', item_tfms=Resize(224),
                                   batch_tfms=Normalize.from_stats(*imagenet_stats),)

# Root of the test set inside the cloned Hugging Face repository
test_root = os.path.join("imagenette2-320", "test")

# Initialize counters to track saved images per class.
# Keys are the label indices (as strings), one per class in the vocabulary.
saved_counts = {str(i): 0 for i in range(len(dls.vocab))}

for imgs, labels in dls.valid:
    for img, label in zip(imgs, labels):
        label = str(label.item())  # Convert label to string
        if saved_counts[label] < max_images_per_class:
            dst_dir = os.path.join(test_root, label)
            # The class directory may not exist yet (fresh clone / new class)
            os.makedirs(dst_dir, exist_ok=True)
            file_path = os.path.join(dst_dir, f'{saved_counts[label]}.bin')
            with open(file_path, "wb") as f:
                serialize_fp32(f, img)
            saved_counts[label] += 1
    # Stop once every class has max_images_per_class images
    if all(count >= max_images_per_class for count in saved_counts.values()):
        break

## Test dataset directory structure

In [132]:
!tree -d data/imagenette2-320/test
!ls -al data//imagenette2-320/test/[2,6]/[3,7].bin

[01;34mdata/imagenette2-320/test[0m
├── [01;34m0[0m
├── [01;34m1[0m
├── [01;34m2[0m
├── [01;34m3[0m
├── [01;34m4[0m
├── [01;34m5[0m
├── [01;34m6[0m
├── [01;34m7[0m
├── [01;34m8[0m
└── [01;34m9[0m

11 directories
-rw-r--r--  1 harukadoyu  staff  602112 May 16 13:57 data//imagenette2-320/test/2/3.bin
-rw-r--r--  1 harukadoyu  staff  602112 May 16 13:58 data//imagenette2-320/test/2/7.bin
-rw-r--r--  1 harukadoyu  staff  602112 May 16 13:35 data//imagenette2-320/test/6/3.bin
-rw-r--r--  1 harukadoyu  staff  602112 May 16 13:36 data//imagenette2-320/test/6/7.bin


## Testing

In [139]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

from train import load

class TensorDataset(Dataset):
    """Dataset of raw float32 tensors stored as ``<root_dir>/<label>/<name>.bin``.

    Each ``.bin`` file holds exactly ``prod(shape)`` float32 values in native
    byte order. The name of the parent directory is the integer class label.

    Args:
        root_dir: Directory containing one sub-directory per class label.
        shape: Shape each file is viewed as. Defaults to ``(3, 224, 224)``.
    """

    def __init__(self, root_dir, shape=(3, 224, 224)):
        self.root_dir = root_dir
        self.shape = tuple(shape)
        # Number of float32 values expected in every file
        self._numel = 1
        for dim in self.shape:
            self._numel *= dim
        # Keep only sub-directories named after an integer label, so stray
        # entries (e.g. ``.DS_Store`` or ``.gitattributes``) do not crash loading.
        self.classes = sorted(
            name for name in os.listdir(root_dir)
            if name.isdecimal() and os.path.isdir(os.path.join(root_dir, name))
        )
        self.file_paths = []
        self.labels = []
        for label in self.classes:
            label_dir = os.path.join(root_dir, label)
            # Sort so the sample order is reproducible across file systems
            for file in sorted(os.listdir(label_dir)):
                if not file.endswith(".bin"):
                    continue
                self.file_paths.append(os.path.join(label_dir, file))
                self.labels.append(int(label))

    def __len__(self):
        """Return the number of ``.bin`` files found under ``root_dir``."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Return ``(tensor, label)`` for the file at position ``idx``.

        Raises:
            ValueError: If the file size does not match ``shape``.
        """
        file_path = self.file_paths[idx]
        with open(file_path, "rb") as f:
            # bytearray gives torch.frombuffer a writable buffer (avoids a warning)
            raw = bytearray(f.read())
        expected_bytes = 4 * self._numel  # float32 is 4 bytes
        if len(raw) != expected_bytes:
            raise ValueError(
                f"{file_path}: expected {expected_bytes} bytes for shape "
                f"{self.shape}, got {len(raw)}"
            )
        # Reinterpret the bytes directly instead of unpacking into a Python tuple
        tensor = torch.frombuffer(raw, dtype=torch.float32).view(self.shape)
        label = self.labels[idx]
        return tensor, label

In [135]:
learn = load("resnet18")
# Build the test loader before it is referenced below; the previous order only
# worked when `test_dl` was left over from an earlier run of this cell.
test_dl = DataLoader(TensorDataset("imagenette2-320/test/"), batch_size=32, num_workers=10)
# The first (training) loader is an empty placeholder; only test_dl is used here
learn.dls = DataLoaders(DataLoader([]), test_dl)

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

In [136]:
%%time
# Move the model to the CPU, then evaluate it on the generated test set
learn.model.cpu()
# Last expression of the cell: its return value is shown as the cell output
learn.validate(dl=test_dl)

CPU times: user 61.5 ms, sys: 100 ms, total: 162 ms
Wall time: 866 ms


(#2) [0.07589495182037354,0.9700000286102295]

## Upload updated dataset to HuggingFace

In [138]:
# Uncomment this to update the test data set.
# Commands are chained with `&&` so each step runs inside the repository and only if the previous one succeeded.
#!cd imagenette2-320/ && git add test && git commit -m "update test set" && git push