## Load the Dataset Locally with Hugging Face Datasets 🤗

In [None]:
from datasets import load_dataset, Image, DatasetDict

# Load the metadata CSV file (one row per image)
dataset = load_dataset("csv", data_files="metadata.csv")

# Define the mapping of class names to indices (necessary for stratified
# splitting). The keys are also the names of the per-class columns that are
# read from the CSV further down.
columns = {
    "bicycle": 0,
    "bus": 1,
    "car": 2,
    "crosswalk": 3,
    "hydrant": 4,
}

# Add the primary class label of each image (necessary for stratified
# splitting). The class name is taken from the third "/"-separated component
# of the image path and raises KeyError if it is not a key of `columns`.
# NOTE(review): assumes paths look like "<a>/<b>/<class name>/..." - confirm
# against metadata.csv.
dataset = dataset.map(
    function=lambda x: {"class_idx": columns[x["image"].split("/")[2]]},
)
# Encode the column as a class-label feature so it can be used for
# stratification
dataset = dataset.class_encode_column("class_idx")

# Add all class labels of each image: collect the per-class columns into one
# "labels" list, ordered like `columns`, and drop the individual columns.
# `remove_columns` is documented to take a str or a list of str, so pass the
# column names as an explicit list rather than the dict itself.
dataset = dataset.map(
    function=lambda x: {"labels": [x[c] for c in columns]},
    remove_columns=list(columns),
)

# Cast the "image" column (file paths) to the Image feature so that the
# images are loaded when an example is accessed
dataset = dataset.cast_column("image", Image())

# Settings shared by both splits: a fixed seed and stratification on the
# primary class label
split_options = {"seed": 0, "stratify_by_column": "class_idx"}

# First split: hold out 20% of the complete dataset for validation and test
dataset = dataset["train"].train_test_split(test_size=0.2, **split_options)
train_dataset = dataset["train"]
val_test_dataset = dataset["test"]

# Second split: divide the held-out part evenly into validation and test
dataset = val_test_dataset.train_test_split(test_size=0.5, **split_options)
val_dataset = dataset["train"]
test_dataset = dataset["test"]

# Combine the three datasets into a single dictionary and remove the primary
# class label, which was only needed for stratification
splits = {
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
}
dataset = DatasetDict(splits).remove_columns(["class_idx"])

In [None]:
# Display the first training image and its labels. The row is fetched once so
# that the example (including its image) is not loaded twice.
first_example = dataset["train"][0]
first_example["image"], first_example["labels"]