# Upload a dataset to the Hugging Face Hub for InsectSAM training

In [1]:
from datasets import Dataset, DatasetDict, Features, Image, load_dataset
import os
from PIL import Image as PILImage
import io

# Root directory of the local dataset. The script reads '.png' files from its
# 'image' and 'label' subdirectories.
# NOTE: this is a machine-specific absolute path; adjust it before running
# on another machine.
data_dir = "/Users/martintomov/Desktop/dataset"

# Helper function to encode image files as RGB
def encode_image(image_path):
    """Return the image at ``image_path`` re-encoded as RGB PNG bytes.

    The file is opened in binary mode, decoded with Pillow, converted to
    RGB mode, and written into an in-memory buffer as PNG.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The PNG-encoded bytes of the RGB-converted image.
    """
    png_buffer = io.BytesIO()
    with open(image_path, 'rb') as handle:
        # Convert and save while the file is still open: Pillow decodes
        # pixel data lazily from the underlying handle.
        rgb_image = PILImage.open(handle).convert("RGB")
        rgb_image.save(png_buffer, format='PNG')
    return png_buffer.getvalue()

# Helper function to encode label files without changing their color mode
def encode_label(label_path):
    """Return the label at ``label_path`` re-encoded as PNG bytes.

    No mode conversion is applied: the label is saved in whatever mode
    Pillow decoded it with.

    Args:
        label_path: Path of the label file to read.

    Returns:
        The PNG-encoded bytes of the label image.
    """
    png_buffer = io.BytesIO()
    with open(label_path, 'rb') as handle:
        # Save while the file is still open: Pillow decodes pixel data
        # lazily from the underlying handle.
        PILImage.open(handle).save(png_buffer, format='PNG')
    return png_buffer.getvalue()

# Collect the PNG files of both subdirectories in sorted order so that the
# i-th image is paired with the i-th label.
# NOTE(review): pairing relies purely on sort order; it assumes image and
# label files sort identically (e.g. share file names) -- confirm.
image_dir = os.path.join(data_dir, 'image')
label_dir = os.path.join(data_dir, 'label')
image_files = sorted(os.path.join(image_dir, file) for file in os.listdir(image_dir) if file.endswith('.png'))
label_files = sorted(os.path.join(label_dir, file) for file in os.listdir(label_dir) if file.endswith('.png'))

# Validate with explicit raises rather than `assert`: assertions are removed
# under `python -O`, and zip() below would then silently drop unpaired files.
if len(image_files) != len(label_files):
    raise ValueError("The number of images and labels should be the same")
if not image_files:
    # Refuse to build and upload an empty dataset.
    raise FileNotFoundError(f"No .png files found in {image_dir}")

# Encode every (image, label) pair as PNG bytes.
data = []
for image_path, label_path in zip(image_files, label_files):
    data.append({
        'image': encode_image(image_path),
        'label': encode_label(label_path)
    })

# Both columns hold encoded image bytes and are declared as Image features.
features = Features({'image': Image(), 'label': Image()})

# Create a Dataset object
dataset = Dataset.from_dict(
    {
        'image': [item['image'] for item in data],
        'label': [item['label'] for item in data],
    },
    features=features,
)

# Convert to a DatasetDict
dataset = DatasetDict({'train': dataset})

# Push the dataset to the Hugging Face Hub. No authentication happens here;
# credentials must already be configured in the environment.
dataset.push_to_hub("martintmv/rb-ibdm")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/733 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/313 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/martintmv/rb-ibdm/commit/63c596331d34f96707ff34d347ff8c237b210e1e', commit_message='Upload dataset', commit_description='', oid='63c596331d34f96707ff34d347ff8c237b210e1e', pr_url=None, pr_revision=None, pr_num=None)