In [1]:
# !pip install fiftyone

In [2]:
import os
import re
import shutil
from pathlib import Path

import fiftyone as fo
from tqdm import tqdm

# Convert Stanford Dogs Dataset

In [3]:
# Root of the raw Stanford Dogs download; later cells read its 'images' and
# 'annotations' sub-directories.
stanford_root = Path('../raw_data/stanford-dogs-dataset/')
# Scratch directory that receives the flattened 'images' and 'labels' copies.
# It is deleted again by the final clean-up cell.
target_dataset_path = Path('../data/stanford-dogs-dataset/')

In [4]:
print('Copy images')
src_images_dir = stanford_root / 'images'
target_images_dir = target_dataset_path / 'images'

target_images_dir.mkdir(parents=True, exist_ok=True)

# Flatten the source tree: every *.jpg found anywhere below the source
# directory is copied straight into the single target images directory,
# keeping only its file name.
for image_path in tqdm(src_images_dir.rglob('*.jpg')):
    shutil.copy(image_path, target_images_dir / image_path.name)

Copy images


20580it [00:06, 3050.34it/s]


In [5]:
print('Copy labels')
target_labels_dir = target_dataset_path / 'labels'
src_labels_dir = stanford_root / 'annotations'

# An annotation is identified by its base name: "n", eight digits, "_", digits.
pattern = "n[0-9]{8}_[0-9]+"

os.makedirs(target_labels_dir, exist_ok=True)

for temp_path in tqdm(src_labels_dir.rglob('*')):
    # rglob('*') also yields directories; only regular files can be copied.
    # Accept extension-less files and files that already end in '.xml', so the
    # copy always ends up with exactly one '.xml' suffix.
    if not temp_path.is_file() or temp_path.suffix not in ('', '.xml'):
        continue
    # fullmatch: the whole base name must fit the pattern, not just a prefix.
    if re.fullmatch(pattern, temp_path.stem):
        dst_file_path = target_labels_dir / (temp_path.stem + '.xml')
        shutil.copy(temp_path, dst_file_path)

Copy labels


20703it [00:04, 4738.98it/s]


In [6]:
# Load PascalVOC dataset: images are read from the flattened image directory
# and the VOC XML files from the flattened label directory. The parsed labels
# are stored on each sample under the "detections" field.
# (`fo` is imported in the notebook's import cell.)
stanford_dogs_dataset = fo.Dataset.from_dir(
    dataset_type=fo.types.VOCDetectionDataset,
    labels_path=target_labels_dir,
    data_path=target_images_dir,
    label_field="detections",
)

 100% |█████████████| 20580/20580 [14.1s elapsed, 0s remaining, 1.5K samples/s]      


In [7]:
stanford_labels = []

# Collect the label of the first detection of every sample.
# Samples without a detections field value, or with an empty detection list,
# are skipped instead of raising AttributeError / IndexError.
for sample in stanford_dogs_dataset.iter_samples():
    sample_detections = sample.detections
    if sample_detections is not None and sample_detections.detections:
        stanford_labels.append(sample_detections.detections[0].label)

In [8]:
# De-duplicate the collected Stanford labels, then show the set and its size
# on two separate lines.
stanford_unique_labels = set(stanford_labels)
print(stanford_unique_labels, len(stanford_unique_labels), sep='\n')

{'Tibetan_mastiff', 'Chesapeake_Bay_retriever', 'Airedale', 'Mexican_hairless', 'Labrador_retriever', 'bloodhound', 'Norfolk_terrier', 'kelpie', 'Siberian_husky', 'Border_terrier', 'silky_terrier', 'dhole', 'Chihuahua', 'pug', 'Irish_wolfhound', 'Yorkshire_terrier', 'vizsla', 'beagle', 'West_Highland_white_terrier', 'Lakeland_terrier', 'redbone', 'otterhound', 'Australian_terrier', 'soft-coated_wheaten_terrier', 'standard_poodle', 'Boston_bull', 'Irish_terrier', 'American_Staffordshire_terrier', 'Newfoundland', 'Brabancon_griffon', 'Eskimo_dog', 'miniature_schnauzer', 'keeshond', 'Afghan_hound', 'Maltese_dog', 'Leonberg', 'toy_terrier', 'toy_poodle', 'Dandie_Dinmont', 'Appenzeller', 'Old_English_sheepdog', 'Rottweiler', 'Saint_Bernard', 'Saluki', 'Welsh_springer_spaniel', 'komondor', 'Weimaraner', 'malamute', 'Kerry_blue_terrier', 'cocker_spaniel', 'bull_mastiff', 'malinois', 'curly-coated_retriever', 'bluetick', 'Pomeranian', 'basenji', 'Bernese_mountain_dog', 'golden_retriever', 'aff

# Convert computer-science-gtrnn/dog-breeds

In [9]:
# Load the COCO-format training split: images from the split directory,
# annotations from the JSON file stored alongside them.
roboflow_dataset = fo.Dataset.from_dir(
    data_path='../raw_data/big-dog-breeds-ds/train/',
    labels_path='../raw_data/big-dog-breeds-ds/train/_annotations.coco.json',
    dataset_type=fo.types.COCODetectionDataset,
)

 100% |███████████████| 9949/9949 [6.2s elapsed, 0s remaining, 1.6K samples/s]      


In [10]:
roboflow_labels = []

# Collect the label of the first detection of every sample.
# A sample is skipped when it has no detections value or when its detection
# list is empty, so indexing [0] can never raise IndexError.
for sample in roboflow_dataset.iter_samples():
    sample_detections = sample.detections
    if sample_detections is not None and sample_detections.detections:
        roboflow_labels.append(sample_detections.detections[0].label)

In [11]:
# De-duplicate the collected Roboflow labels, then show the set and its size
# on two separate lines.
roboflow_unique_labels = set(roboflow_labels)
print(roboflow_unique_labels, len(roboflow_unique_labels), sep='\n')

{'Chesapeake Bay Retriever', 'Greyhound', 'Basenji', 'Dalmatian', 'Collie', 'saluki', 'pug', 'Chihuahua', 'Belgian Laekenois', 'Clamber spaniel', 'Flat Coated Retriever', 'Bull Terrier', 'Jack Russell', 'rhodesian ridgeback', 'portuguese water dog', 'Alaskan Malamute', 'Lhasa Apso', 'Australian Shepherd', 'Bouvier De Flandres', 'Border Collie', 'Afghan Hound', 'Cocker spaniel', 'Japanese Spitz', 'Golden Retriever', 'Cairn Terrier', 'Scottish Terrier', 'pyrenean mountain dog', 'American Cocker Spaniel', 'Iris Terrier', 'German Sheperd', 'Irish setter', 'Fox hound', 'Boston Terrier', 'Shih Tzu', 'Bullmanstiff', 'Bedlington Terrier', 'Deerhound', 'Cavalier King Charles Spaniel', 'Rottwelier', 'Bloodhound', 'Dachshund', 'English Springer Spaniel', 'Lowchen', 'Irish Water Spaniel', 'Yorkipoo', 'Airedale Terrier', 'Pomeranian', 'Basset Fauve De Bretagne', 'saint bernard', 'Hungarian Vizsla', 'Schipperke', 'Shetland sheepdog', 'Basset Hound', 'Schnauzer', 'Fox terrier', 'Bearded Collie', 'Bel

## Merge datasets and export

In [12]:
merged_dataset_name = "merged_dataset"

# Drop any dataset left over under this name from a previous run so the merge
# below starts from an empty dataset.
if fo.dataset_exists(merged_dataset_name):
    fo.delete_datasets(merged_dataset_name)
merged_dataset = fo.Dataset(merged_dataset_name)

# Merge order matters for reproducibility: Roboflow first, then Stanford Dogs.
for source_dataset in (roboflow_dataset, stanford_dogs_dataset):
    merged_dataset.merge_samples(source_dataset)

In [13]:
all_labels = []

# Collect the label of the first detection of every merged sample.
# NOTE(review): an alternative check on a `ground_truth` field used to be
# commented out here - confirm which label field the installed FiftyOne
# version populates for the COCO import.
# A sample is skipped when it has no detections value or when its detection
# list is empty, so indexing [0] can never raise IndexError.
for sample in merged_dataset.iter_samples():
    sample_detections = sample.detections
    if sample_detections is not None and sample_detections.detections:
        all_labels.append(sample_detections.detections[0].label)

In [14]:
# De-duplicate the labels of the merged dataset, then show the set and its
# size on two separate lines.
unique_labels = set(all_labels)
print(unique_labels, len(unique_labels), sep='\n')

{'Tibetan_mastiff', 'Chesapeake_Bay_retriever', 'Airedale', 'Chesapeake Bay Retriever', 'Mexican_hairless', 'Greyhound', 'Labrador_retriever', 'bloodhound', 'Norfolk_terrier', 'kelpie', 'Siberian_husky', 'Basenji', 'Dalmatian', 'Collie', 'Border_terrier', 'silky_terrier', 'dhole', 'saluki', 'pug', 'Chihuahua', 'Irish_wolfhound', 'Yorkshire_terrier', 'Belgian Laekenois', 'vizsla', 'Clamber spaniel', 'beagle', 'West_Highland_white_terrier', 'Flat Coated Retriever', 'Bull Terrier', 'Jack Russell', 'rhodesian ridgeback', 'Lakeland_terrier', 'portuguese water dog', 'Alaskan Malamute', 'Lhasa Apso', 'redbone', 'otterhound', 'Australian Shepherd', 'Australian_terrier', 'Bouvier De Flandres', 'Border Collie', 'Afghan Hound', 'Cocker spaniel', 'Japanese Spitz', 'Golden Retriever', 'soft-coated_wheaten_terrier', 'standard_poodle', 'Cairn Terrier', 'Boston_bull', 'Irish_terrier', 'Scottish Terrier', 'pyrenean mountain dog', 'American Cocker Spaniel', 'Iris Terrier', 'American_Staffordshire_terrie

In [15]:
# Sanity check: no label from either source dataset was lost by the merge,
# i.e. each source label set is a subset of the merged label set.
assert stanford_unique_labels <= unique_labels
assert roboflow_unique_labels <= unique_labels

In [16]:
# Write the merged dataset to disk in COCO detection format.
# NOTE: per the recorded output of this cell, when the export directory
# already exists the export is merged with the files already there.
export_dir = "../data/opensource-dataset"
merged_dataset.export(export_dir=export_dir, dataset_type=fo.types.COCODetectionDataset)

Directory '../data/opensource-dataset' already exists; export will be merged with existing files
 100% |█████████████| 30529/30529 [21.0s elapsed, 0s remaining, 1.4K samples/s]       


## Clean space

In [17]:
print('Remove temp dirs')

# The labels directory sits directly inside the temporary Stanford copy, so
# its parent is the whole scratch directory (flattened images and labels).
temp_dataset_dir = target_labels_dir.parent
shutil.rmtree(temp_dataset_dir)

Remove temp dirs
