In [1]:
# !pip install fiftyone

In [2]:
import os
import re
import shutil
import fiftyone as fo
from pathlib import Path

from tqdm import tqdm

# Convert Stanford Dogs Dataset

In [3]:
stanford_root = Path('../raw_data/stanford-dogs-dataset/')
target_dataset_path = Path('../data/stanford-dogs-dataset/')

In [4]:
print('Copy images')
target_images_dir = target_dataset_path / 'images'
src_images_dir = stanford_root / 'images'

os.makedirs(target_images_dir, exist_ok=True)

for temp_path in tqdm(src_images_dir.rglob('*.jpg')):
    dst_file_path = target_images_dir / temp_path.name
    shutil.copy(temp_path, dst_file_path)

Copy images


20580it [00:06, 3294.24it/s]


In [5]:
print('Copy labels')
target_labels_dir = target_dataset_path / 'labels'
src_labels_dir = stanford_root / 'annotations'

pattern = "n[0-9]{8}_[0-9]+"

os.makedirs(target_labels_dir, exist_ok=True)

for temp_path in tqdm(src_labels_dir.rglob('*')):
    if temp_path.name.startswith('n') and re.match(pattern, temp_path.name):
        dst_file_path = target_labels_dir / (temp_path.name + '.xml')
        shutil.copy(temp_path, dst_file_path)

Copy labels


20703it [00:04, 4916.86it/s]


In [6]:
# Load PascalVOC dataset
stanford_dogs_dataset = fo.Dataset.from_dir(
    dataset_type=fo.types.VOCDetectionDataset,
    labels_path=target_labels_dir,
    data_path=target_images_dir,
    label_field="detections",
)

 100% |█████████████| 20580/20580 [14.1s elapsed, 0s remaining, 1.5K samples/s]      


In [7]:
stanford_labels = []

# Iterate through samples to collect the labels
for sample in stanford_dogs_dataset:
    stanford_labels.append(sample.detections.detections[0].label)

In [8]:
stanford_unique_labels = set(stanford_labels)
print(stanford_unique_labels)
print(len(stanford_unique_labels))

{'keeshond', 'bloodhound', 'curly-coated_retriever', 'Sussex_spaniel', 'chow', 'Kerry_blue_terrier', 'Boston_bull', 'miniature_poodle', 'Gordon_setter', 'Pomeranian', 'Pembroke', 'Norwegian_elkhound', 'malinois', 'dhole', 'boxer', 'whippet', 'Walker_hound', 'Bernese_mountain_dog', 'vizsla', 'flat-coated_retriever', 'Irish_terrier', 'bull_mastiff', 'Great_Dane', 'beagle', 'pug', 'golden_retriever', 'Chihuahua', 'Irish_water_spaniel', 'toy_poodle', 'standard_schnauzer', 'bluetick', 'Rhodesian_ridgeback', 'Bedlington_terrier', 'giant_schnauzer', 'German_short-haired_pointer', 'English_springer', 'Norwich_terrier', 'basenji', 'silky_terrier', 'Yorkshire_terrier', 'Leonberg', 'cairn', 'collie', 'black-and-tan_coonhound', 'Samoyed', 'Doberman', 'Great_Pyrenees', 'miniature_pinscher', 'kelpie', 'Shetland_sheepdog', 'kuvasz', 'Eskimo_dog', 'Greater_Swiss_Mountain_dog', 'English_setter', 'briard', 'Airedale', 'soft-coated_wheaten_terrier', 'dingo', 'borzoi', 'Irish_wolfhound', 'basset', 'papill

# Convert computer-science-gtrnn/dog-breeds

In [9]:
roboflow_dataset = fo.Dataset.from_dir(
    dataset_type=fo.types.COCODetectionDataset,
    labels_path='../raw_data/big-dog-breeds-ds/train/_annotations.coco.json',
    data_path='../raw_data/big-dog-breeds-ds/train/',
)

 100% |███████████████| 9949/9949 [6.2s elapsed, 0s remaining, 1.6K samples/s]      


In [10]:
roboflow_labels = []

# Iterate through samples to collect the labels
for sample in roboflow_dataset.iter_samples():
    if sample.detections:
        roboflow_labels.append(sample.detections.detections[0].label)

In [11]:
roboflow_unique_labels = set(roboflow_labels)
print(roboflow_unique_labels)
print(len(roboflow_unique_labels))

{'English Springer Spaniel', 'American Cocker Spaniel', 'Brittany', 'Shih Tzu', 'Australian Silky Terrier', 'Irish Wolfhound', 'Collie', 'Italian Greyhound', 'Iris Terrier', 'Golden Retriever', 'Pomeranian', 'Clamber spaniel', 'Bullmanstiff', 'Curly coated retriever', 'Rottwelier', 'Irish setter', 'Basset Fauve De Bretagne', 'Australian Cattle', 'Australian Terrier', 'French bulldog', 'rhodesian ridgeback', 'pug', 'Chinese Crested', 'puli', 'Chihuahua', 'saint bernard', 'Schipperke', 'Bedlington Terrier', 'Gordon Setter', 'Sealyham terrier', 'english toy terrier', 'Boston Terrier', 'German Wirehaired Pointer', 'Griffon Bruxellois', 'Afghan Hound', 'Samoyed', 'Bichon Frise', 'English Setter', 'Dandie Dinmont Terrier', 'Belgian Laekenois', 'Bull Terrier', 'Dalmatian', 'saluki', 'Bloodhound', 'Texas heeler', 'Shetland sheepdog', 'finnish spitz', 'Boxer', 'German Short-Haired Pointer', 'Dachshund', 'Greyhound', 'Affenpinscher', 'Borzoi', 'Schnauzer', 'Basenji', 'Cairn Terrier', 'Bearded Co

## Merge datasets and export

In [12]:
merged_dataset_name = "merged_dataset"

if fo.dataset_exists(merged_dataset_name):
    fo.delete_datasets(merged_dataset_name)

merged_dataset = fo.Dataset(merged_dataset_name)

merged_dataset.merge_samples(roboflow_dataset)
merged_dataset.merge_samples(stanford_dogs_dataset)

In [13]:
all_labels = []

# Iterate through samples to collect the labels
for sample in merged_dataset.iter_samples():
    # if sample.ground_truth:
    #     all_labels.append(sample.ground_truth.detections[0].label)
    if sample.detections:
        all_labels.append(sample.detections.detections[0].label)

In [14]:
unique_labels = set(all_labels)
print(unique_labels)
print(len(unique_labels))

{'keeshond', 'bloodhound', 'curly-coated_retriever', 'English Springer Spaniel', 'American Cocker Spaniel', 'Brittany', 'Shih Tzu', 'Australian Silky Terrier', 'Sussex_spaniel', 'chow', 'Irish Wolfhound', 'Collie', 'Italian Greyhound', 'Kerry_blue_terrier', 'Iris Terrier', 'Boston_bull', 'miniature_poodle', 'Gordon_setter', 'Golden Retriever', 'Pomeranian', 'Clamber spaniel', 'Norwegian_elkhound', 'malinois', 'Pembroke', 'Bullmanstiff', 'dhole', 'Curly coated retriever', 'boxer', 'Rottwelier', 'Irish setter', 'Basset Fauve De Bretagne', 'whippet', 'Australian Cattle', 'Walker_hound', 'Australian Terrier', 'French bulldog', 'Bernese_mountain_dog', 'rhodesian ridgeback', 'flat-coated_retriever', 'vizsla', 'pug', 'Irish_terrier', 'Chinese Crested', 'puli', 'beagle', 'bull_mastiff', 'golden_retriever', 'Great_Dane', 'Chihuahua', 'Irish_water_spaniel', 'toy_poodle', 'saint bernard', 'standard_schnauzer', 'Schipperke', 'Bedlington Terrier', 'Gordon Setter', 'bluetick', 'Sealyham terrier', 'e

In [15]:
assert len(stanford_unique_labels.intersection(unique_labels)) == len(stanford_unique_labels)
assert len(roboflow_unique_labels.intersection(unique_labels)) == len(roboflow_unique_labels)

## Merge datasets with scrapped data

In [16]:
scrapped_dataset_name = 'scrapped_dataset'

if fo.dataset_exists(scrapped_dataset_name):
    fo.delete_datasets(scrapped_dataset_name)

scrapped_dataset = merged_dataset.clone(scrapped_dataset_name)
scrapped_dataset.clear()

In [17]:
scrapped_images_dir = Path('../data/raw_scrapped_data/')

In [18]:
for img_path in scrapped_images_dir.iterdir():
    sample = fo.Sample(img_path)
    scrapped_dataset.add_sample(sample)

In [19]:
merged_dataset.merge_samples(scrapped_dataset)

In [20]:
merged_dataset.export(
    export_dir="../data/opensource-dataset",
    dataset_type=fo.types.COCODetectionDataset,
)

Directory '../data/opensource-dataset' already exists; export will be merged with existing files
 100% |█████████████| 33995/33995 [25.7s elapsed, 0s remaining, 599.8 samples/s]      


## Clean space

In [21]:
print('Remove temp dirs')

shutil.rmtree(target_labels_dir.parent)

Remove temp dirs
