Construct a high quality, small dataset from iCubWorld Transformation

Receipe
- 16 images; 3K images for hold-out evaluation; 13K for training+testing
- Balanced dataset across all categories (and almost balanced across all objects)
- 20 categories -> $16000 / 20 = 800$ images per category of objects

- There are at 150 images for each category which the bbox is marked by human -> the cropped image has little background noise
- Remaining 650 images will be selected from each object (a total of 10 objects in each category) -> 65 images per object w/in the category.
- There should be no duplicate between each images

In [1]:
from pathlib import Path
import cv2

# images_dir = Path('dataset/icub/Images')
images_cropped_dir = Path('dataset/icub/ImagesCropped')
annotations_dir = Path('dataset/icub/Annotations_refined')
manual_subset_dir = Path('dataset/icub/Images_subset_manual')
manual_annotations_dir = Path('dataset/icub/Annotations_manual')

In [2]:
import os

# all categories
cats = os.listdir(annotations_dir)
cats = [cat for cat in cats if os.path.isdir(annotations_dir / cat)]
cats, len(cats)

(['squeezer',
  'book',
  'sprayer',
  'soapdispenser',
  'ringbinder',
  'perfume',
  'remote',
  'glass',
  'sunglasses',
  'mug',
  'sodabottle',
  'mouse',
  'cellphone',
  'pencilcase',
  'hairbrush',
  'wallet',
  'flower',
  'ovenglove',
  'bodylotion',
  'hairclip'],
 20)

Build the dataset from the ground up

In [3]:
import random

random.seed(20250425)

# first go to the manual subset
# and select the directories to be used

selected_img_manual_dirs = {}

for cat in cats:
    cat_dir = manual_subset_dir / cat
    if not os.path.exists(cat_dir):
        continue

    obj_dirs = os.listdir(cat_dir)
    # randomly select one dir from obj_dirs
    obj_dir = random.choice(obj_dirs)

    sub_dir_1 = os.listdir(cat_dir/obj_dir)[0]
    sub_dir_2 = os.listdir(cat_dir/obj_dir/sub_dir_1)[0]
    
    selected_img_manual_dirs[cat] = Path(cat).joinpath(obj_dir, sub_dir_1,sub_dir_2)

selected_img_manual_dirs, len(selected_img_manual_dirs)
     

({'book': PosixPath('book/book1/MIX/day5'),
  'sprayer': PosixPath('sprayer/sprayer2/MIX/day1'),
  'ringbinder': PosixPath('ringbinder/ringbinder5/MIX/day3'),
  'mug': PosixPath('mug/mug1/MIX/day5'),
  'sodabottle': PosixPath('sodabottle/sodabottle3/MIX/day1'),
  'pencilcase': PosixPath('pencilcase/pencilcase5/MIX/day5'),
  'wallet': PosixPath('wallet/wallet7/MIX/day3'),
  'flower': PosixPath('flower/flower9/MIX/day7'),
  'bodylotion': PosixPath('bodylotion/bodylotion5/MIX/day1'),
  'hairclip': PosixPath('hairclip/hairclip2/MIX/day7')},
 10)

In [4]:
images_manual_dict = {}

for cat in selected_img_manual_dirs:
    images_manual_dict[cat] = set()
    obj_dir = selected_img_manual_dirs[cat]
    images = list((manual_subset_dir.joinpath(obj_dir, 'left')).glob('*.jpg'))
    images_manual_dict[cat].update([img.name for img in images])


In [5]:
images_manual_dict

{'book': {'00005935.jpg',
  '00005936.jpg',
  '00005937.jpg',
  '00005938.jpg',
  '00005939.jpg',
  '00005940.jpg',
  '00005941.jpg',
  '00005942.jpg',
  '00005943.jpg',
  '00005944.jpg',
  '00005945.jpg',
  '00005946.jpg',
  '00005947.jpg',
  '00005948.jpg',
  '00005949.jpg',
  '00005950.jpg',
  '00005951.jpg',
  '00005952.jpg',
  '00005953.jpg',
  '00005954.jpg',
  '00005955.jpg',
  '00005956.jpg',
  '00005957.jpg',
  '00005958.jpg',
  '00005959.jpg',
  '00005960.jpg',
  '00005961.jpg',
  '00005962.jpg',
  '00005963.jpg',
  '00005964.jpg',
  '00005965.jpg',
  '00005966.jpg',
  '00005967.jpg',
  '00005968.jpg',
  '00005969.jpg',
  '00005970.jpg',
  '00005971.jpg',
  '00005972.jpg',
  '00005973.jpg',
  '00005974.jpg',
  '00005975.jpg',
  '00005976.jpg',
  '00005977.jpg',
  '00005978.jpg',
  '00005979.jpg',
  '00005980.jpg',
  '00005981.jpg',
  '00005982.jpg',
  '00005983.jpg',
  '00005984.jpg',
  '00005985.jpg',
  '00005986.jpg',
  '00005987.jpg',
  '00005988.jpg',
  '00005989.jpg',
  

Build a set including the images that should not exist in the images_dict for cropped images

remove images that appeared in the manual annotation set from the image_dict

In [6]:
from lxml import etree

images_dict = {}

for cat in cats:
    cat_dir = images_cropped_dir / cat

    selected_imgs_paths = []

    obj_dirs = sorted(os.listdir(cat_dir))

    images_dict[cat] = {}
    images_dict[cat]['obj_dirs'] = obj_dirs


for cat in cats:
    cat_dict = {}

    for sub_cat in images_dict[cat]['obj_dirs']:
        subcat_dict = {}

        days_dir = [x for x in Path.iterdir(Path(images_cropped_dir, cat, sub_cat, 'MIX')) if x.is_dir()]

        images_in_subcat = []
        annotations_in_subcat = []

        for day_dir in days_dir:
            # get all images' paths and find all files end with .jpg
            images = list(Path.iterdir(Path(day_dir, 'left')))
            images = [x for x in images if x.name.endswith('.jpg')]

            # get all annotations (xml file paths)
            annotations = sorted(Path.iterdir(Path(annotations_dir, cat, sub_cat, 'MIX', day_dir.name, 'left')))

            annotations_to_be_removed = []

            # check every annotations to find bboxes with negative values
            # remove the annotation and the corresponding image if there is a negative value
            for i, annotation in enumerate(annotations):
                tree = etree.parse(str(annotation))
                root = tree.getroot()
                for obj in root.iter('object'):
                    bndbox = obj.find('bndbox')

                    x_min = int(bndbox.find('xmin').text)
                    y_min = int(bndbox.find('ymin').text)
                    x_max = int(bndbox.find('xmax').text)
                    y_max = int(bndbox.find('ymax').text)

                    # check if the bbox is valid
                    if ((x_min < 0) or (y_min < 0) or (x_max < 0) or (y_max < 0)):
                        # remove the annotation from the list
                        annotations_to_be_removed.append(annotation)

            # remove the annotations from the list
            for annotation in annotations_to_be_removed:
                print(f"Removing {annotation} from annotations")
                annotations.remove(annotation)
                # remove the corresponding image from the list
                image_name = annotation.name[:-4] + '.jpg'
                image_path = Path.joinpath(images[0].parent, image_name)
                if image_path in images:
                    print(f"Removing {image_path} from images")
                    images.remove(image_path)

            # check if the images are in the excluded images list
            if cat in selected_img_manual_dirs.keys() and sub_cat == selected_img_manual_dirs[cat].parts[1] and \
                day_dir.name == selected_img_manual_dirs[cat].parts[3]:
                # remove the images from the list
                images = [x for x in images if x.name not in images_manual_dict[cat]]

            # if both have equal number of files -> normal
            # else: compare the two lists and find the difference
            if len(images) != len(annotations):

                image_set = set([x.name[:-4] for x in images])
                annotation_set = set([x.name[:-4] for x in annotations])
                diff = image_set ^ annotation_set

                # for every difference entry, check whether it is missin in image or annotation set
                for entry in diff:
                    if entry in image_set:      # there is an image but no annotation -> remove the image
                        # remove the image from the list
                        print(f"Removing {Path.joinpath(images[0].parent, entry + '.jpg')} from images")
                        images.remove(Path.joinpath(images[0].parent, entry + '.jpg'))

                    if entry in annotation_set:                       # there is an annotation but no image -> remove the annotation
                        # remove the annotation from the list
                        print(f"Removing {Path.joinpath(annotations[0].parent, entry + '.xml')} from annotations")
                        annotations.remove(Path.joinpath(annotations[0].parent, entry + '.xml'))

            assert len(images) == len(annotations), f"Number of images and annotations do not match in {cat}/{sub_cat}/{day_dir.name}. " + \
                        f"Number of images: {len(images)}, Number of annotations: {len(annotations)}. Difference in set: {set([x.name[:-4] for x in images]) ^ set([x.name[:-4] for x in annotations])}"
            
            # add the images and annotations to the list
            images_in_subcat += sorted(images)
            annotations_in_subcat += sorted(annotations)

        # add the images and annotations to the dictionary
        subcat_dict['images'] = images_in_subcat
        subcat_dict['annotations'] = annotations_in_subcat
        cat_dict[sub_cat] = subcat_dict
    images_dict[cat] = cat_dict


Removing dataset/icub/Annotations_refined/squeezer/squeezer10/MIX/day1/left/00007987.xml from annotations
Removing dataset/icub/ImagesCropped/squeezer/squeezer10/MIX/day1/left/00007987.jpg from images
Removing dataset/icub/Annotations_refined/squeezer/squeezer2/MIX/day1/left/00003882.xml from annotations
Removing dataset/icub/ImagesCropped/squeezer/squeezer2/MIX/day1/left/00003882.jpg from images
Removing dataset/icub/Annotations_refined/squeezer/squeezer3/MIX/day2/left/00002005.xml from annotations
Removing dataset/icub/ImagesCropped/squeezer/squeezer3/MIX/day2/left/00002005.jpg from images
Removing dataset/icub/Annotations_refined/squeezer/squeezer4/MIX/day1/left/00003292.xml from annotations
Removing dataset/icub/ImagesCropped/squeezer/squeezer4/MIX/day1/left/00003292.jpg from images
Removing dataset/icub/Annotations_refined/squeezer/squeezer5/MIX/day1/left/00003860.xml from annotations
Removing dataset/icub/ImagesCropped/squeezer/squeezer5/MIX/day1/left/00003860.jpg from images
Rem

In [7]:
images_dict['flower']['flower9']

{'images': [PosixPath('dataset/icub/ImagesCropped/flower/flower9/MIX/day8/left/00006168.jpg'),
  PosixPath('dataset/icub/ImagesCropped/flower/flower9/MIX/day8/left/00006169.jpg'),
  PosixPath('dataset/icub/ImagesCropped/flower/flower9/MIX/day8/left/00006170.jpg'),
  PosixPath('dataset/icub/ImagesCropped/flower/flower9/MIX/day8/left/00006171.jpg'),
  PosixPath('dataset/icub/ImagesCropped/flower/flower9/MIX/day8/left/00006172.jpg'),
  PosixPath('dataset/icub/ImagesCropped/flower/flower9/MIX/day8/left/00006173.jpg'),
  PosixPath('dataset/icub/ImagesCropped/flower/flower9/MIX/day8/left/00006174.jpg'),
  PosixPath('dataset/icub/ImagesCropped/flower/flower9/MIX/day8/left/00006175.jpg'),
  PosixPath('dataset/icub/ImagesCropped/flower/flower9/MIX/day8/left/00006176.jpg'),
  PosixPath('dataset/icub/ImagesCropped/flower/flower9/MIX/day8/left/00006177.jpg'),
  PosixPath('dataset/icub/ImagesCropped/flower/flower9/MIX/day8/left/00006178.jpg'),
  PosixPath('dataset/icub/ImagesCropped/flower/flower9/

Dataset creation time

if the category has high quality annotated images
- 150 from that object with high quality cropped images; then 65 from each 10 obj -> total 800 images

if the category has no high quality annotated images
- Randomly select 80 images from each object

In [8]:
LABEL_TO_CAT = {i:cat for i, cat in enumerate(sorted(images_dict.keys()))}
CAT_TO_LABEL = {cat:i for i, cat in enumerate(sorted(images_dict.keys()))}
print(LABEL_TO_CAT); print(); print()
print(CAT_TO_LABEL)


{0: 'bodylotion', 1: 'book', 2: 'cellphone', 3: 'flower', 4: 'glass', 5: 'hairbrush', 6: 'hairclip', 7: 'mouse', 8: 'mug', 9: 'ovenglove', 10: 'pencilcase', 11: 'perfume', 12: 'remote', 13: 'ringbinder', 14: 'soapdispenser', 15: 'sodabottle', 16: 'sprayer', 17: 'squeezer', 18: 'sunglasses', 19: 'wallet'}


{'bodylotion': 0, 'book': 1, 'cellphone': 2, 'flower': 3, 'glass': 4, 'hairbrush': 5, 'hairclip': 6, 'mouse': 7, 'mug': 8, 'ovenglove': 9, 'pencilcase': 10, 'perfume': 11, 'remote': 12, 'ringbinder': 13, 'soapdispenser': 14, 'sodabottle': 15, 'sprayer': 16, 'squeezer': 17, 'sunglasses': 18, 'wallet': 19}


In [9]:
import random
random.seed(20250425)       # for reproducibility

my_icub_dataset = {}

for i, cat in LABEL_TO_CAT.items():
    my_icub_dataset[cat] = {}
    my_icub_dataset[cat]['images'] = []
    my_icub_dataset[cat]['annotations_manual'] = []
    my_icub_dataset[cat]['labels'] = []

    if cat in selected_img_manual_dirs.keys():
        # get the path to that these images
        manual_dir = manual_subset_dir.joinpath(selected_img_manual_dirs[cat], 'left')
        annotations_manual_dir = manual_annotations_dir.joinpath(selected_img_manual_dirs[cat])
        my_icub_dataset[cat]['images'].extend([manual_dir.joinpath(img_name) for img_name in images_manual_dict[cat]])
        my_icub_dataset[cat]['annotations_manual'].extend([annotations_manual_dir.joinpath(img_name[:-4] + '.xml') for img_name in images_manual_dict[cat]])

        for sub_cat in images_dict[cat].keys():
            my_icub_dataset[cat]['images'] += random.sample(images_dict[cat][sub_cat]['images'], 65)

        my_icub_dataset[cat]['labels'] = [i] * len(my_icub_dataset[cat]['images'])

    else:
        for sub_cat in images_dict[cat].keys():
            my_icub_dataset[cat]['images'] += random.sample(images_dict[cat][sub_cat]['images'], 80)

        my_icub_dataset[cat]['labels'] = [i] * len(my_icub_dataset[cat]['images'])


In [10]:
print("total number of images in the dataset: ", sum([len(my_icub_dataset[cat]['images']) for cat in my_icub_dataset.keys()]))

total number of images in the dataset:  16000


In [11]:
len(my_icub_dataset['bodylotion']['images'])

800

Create directories and other stuff to store our cropped images

In [12]:
import shutil

my_icub_dataset_new_path = {}

dataset_dir = Path('dataset/icub_custom_subset')
if not os.path.exists(dataset_dir):
    os.makedirs(dataset_dir)

for cat in my_icub_dataset.keys():
    cat_dir = dataset_dir / cat
    if not os.path.exists(cat_dir):
        os.makedirs(cat_dir)

    my_icub_dataset_new_path[cat] = {}

for cat in my_icub_dataset.keys():
    cat_dir = dataset_dir / cat

    my_icub_dataset_new_path[cat]['images'] = []

    if cat in selected_img_manual_dirs.keys():
        for img_manual_path in my_icub_dataset[cat]['images'][:150]:
            annotations_dir = manual_annotations_dir.joinpath(Path(*img_manual_path.parts[3:-1]))
            annotation_path = annotations_dir.joinpath(img_manual_path.name[:-4] + '.xml')
            
            # read the annotation file and grab the bndbox
            # check if the annotation file exists
            if not os.path.exists(annotation_path):
                print(f"Annotation file {annotation_path} does not exist")
                continue

            # read the annotation file
            tree = etree.parse(str(annotation_path))
            root = tree.getroot()
            for obj in root.iter('object'):
                bndbox = obj.find('bndbox')

                x_min = int(bndbox.find('xmin').text)
                y_min = int(bndbox.find('ymin').text)
                x_max = int(bndbox.find('xmax').text)
                y_max = int(bndbox.find('ymax').text)

                # check if the bbox is valid
                if ((x_min < 0) or (y_min < 0) or (x_max < 0) or (y_max < 0)):
                    print(f"Invalid bbox in {annotation_path}")
                    continue

                # read the image and apply the bndbox
                img = cv2.imread(str(img_manual_path))
                img_cropped = img[y_min:y_max, x_min:x_max]

                # save the image# need to include the sub-dirs, as there seems images sharing the same name
                # structure: category/obj/day_N/
                # no need to include "left", as we only choose images from the left camera of the robot
                new_img_path_dir = cat_dir / img_manual_path.parts[4] / img_manual_path.parts[6]
                if not os.path.exists(new_img_path_dir): 
                    os.makedirs(new_img_path_dir)
                new_img_path = new_img_path_dir / img_manual_path.name
                cv2.imwrite(str(new_img_path), img_cropped)

                my_icub_dataset_new_path[cat]['images'].append(new_img_path)

        # the rest of the images
        # just copy the images to the new directory
        for img_path in my_icub_dataset[cat]['images'][150:]:
            img_name = img_path.name
            new_img_path_dir = cat_dir / img_path.parts[4] / img_path.parts[6]
            if not os.path.exists(new_img_path_dir): 
                os.makedirs(new_img_path_dir)
            new_img_path = new_img_path_dir / img_name
            shutil.copy2(img_path, new_img_path)

            my_icub_dataset_new_path[cat]['images'].append(new_img_path)
    else:
        # copy the images to the new directory
        for img_path in my_icub_dataset[cat]['images']:
            img_name = img_path.name
            new_img_path_dir = cat_dir / img_path.parts[4] / img_path.parts[6]
            if not os.path.exists(new_img_path_dir): 
                os.makedirs(new_img_path_dir)
            new_img_path = new_img_path_dir / img_name
            shutil.copy2(img_path, new_img_path)

            my_icub_dataset_new_path[cat]['images'].append(new_img_path)

print("Dataset created successfully!")

Dataset created successfully!


In [13]:
my_icub_dataset_new_path

{'bodylotion': {'images': [PosixPath('dataset/icub_custom_subset/bodylotion/bodylotion5/day1/00003844.jpg'),
   PosixPath('dataset/icub_custom_subset/bodylotion/bodylotion5/day1/00003771.jpg'),
   PosixPath('dataset/icub_custom_subset/bodylotion/bodylotion5/day1/00003775.jpg'),
   PosixPath('dataset/icub_custom_subset/bodylotion/bodylotion5/day1/00003777.jpg'),
   PosixPath('dataset/icub_custom_subset/bodylotion/bodylotion5/day1/00003810.jpg'),
   PosixPath('dataset/icub_custom_subset/bodylotion/bodylotion5/day1/00003795.jpg'),
   PosixPath('dataset/icub_custom_subset/bodylotion/bodylotion5/day1/00003817.jpg'),
   PosixPath('dataset/icub_custom_subset/bodylotion/bodylotion5/day1/00003845.jpg'),
   PosixPath('dataset/icub_custom_subset/bodylotion/bodylotion5/day1/00003789.jpg'),
   PosixPath('dataset/icub_custom_subset/bodylotion/bodylotion5/day1/00003858.jpg'),
   PosixPath('dataset/icub_custom_subset/bodylotion/bodylotion5/day1/00003776.jpg'),
   PosixPath('dataset/icub_custom_subset/

In [14]:
img_path.parts

('dataset',
 'icub',
 'ImagesCropped',
 'wallet',
 'wallet9',
 'MIX',
 'day4',
 'left',
 '00004538.jpg')

In [15]:
img_manual_path.parts

('dataset',
 'icub',
 'Images_subset_manual',
 'wallet',
 'wallet7',
 'MIX',
 'day3',
 'left',
 '00002302.jpg')

---

Pre-partition to train-test set and validation set

In [19]:
from copy import deepcopy

random.seed(20250425 + 235405)

train_test_set = []
train_test_set_labels = []
eval_set = []
eval_set_labels = []

for cat in my_icub_dataset_new_path.keys():
    cat_dir = dataset_dir / cat

    if cat in selected_img_manual_dirs.keys():
        # we need to select 28 images from the manual subset
        # and 122 images from the rest of the images
        # to form the eval set

        manual_images = deepcopy(my_icub_dataset_new_path[cat]['images'][:150])
        cropped_images = deepcopy(my_icub_dataset_new_path[cat]['images'][150:])
        # shuffle the images
        random.shuffle(manual_images)
        random.shuffle(cropped_images)

        # select 28 images from the manual subset to form the eval set
        eval_set += manual_images[:28]
        # select 122 images from the rest of the images to form the eval set
        eval_set += cropped_images[:122]

        # the rest of the images will be used for training
        train_test_set += manual_images[28:]
        train_test_set += cropped_images[122:]

        # the labels for the eval set
        eval_set_labels += [CAT_TO_LABEL[cat]] * 150
        # the labels for the train set
        train_test_set_labels += [CAT_TO_LABEL[cat]] * (len(my_icub_dataset_new_path[cat]['images']) - 150)
    else:
        # just randomly select 150 images from the dataset
        # and use them for eval set

        images = deepcopy(my_icub_dataset_new_path[cat]['images'])
        # shuffle the images
        random.shuffle(images)
        # select 150 images from the dataset to form the eval set
        eval_set += images[:150]
        # the rest of the images will be used for training
        train_test_set += images[150:]
        # the labels for the eval set
        eval_set_labels += [CAT_TO_LABEL[cat]] * 150
        # the labels for the train set
        train_test_set_labels += [CAT_TO_LABEL[cat]] * (len(my_icub_dataset_new_path[cat]['images']) - 150)
        
# check the number of images in the train and eval set
print("Number of images in the train set: ", len(train_test_set))
print("Number of images in the eval set: ", len(eval_set))
print("Number of images in the train set labels: ", len(train_test_set_labels))
print("Number of images in the eval set labels: ", len(eval_set_labels))

Number of images in the train set:  13000
Number of images in the eval set:  3000
Number of images in the train set labels:  13000
Number of images in the eval set labels:  3000


In [20]:
# save the train and eval set to a file
import pickle

train_test_set_path = dataset_dir / 'train_test_set.pkl'
eval_set_path = dataset_dir / 'eval_set.pkl'
with open(train_test_set_path, 'wb') as f:
    pickle.dump(train_test_set, f)
with open(eval_set_path, 'wb') as f:
    pickle.dump(eval_set, f)

# save the labels for the train and eval set to a file
train_test_set_labels_path = dataset_dir / 'train_test_set_labels.pkl'
eval_set_labels_path = dataset_dir / 'eval_set_labels.pkl'
with open(train_test_set_labels_path, 'wb') as f:
    pickle.dump(train_test_set_labels, f)
with open(eval_set_labels_path, 'wb') as f:
    pickle.dump(eval_set_labels, f)

In [21]:
train_test_set

[PosixPath('dataset/icub_custom_subset/bodylotion/bodylotion5/day1/00003761.jpg'),
 PosixPath('dataset/icub_custom_subset/bodylotion/bodylotion5/day1/00003827.jpg'),
 PosixPath('dataset/icub_custom_subset/bodylotion/bodylotion5/day1/00003785.jpg'),
 PosixPath('dataset/icub_custom_subset/bodylotion/bodylotion5/day1/00003835.jpg'),
 PosixPath('dataset/icub_custom_subset/bodylotion/bodylotion5/day1/00003778.jpg'),
 PosixPath('dataset/icub_custom_subset/bodylotion/bodylotion5/day1/00003814.jpg'),
 PosixPath('dataset/icub_custom_subset/bodylotion/bodylotion5/day1/00003886.jpg'),
 PosixPath('dataset/icub_custom_subset/bodylotion/bodylotion5/day1/00003852.jpg'),
 PosixPath('dataset/icub_custom_subset/bodylotion/bodylotion5/day1/00003845.jpg'),
 PosixPath('dataset/icub_custom_subset/bodylotion/bodylotion5/day1/00003782.jpg'),
 PosixPath('dataset/icub_custom_subset/bodylotion/bodylotion5/day1/00003824.jpg'),
 PosixPath('dataset/icub_custom_subset/bodylotion/bodylotion5/day1/00003806.jpg'),
 Pos