# Filter datasets

In this notebook example, we'll take a look at the Datumaro filter API.

In [1]:
# Copyright (C) 2022 Intel Corporation
#
# SPDX-License-Identifier: MIT

import os
import datumaro as dm

### Filtered by subset

We import a sample VOC dataset and filter it to keep only the `train` subset.

In [2]:
# Load the sample VOC dataset, naming the 'voc' format explicitly.
# The path is relative, so it is resolved against the current working directory.
dataset = dm.Dataset.import_from('./tests/assets/voc_dataset/voc_dataset1', format='voc')

In [3]:
# Print a caption, then end the cell with the bare name so the notebook
# renders the dataset's summary representation as the cell output.
print('Representation for sample VOC dataset')
dataset

Representation for sample VOC dataset


Dataset
	size=2
	source_path=./tests/assets/voc_dataset/voc_dataset1
	media_type=<class 'datumaro.components.media.Image'>
	annotated_items_count=1
	annotations_count=15
subsets
	test: # of items=1, # of annotated items=0, # of annotations=0, annotation types=[]
	train: # of items=1, # of annotated items=1, # of annotations=15, annotation types=['label', 'bbox', 'mask']
categories
	label: ['background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor', 'ignored', 'head', 'hand', 'foot']
	mask: []

The VOC dataset has `train` and `test` subsets. We will keep only the `train` subset.

In [4]:
# Item-level filter: keep only the items whose subset is "train".
# `dataset` is rebound to the result, so the unfiltered data is no longer
# available under this name.
dataset = dm.Dataset.filter(dataset, '/item[subset="train"]')

In [5]:
# Caption followed by the filtered dataset's summary; only the `train`
# subset is expected to remain after the filter applied in the previous cell.
print('Representation for `train` subset of sample VOC dataset')
dataset

Representation for `train` subset of sample VOC dataset


Dataset
	size=1
	source_path=./tests/assets/voc_dataset/voc_dataset1
	media_type=<class 'datumaro.components.media.Image'>
	annotated_items_count=1
	annotations_count=15
subsets
	train: # of items=1, # of annotated items=1, # of annotations=15, annotation types=['label', 'bbox', 'mask']
categories
	label: ['background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor', 'ignored', 'head', 'hand', 'foot']
	mask: []

### Filtered by id

We import a sample WIDER Face dataset and keep only the item whose id is `0_Parade_image_01`.

In [6]:
# Load the sample WiderFace dataset. No `format` argument is passed here,
# so the format is presumably detected by import_from itself -- confirm in the Datumaro docs.
dataset = dm.Dataset.import_from('./tests/assets/widerface_dataset')

In [7]:
# Print a caption, then display the unfiltered WiderFace dataset summary
# as the cell output for later comparison with the filtered one.
print('Representation for sample WiderFace dataset')
dataset

Representation for sample WiderFace dataset


Dataset
	size=3
	source_path=./tests/assets/widerface_dataset
	media_type=<class 'datumaro.components.media.Image'>
	annotated_items_count=3
	annotations_count=9
subsets
	train: # of items=2, # of annotated items=2, # of annotations=5, annotation types=['label', 'bbox']
	val: # of items=1, # of annotated items=1, # of annotations=4, annotation types=['label', 'bbox']
categories
	label: ['Parade', 'Handshaking']

In [8]:
# Item-level filter: keep only the item whose id equals "0_Parade_image_01".
dataset = dm.Dataset.filter(dataset, '/item[id="0_Parade_image_01"]')

In [9]:
# The caption names the id actually used by the filter in the previous cell
# (it used to say `id == 1`, which does not match any filter in this notebook).
print('Representation for `id="0_Parade_image_01"` dataset of sample WiderFace dataset')
dataset

Representation for `id == 1` dataset of sample WiderFace dataset


Dataset
	size=1
	source_path=./tests/assets/widerface_dataset
	media_type=<class 'datumaro.components.media.Image'>
	annotated_items_count=1
	annotations_count=2
subsets
	train: # of items=1, # of annotated items=1, # of annotations=2, annotation types=['label', 'bbox']
categories
	label: ['Parade', 'Handshaking']

### Filtered by width and height

We import a sample COCO dataset and extract only the images whose width is smaller than their height (`width < height`).

In [10]:
# Load the sample COCO dataset. No `format` argument is passed here,
# so the format is presumably detected by import_from itself -- confirm in the Datumaro docs.
dataset = dm.Dataset.import_from('./tests/assets/coco_dataset/coco')



In [11]:
def get_width_height(dataset: dm.Dataset):
    """Collect the media size of every item in ``dataset``.

    Args:
        dataset: An iterable of items, each exposing ``id`` and ``media.size``.

    Returns:
        A dict mapping each item's ``id`` to its ``media.size`` value, in
        iteration order. If two items share an id, the later one wins.

    NOTE(review): the tuples are returned exactly as ``media.size`` reports
    them. In this notebook the ``width < height`` filter keeps the item whose
    size is ``(10, 5)``, which suggests the order is (height, width) despite
    this helper's name -- confirm against the Datumaro ``Image.size`` docs.
    """
    return {entry.id: entry.media.size for entry in dataset}

In [12]:
# Print a caption, then display the unfiltered COCO dataset summary
# as the cell output.
print('Representation for sample COCO dataset')
dataset

Representation for sample COCO dataset


Dataset
	size=2
	source_path=./tests/assets/coco_dataset/coco
	media_type=<class 'datumaro.components.media.Image'>
	annotated_items_count=2
	annotations_count=6
subsets
	train: # of items=1, # of annotated items=1, # of annotations=2, annotation types=['bbox', 'caption']
	val: # of items=1, # of annotated items=1, # of annotations=4, annotation types=['mask', 'caption', 'polygon']
categories
	label: ['a', 'b', 'c']

In [13]:
# Caption, then the per-item size mapping built by get_width_height()
# (item id -> media.size) for the unfiltered dataset.
print('Width and height for sample COCO dataset images')
get_width_height(dataset)

Width and height for sample COCO dataset images


{'a': (5, 10), 'b': (10, 5)}

In [14]:
# Item-level filter comparing two fields of each item's image:
# keep only the items whose image width is smaller than its height.
dataset = dm.Dataset.filter(dataset, '/item[image/width < image/height]')

In [15]:
# Caption, then the summary of the dataset left after the width/height filter.
print('Representation for `width < height` sample COCO dataset images')
dataset

Representation for `width < height` sample COCO dataset images


Dataset
	size=1
	source_path=./tests/assets/coco_dataset/coco
	media_type=<class 'datumaro.components.media.Image'>
	annotated_items_count=1
	annotations_count=4
subsets
	val: # of items=1, # of annotated items=1, # of annotations=4, annotation types=['mask', 'caption', 'polygon']
categories
	label: ['a', 'b', 'c']

In [16]:
# Caption, then the per-item size mapping again, this time for the filtered
# dataset, so the remaining item's size can be checked against the filter.
print('Width and height for `width < height` sample COCO dataset images')
get_width_height(dataset)

Width and height for `width < height` sample COCO dataset images


{'b': (10, 5)}

### Filtered by label

We import a sample VOC dataset and keep only the annotations whose label is not `person`.

In [17]:
# Re-import the sample VOC dataset so this section starts from unfiltered data
# (the `dataset` name was last bound to a filtered COCO dataset).
dataset = dm.Dataset.import_from('./tests/assets/voc_dataset/voc_dataset1')

In [18]:
# Print a caption, then display the unfiltered VOC dataset summary
# as a baseline for the annotation counts.
print('Representation for sample VOC dataset')
dataset

Representation for sample VOC dataset


Dataset
	size=2
	source_path=./tests/assets/voc_dataset/voc_dataset1
	media_type=<class 'datumaro.components.media.Image'>
	annotated_items_count=1
	annotations_count=15
subsets
	test: # of items=1, # of annotated items=0, # of annotations=0, annotation types=[]
	train: # of items=1, # of annotated items=1, # of annotations=15, annotation types=['label', 'bbox', 'mask']
categories
	label: ['background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor', 'ignored', 'head', 'hand', 'foot']
	mask: []

Set `filter_annotations` to `True` to apply the filter to annotations. The default is `False`, which applies the filter to items.

In [19]:
# Annotation-level filter (filter_annotations=True): keep only the annotations
# whose label is not "person". Items themselves are not removed by this call.
dataset = dm.Dataset.filter(dataset, '/item/annotation[label!="person"]', filter_annotations=True)

In [20]:
# Caption, then the summary of the dataset after the annotation-level filter;
# compare the annotation count with the unfiltered summary shown earlier.
print('Representation for sample VOC dataset whose annotation is `label!="person"`')
dataset

Representation for sample VOC dataset whose annotation is `label!="person"`


Dataset
	size=2
	source_path=./tests/assets/voc_dataset/voc_dataset1
	media_type=<class 'datumaro.components.media.Image'>
	annotated_items_count=1
	annotations_count=13
subsets
	test: # of items=1, # of annotated items=0, # of annotations=0, annotation types=[]
	train: # of items=1, # of annotated items=1, # of annotations=13, annotation types=['label', 'bbox', 'mask']
categories
	label: ['background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor', 'ignored', 'head', 'hand', 'foot']
	mask: []

### Filtered by annotation

We import a sample VOC dataset, extract only the non-occluded annotations, and remove the images that are left empty. Note that this notebook has no project or “s1” source; the filter is applied directly to the imported dataset.

In [21]:
# Re-import the sample VOC dataset so this section starts from unfiltered data
# (the previous section rebound `dataset` to a filtered result).
dataset = dm.Dataset.import_from('./tests/assets/voc_dataset/voc_dataset1')

In [22]:
# Print a caption, then display the unfiltered VOC dataset summary
# as a baseline for the item and annotation counts.
print('Representation for sample VOC dataset')
dataset

Representation for sample VOC dataset


Dataset
	size=2
	source_path=./tests/assets/voc_dataset/voc_dataset1
	media_type=<class 'datumaro.components.media.Image'>
	annotated_items_count=1
	annotations_count=15
subsets
	test: # of items=1, # of annotated items=0, # of annotations=0, annotation types=[]
	train: # of items=1, # of annotated items=1, # of annotations=15, annotation types=['label', 'bbox', 'mask']
categories
	label: ['background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor', 'ignored', 'head', 'hand', 'foot']
	mask: []

In [23]:
# Annotation-level filter keeping only annotations with occluded="False";
# remove_empty=True additionally drops items left without annotations.
# NOTE: unlike the earlier cells, the result is not assigned back to `dataset`.
# The call is the cell's last expression, so its return value is displayed, and
# the next cell shows `dataset` itself already filtered -- i.e. the filter
# appears to modify the dataset in place.
dm.Dataset.filter(dataset, '/item/annotation[occluded="False"]', filter_annotations=True, remove_empty=True)

Dataset
	size=1
	source_path=./tests/assets/voc_dataset/voc_dataset1
	media_type=<class 'datumaro.components.media.Image'>
	annotated_items_count=1
	annotations_count=2
subsets
	train: # of items=1, # of annotated items=1, # of annotations=2, annotation types=['bbox']
categories
	label: ['background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor', 'ignored', 'head', 'hand', 'foot']
	mask: []

In [24]:
# Caption, then the summary of `dataset` after the filter in the previous cell.
# That cell did not reassign `dataset`, so this shows the in-place result.
print('Representation for `non-occluded annotations and empty images removed sample VOC dataset`')
dataset

Representation for `non-occluded annotations and empty images removed sample VOC dataset`


Dataset
	size=1
	source_path=./tests/assets/voc_dataset/voc_dataset1
	media_type=<class 'datumaro.components.media.Image'>
	annotated_items_count=1
	annotations_count=2
subsets
	train: # of items=1, # of annotated items=1, # of annotations=2, annotation types=['bbox']
categories
	label: ['background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor', 'ignored', 'head', 'hand', 'foot']
	mask: []