# Filter datasets

In this notebook example, we'll take a look at Datumaro filter api.

In [29]:
# Copyright (C) 2022 Intel Corporation
#
# SPDX-License-Identifier: MIT

import os
import datumaro as dm
from datumaro.components.operations import compute_image_statistics, compute_ann_statistics

### Filtered by subset

We export sample VOC dataset to filter only train subset.

In [2]:
dataset = dm.Dataset.import_from('./tests/assets/voc_dataset/voc_dataset1', format='voc')

In [3]:
print('statistics for a sample VOC dataset')
compute_image_statistics(dataset)

statistics for a sample VOC dataset


{'dataset': {'images count': 2,
  'unique images count': 1,
  'repeated images count': 1,
  'repeated images': [[('2007_000001', 'train'), ('2007_000002', 'test')]]},
 'subsets': {'test': {'images count': 1,
   'image mean': [0.9999999999999971, 0.9999999999999971, 0.9999999999999971],
   'image std': [9.411065220006367e-08,
    9.411065220006367e-08,
    9.411065220006367e-08]},
  'train': {'images count': 1,
   'image mean': [0.9999999999999971, 0.9999999999999971, 0.9999999999999971],
   'image std': [9.411065220006367e-08,
    9.411065220006367e-08,
    9.411065220006367e-08]}}}

In VOC dataset, there are 'train' and 'test' subset. We will filter only 'train' subset.

In [4]:
dm.Dataset.filter(dataset, '/item[subset="train"]')

<datumaro.components.dataset.Dataset at 0x7f864f425a90>

In [5]:
print('statistics for train subset VOC dataset')
compute_image_statistics(dataset)

statistics for train subset VOC dataset


{'dataset': {'images count': 1,
  'unique images count': 1,
  'repeated images count': 0,
  'repeated images': []},
 'subsets': {'train': {'images count': 1,
   'image mean': [0.9999999999999971, 0.9999999999999971, 0.9999999999999971],
   'image std': [9.411065220006367e-08,
    9.411065220006367e-08,
    9.411065220006367e-08]}}}

### Filtered by id

We export sample widerface dataset to filter only dataset which id is `id=0_Parade_image_01`.

In [6]:
dataset = dm.Dataset.import_from('./tests/assets/widerface_dataset')

In [7]:
print('statistics for a sample WiderFace dataset')
compute_image_statistics(dataset)

statistics for a sample WiderFace dataset


{'dataset': {'images count': 3,
  'unique images count': 1,
  'repeated images count': 1,
  'repeated images': [[('0_Parade_image_01', 'train'),
    ('0_Parade_image_03', 'val'),
    ('1_Handshaking_image_02', 'train')]]},
 'subsets': {'val': {'images count': 1,
   'image mean': [0.9999999999999973, 0.9999999999999973, 0.9999999999999973],
   'image std': [9.058862863930295e-08,
    9.058862863930295e-08,
    9.058862863930295e-08]},
  'train': {'images count': 2,
   'image mean': [0.9999999999999973, 0.9999999999999973, 0.9999999999999973],
   'image std': [9.043701576544718e-08,
    9.043701576544718e-08,
    9.043701576544718e-08]}}}

In [8]:
dm.Dataset.filter(dataset, '/item[id="0_Parade_image_01"]')

<datumaro.components.dataset.Dataset at 0x7f864f3de850>

In [9]:
print('statistics for WiderFace dataset id == 1')
compute_image_statistics(dataset)

statistics for WiderFace dataset id == 1


{'dataset': {'images count': 1,
  'unique images count': 1,
  'repeated images count': 0,
  'repeated images': []},
 'subsets': {'train': {'images count': 1,
   'image mean': [0.9999999999999973, 0.9999999999999973, 0.9999999999999973],
   'image std': [9.058862863930295e-08,
    9.058862863930295e-08,
    9.058862863930295e-08]}}}

### Filtered by width and height

We export sample dataset to extract a dataset with images with width < height.

In [2]:
dataset = dm.Dataset.import_from('./tests/assets/coco_dataset/coco')



In [24]:
def get_width_height(dataset: dm.Dataset):
    size_dict = {}
    for item in dataset:
        size_dict[item.id] = item.media.size
    return size_dict

In [26]:
print('width and height for a sample coco dataset images')
get_width_height(dataset)

width and height for a sample coco dataset images


{'a': (5, 10), 'b': (10, 5)}

In [27]:
dm.Dataset.filter(dataset, '/item[image/width < image/height]')

<datumaro.components.dataset.Dataset at 0x7fdae0b713d0>

In [28]:
print('width and height for width < height coco dataset images')
get_width_height(dataset)

width and height for width < height coco dataset images


{'b': (10, 5)}

### Filtered by label and area

We export sample dataset to extract only non-`persons`.

In [14]:
dataset = dm.Dataset.import_from('./tests/assets/voc_dataset/voc_dataset1')

In [15]:
print('annotation count for voc dataset')
compute_ann_statistics(dataset)['annotations count']

annotation count for voc dataset


15

In [16]:
print('annotation statistics for voc dataset whose annotation is label!="person"')
compute_ann_statistics(dataset)['annotations']

annotation statistics for voc dataset whose annotation is label!="person"


{'labels': {'count': 15,
  'distribution': {'background': [0, 0.0],
   'aeroplane': [1, 0.06666666666666667],
   'bicycle': [1, 0.06666666666666667],
   'bird': [1, 0.06666666666666667],
   'boat': [0, 0.0],
   'bottle': [1, 0.06666666666666667],
   'bus': [0, 0.0],
   'car': [1, 0.06666666666666667],
   'cat': [1, 0.06666666666666667],
   'chair': [1, 0.06666666666666667],
   'cow': [0, 0.0],
   'diningtable': [1, 0.06666666666666667],
   'dog': [0, 0.0],
   'horse': [1, 0.06666666666666667],
   'motorbike': [0, 0.0],
   'person': [2, 0.13333333333333333],
   'pottedplant': [0, 0.0],
   'sheep': [1, 0.06666666666666667],
   'sofa': [0, 0.0],
   'train': [1, 0.06666666666666667],
   'tvmonitor': [0, 0.0],
   'ignored': [1, 0.06666666666666667],
   'head': [1, 0.06666666666666667],
   'hand': [0, 0.0],
   'foot': [0, 0.0]},
  'attributes': {'difficult': {'count': 2,
    'values count': 1,
    'values present': ['False'],
    'distribution': {'False': [2, 1.0]}},
   'truncated': {'count'

Indicate `filter_annotations` as `True` if filter needs to apply to annotations. The default value is `False` to items.

In [17]:
dm.Dataset.filter(dataset, '/item/annotation[label!="person"]', filter_annotations=True)

<datumaro.components.dataset.Dataset at 0x7f864f3a5e10>

In [18]:
print('annotation count for voc dataset whose annotation is label!="person"')
compute_ann_statistics(dataset)['annotations count']

annotation count for voc dataset whose annotation is label!="person"


13

In [19]:
compute_ann_statistics(dataset)['annotations']

{'labels': {'count': 13,
  'distribution': {'background': [0, 0.0],
   'aeroplane': [1, 0.07692307692307693],
   'bicycle': [1, 0.07692307692307693],
   'bird': [1, 0.07692307692307693],
   'boat': [0, 0.0],
   'bottle': [1, 0.07692307692307693],
   'bus': [0, 0.0],
   'car': [1, 0.07692307692307693],
   'cat': [1, 0.07692307692307693],
   'chair': [1, 0.07692307692307693],
   'cow': [0, 0.0],
   'diningtable': [1, 0.07692307692307693],
   'dog': [0, 0.0],
   'horse': [1, 0.07692307692307693],
   'motorbike': [0, 0.0],
   'person': [0, 0.0],
   'pottedplant': [0, 0.0],
   'sheep': [1, 0.07692307692307693],
   'sofa': [0, 0.0],
   'train': [1, 0.07692307692307693],
   'tvmonitor': [0, 0.0],
   'ignored': [1, 0.07692307692307693],
   'head': [1, 0.07692307692307693],
   'hand': [0, 0.0],
   'foot': [0, 0.0]},
  'attributes': {'difficult': {'count': 1,
    'values count': 1,
    'values present': ['False'],
    'distribution': {'False': [1, 1.0]}},
   'truncated': {'count': 1,
    'values

### Filtered by annotation

We export sample dataset to extract non-occluded annotations, remove empty images. Use data only from the “s1” source of the project.

In [20]:
dataset = dm.Dataset.import_from('./tests/assets/voc_dataset/voc_dataset1')

In [21]:
print('image statistics for sample voc dataset')
compute_image_statistics(dataset)

image statistics for sample voc dataset


{'dataset': {'images count': 2,
  'unique images count': 1,
  'repeated images count': 1,
  'repeated images': [[('2007_000001', 'train'), ('2007_000002', 'test')]]},
 'subsets': {'test': {'images count': 1,
   'image mean': [0.9999999999999971, 0.9999999999999971, 0.9999999999999971],
   'image std': [9.411065220006367e-08,
    9.411065220006367e-08,
    9.411065220006367e-08]},
  'train': {'images count': 1,
   'image mean': [0.9999999999999971, 0.9999999999999971, 0.9999999999999971],
   'image std': [9.411065220006367e-08,
    9.411065220006367e-08,
    9.411065220006367e-08]}}}

In [22]:
print('annotation statistics for sample voc dataset')
compute_ann_statistics(dataset)['annotations count']

annotation statistics for sample voc dataset


15

In [23]:
dm.Dataset.filter(dataset, '/item/annotation[occluded="False"]', filter_annotations=True, remove_empty=True)

<datumaro.components.dataset.Dataset at 0x7f868b978f90>

In [24]:
print('image statistics for non-occluded annotations and empty images removed voc dataset')
compute_image_statistics(dataset)

image statistics for non-occluded annotations and empty images removed voc dataset


{'dataset': {'images count': 1,
  'unique images count': 1,
  'repeated images count': 0,
  'repeated images': []},
 'subsets': {'train': {'images count': 1,
   'image mean': [0.9999999999999971, 0.9999999999999971, 0.9999999999999971],
   'image std': [9.411065220006367e-08,
    9.411065220006367e-08,
    9.411065220006367e-08]}}}

In [25]:
print('annotation statistics for non-occluded annotations and empty images removed voc dataset')
compute_ann_statistics(dataset)['annotations count']

annotation statistics for non-occluded annotations and empty images removed voc dataset


2