# Transform datasets

In this notebook example, we'll take a look at Datumaro's transform API. A transform changes the task a dataset targets by converting its annotations from one type to another, e.g., from masks to polygons, from bounding boxes to masks, from shapes to bounding boxes, etc.

In [1]:
# Copyright (C) 2022 Intel Corporation
#
# SPDX-License-Identifier: MIT

import os

import datumaro as dm
from datumaro.components.visualizer import Visualizer

### Filtered by subset

We import a sample COCO dataset (instance annotations) so that we can work with the items of a single subset.

In [49]:
# Read the dataset stored under the local `coco_dataset` directory using the
# `coco_instances` format.
dataset = dm.Dataset.import_from(
    'coco_dataset',
    format='coco_instances',
)

# Print a caption, then let the notebook display the dataset as the cell result.
print('Representation for sample COCO dataset')
dataset



Representation for sample COCO dataset


Dataset
	size=123287
	source_path=coco_dataset
	media_type=<class 'datumaro.components.media.Image'>
	annotated_items_count=122218
	annotations_count=1018861
subsets
	train2017: # of items=118287, # of annotated items=117266, # of annotations=976995, annotation types=['polygon', 'mask']
	val2017: # of items=5000, # of annotated items=4952, # of annotations=41866, annotation types=['polygon', 'mask']
categories
	label: ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizz

In [50]:
# Iterating the mapping returned by `subsets()` yields its keys (the subset names).
subsets = [*dataset.subsets()]
print("Subset candidates:", subsets)

def get_ids(dataset: dm.Dataset, subset: str):
    """Return the ids of the items in ``dataset`` that belong to ``subset``.

    Args:
        dataset: Iterable of items exposing ``id`` and ``subset`` attributes.
        subset: Subset name; an item is kept when ``item.subset == subset``.

    Returns:
        A list with the ``id`` of every matching item, in iteration order.
        The list is empty when nothing matches.
    """
    return [entry.id for entry in dataset if entry.subset == subset]

# Collect the ids of the items that belong to the first listed subset.
ids = get_ids(dataset=dataset, subset=subsets[0])

Subset candidates: ['val2017', 'train2017']


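In this COCO dataset, there are 'train2017' and 'val2017' subsets. We keep working with the first subset listed above and apply the `reindex` transform to the dataset before collecting that subset's item ids again.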

In [35]:
# dataset.transform("masks_to_polygons")
# Apply the "reindex" transform with start=0.
# NOTE(review): Datumaro documents `Dataset.transform` as returning the dataset
# it was called on, so `reindexing_dataset` is presumably the same object as
# `dataset` rather than an independent copy — confirm before relying on the
# original item ids in later cells.
reindexing_dataset = dataset.transform("reindex", start=0)

# Gather the ids of the first listed subset from the transformed dataset.
ids = get_ids(reindexing_dataset, subset=subsets[0])

In [37]:
# Apply the "id_from_image_name" transform. The variable name suggests this is
# meant to undo the earlier "reindex" step — TODO confirm.
rollback_dataset = dataset.transform("id_from_image_name")
ids = get_ids(rollback_dataset, subset=subsets[0])

# Print the summary of `dataset` (note: not `rollback_dataset`).
print(dataset)

Dataset
	size=123287
	source_path=coco_dataset
	media_type=<class 'datumaro.components.media.Image'>
	annotated_items_count=122218
	annotations_count=1018861
subsets
	train2017: # of items=118287, # of annotated items=117266, # of annotations=976995, annotation types=['polygon', 'mask']
	val2017: # of items=5000, # of annotated items=4952, # of annotations=41866, annotation types=['polygon', 'mask']
categories
	label: ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizz

In [48]:
# Label remapping passed to the "remap_labels" transform: source label -> target label.
mapping = dict(motorcycle="bicycle", bus="car", truck="car")
remap_label_dataset = dataset.transform("remap_labels", mapping=mapping)

# Show the resulting dataset as the cell output.
remap_label_dataset

TypeError: __init__() got an unexpected keyword argument 'regex'

In [51]:
# Print the dataset summary again to inspect its state at this point.
print(dataset)

Dataset
	size=123287
	source_path=coco_dataset
	media_type=<class 'datumaro.components.media.Image'>
	annotated_items_count=122218
	annotations_count=1018861
subsets
	train2017: # of items=118287, # of annotated items=117266, # of annotations=976995, annotation types=['polygon', 'mask']
	val2017: # of items=5000, # of annotated items=4952, # of annotations=41866, annotation types=['polygon', 'mask']
categories
	label: ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizz

In [45]:
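# NOTE(review): this cell is disabled; presumably the `rename` call below is what
# raised the TypeError about the unexpected `regex` keyword shown in the output.
# The pattern is written as a raw string here because '\1' in a plain string
# literal is the control character \x01, not a regex back-reference.
# strr = r'|\1|^image_|'
# renamed_dataset = dataset.transform("rename", regex=strr)
# print(renamed_dataset)

# ids = get_ids(dataset, subsets[0])
# print('val2017', ids)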

TypeError: __init__() got an unexpected keyword argument 'regex'

In [8]:
# `Visualizer` is imported in the notebook's top import cell so that every
# dependency is declared in one place.
# Build a visualizer over `dataset` with an 8x8 figure size and alpha of 0.7.
visualizer = Visualizer(dataset, figsize=(8, 8), alpha=0.7)

# Draw the first four collected ids of the first listed subset on a 2x2 grid.
fig = visualizer.vis_gallery(ids[:4], subsets[0], (2, 2))
fig.show()