# Datumaro

This notebook is sample code for Datumaro, a dataset preprocessing library. The Datumaro format can be exported directly from CVAT. It is especially useful when:

    - Trying to export label attributes

In [1]:
import os

import datumaro as dm
import datumaro.plugins.splitter as splitter
from datumaro.plugins.validators import DetectionValidator

from datumaro_funcs import remap_label_by_attribute, convert_rle_to_polygons, remove_categories, count_instance_labels

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the source dataset from disk. No explicit format is passed to import_from.
DATASET_DIR = '../../data/stain_dataset'
dataset = dm.Dataset.import_from(DATASET_DIR)

In [3]:
# Display the summary of the freshly imported dataset (last expression of the cell).
dataset

Dataset
	size=3965
	source_path=../../data/stain_dataset
	media_type=<class 'datumaro.components.media.Image'>
	ann_types={<AnnotationType.mask: 2>, <AnnotationType.bbox: 6>}
	annotated_items_count=684
	annotations_count=887
subsets
	default: # of items=3965, # of annotated items=684, # of annotations=887
infos
	categories
	label: ['algae', 'major_crack', 'minor_crack', 'peeling', 'spalling', 'stain', 'normal']
	points: []

In [5]:
# Run the detection validator over the dataset.
# ClassificationValidator or SegmentationValidator could be used instead.
# NOTE(review): `reports` is not inspected in the cells visible here.
validator = DetectionValidator()
reports = validator.validate(dataset)

# Unannotated images cannot simply be dropped, because that would also remove
# the cls04 "normal" images. Select by class instead: keep only the items whose
# id contains one of the class tags below.
KEPT_CLASS_TAGS = ('cls04', 'cls06')
dataset = dataset.select(lambda item: any(tag in item.id for tag in KEPT_CLASS_TAGS))

In [7]:
# Report every annotation that is not a mask and collect the ids of the items
# that contain at least one. Nothing is removed from the dataset here: the
# optional filter at the end of the cell is intentionally left disabled.
nonmask_ids = []
for item in dataset:
    for ann in item.annotations:
        if ann.type == dm.AnnotationType.mask:
            continue
        if ann.type == dm.AnnotationType.polygon:
            print("  - Polygon detected")
        elif ann.type == dm.AnnotationType.bbox:
            print(f"  - Bounding Box: {ann.points}")  # Print bbox coordinates
        print(f"  ID: {item.id}")
        print(f"  Image: {item.media}")
        print(f"  Annotation Type: {ann.type}")
        print(f"  Label: {dataset.categories()[dm.AnnotationType.label].items[ann.label].name}")
        print(f"  Attributes: {ann.attributes}")
        # Record each item id once, even when the item carries several
        # non-mask annotations (previously the id was appended per annotation).
        if item.id not in nonmask_ids:
            nonmask_ids.append(item.id)

# Per the original note, only two images have non-mask annotations.
# Uncomment the line below to drop them:
#dataset = dataset.select(lambda item: item.id not in ["226", "WhatsApp Image 2024-10-01 at 16.08.59"])

In [8]:
# Convert the masks to polygons with the project helper, then verify that no
# mask annotation is left in the dataset.
dataset = convert_rle_to_polygons(dataset)

# Polygons and bounding boxes are skipped silently; any other annotation is
# reported, and masks are additionally flagged with a "- Mask" line.
silently_accepted = (dm.AnnotationType.polygon, dm.AnnotationType.bbox)
for item in dataset:
    for ann in item.annotations:
        if ann.type in silently_accepted:
            continue
        if ann.type == dm.AnnotationType.mask:
            print("  - Mask")
        print(f"  ID: {item.id}")
        print(f"  Image: {item.media}")
        print(f"  Annotation Type: {ann.type}")
        label_categories = dataset.categories()[dm.AnnotationType.label]
        print(f"  Label: {label_categories.items[ann.label].name}")
        print(f"  Attributes: {ann.attributes}")

  0%|          | 0/1121 [00:00<?, ?it/s]

100%|██████████| 1121/1121 [00:00<00:00, 1187.94it/s]


In [9]:
# Remove every category except 'stain'.
categories_to_drop = ['algae', 'major_crack', 'minor_crack', 'peeling', 'spalling', 'normal']
dataset = remove_categories(dataset, categories_to_drop)

In [10]:
# Show what the project helper count_instance_labels reports for the filtered dataset.
count_instance_labels(dataset)

{'stain': 4290}

In [11]:
# Display the dataset summary again after the conversion and category filtering.
dataset

Dataset
	size=1121
	source_path=None
	media_type=<class 'datumaro.components.media.Image'>
	ann_types={<AnnotationType.polygon: 4>}
	annotated_items_count=519
	annotations_count=4290
subsets
	default: # of items=1121, # of annotated items=519, # of annotations=4290
infos
	categories
	label: ['stain']
	points: []

In [12]:
# The split cell was moved to the end on purpose: splitting should be the last
# step, once the set of selected images no longer changes.
# Separate the dataset into train and validation subsets.
SPLIT_SEED = 42  # fixed seed so a fresh Restart & Run All reproduces the same split
splits = [("train", 0.8), ("val", 0.2)]
task = splitter.SplitTask.segmentation.name

resplitted = dataset.transform("split", task=task, splits=splits, seed=SPLIT_SEED)

In [13]:
# Export the split dataset in "coco" format, saving the media and writing
# segmentations as polygons, with attributes disabled.
# The destination defaults to the original location, but it can be overridden
# with the DB3_STAIN_EXPORT_DIR environment variable so the notebook is not
# tied to one user's absolute path.
EXPORT_DIR = (
    os.environ.get("DB3_STAIN_EXPORT_DIR")
    or "/datassd/home/plazaro/detrex_ITA/detectron2/datasets/db3_stain"
)
resplitted.export(EXPORT_DIR, "coco", save_media=True, segmentation_mode="polygons",
                  allow_attributes=False)

In [14]:
# Display the summary of the split dataset, including the per-subset item counts.
resplitted

Dataset
	size=1121
	source_path=/datassd/home/plazaro/detrex_ITA/detectron2/datasets/db3_stain
	media_type=<class 'datumaro.components.media.Image'>
	ann_types={<AnnotationType.polygon: 4>}
	annotated_items_count=519
	annotations_count=4290
subsets
	train: # of items=897, # of annotated items=485, # of annotations=3432
	val: # of items=224, # of annotated items=34, # of annotations=858
infos
	categories
	label: ['stain']
	points: []

In [15]:
# Number of distinct entries that count_instance_labels reports for the split dataset.
label_counts = count_instance_labels(resplitted)
len(label_counts)

1