# Datumaro

This is sample code for a dataset preprocessing library. Its format can be exported directly from CVAT. It's specially useful when:

    - Trying to export labels attributes

In [1]:
import datumaro as dm
import datumaro.plugins.splitter as splitter
from datumaro.plugins.validators import DetectionValidator

from datumaro_funcs import remap_label_by_attribute, convert_rle_to_polygons, remove_categories, count_instance_labels

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = dm.Dataset.import_from('../../data/stain_lcrack_dataset')

In [3]:
dataset

Dataset
	size=3965
	source_path=../../data/stain_lcrack_dataset
	media_type=<class 'datumaro.components.media.Image'>
	ann_types={<AnnotationType.mask: 2>, <AnnotationType.bbox: 6>}
	annotated_items_count=1248
	annotations_count=1528
subsets
	default: # of items=3965, # of annotated items=1248, # of annotations=1528
infos
	categories
	label: ['algae', 'major_crack', 'minor_crack', 'peeling', 'spalling', 'stain', 'normal']
	points: []

In [4]:
validator = DetectionValidator() # Or ClassificationValidator or SegementationValidator

reports = validator.validate(dataset)

# No puedo quitarme las que no están anotadas porque me quitaría las de cls04 normal
# Selecciono por clase
dataset = dataset.select(lambda item: ('cls04' in item.id) or ('cls06' in item.id) or ('cls02'))

In [5]:
# Check for non-mask annotations and remove those samples
nonmask_ids = []
for item in dataset:
    for ann in item.annotations:
        if ann.type == dm.AnnotationType.mask:
            continue
        elif ann.type == dm.AnnotationType.polygon:
            print("  - Polygon detected")
        elif ann.type == dm.AnnotationType.bbox:
            print(f"  - Bounding Box: {ann.points}")  # Print bbox coordinates
        print(f"  ID: {item.id}")
        print(f"  Image: {item.media}")
        print(f"  Annotation Type: {ann.type}")
        print(f"  Label: {dataset.categories()[dm.AnnotationType.label].items[ann.label].name}")
        print(f"  Attributes: {ann.attributes}")
        nonmask_ids.append(item.id)
        
# There are only two images with non-mask annotations
#dataset = dataset.select(lambda item: item.id not in ["226", "WhatsApp Image 2024-10-01 at 16.08.59"])

  - Bounding Box: [5.190000057220459, 10.40999984741211, 494.4800109863281, 507.05999755859375]
  ID: cls00_001
  Image: ImageFromFile(path='../../data/stain_lcrack_dataset/images/default/cls00_001.jpg')
  Annotation Type: 6
  Label: algae
  Attributes: {'occluded': False, 'rotation': 0.0}
  - Bounding Box: [218.27999877929688, 161.17999267578125, 465.67999267578125, 392.6300048828125]
  ID: cls00_002
  Image: ImageFromFile(path='../../data/stain_lcrack_dataset/images/default/cls00_002.jpg')
  Annotation Type: 6
  Label: algae
  Attributes: {'occluded': False, 'rotation': 0.0}
  - Bounding Box: [0.8899999856948853, 1.809999942779541, 491.4100036621094, 487.4200134277344]
  ID: cls00_003
  Image: ImageFromFile(path='../../data/stain_lcrack_dataset/images/default/cls00_003.jpg')
  Annotation Type: 6
  Label: algae
  Attributes: {'occluded': False, 'rotation': 0.0}
  - Bounding Box: [4.579999923706055, 0.0, 512.0, 481.8900146484375]
  ID: cls00_004
  Image: ImageFromFile(path='../../data/

In [6]:
# Now convert all the maks to polygons and check there is no mask left
dataset = convert_rle_to_polygons(dataset)

# Check if there are any masks left
for item in dataset:
    for ann in item.annotations:
        if ann.type == dm.AnnotationType.mask:
            print(f"  - Mask")
        elif ann.type == dm.AnnotationType.polygon:
            continue
        elif ann.type == dm.AnnotationType.bbox:
            continue  # Print bbox coordinates
        print(f"  ID: {item.id}")
        print(f"  Image: {item.media}")
        print(f"  Annotation Type: {ann.type}")
        print(f"  Label: {dataset.categories()[dm.AnnotationType.label].items[ann.label].name}")
        print(f"  Attributes: {ann.attributes}")

100%|██████████| 3965/3965 [00:01<00:00, 2312.42it/s] 


In [7]:
# Quito todas las categorías excepto stain y minor_crack
# Elimino las normal porque en mi formato de entrenamiento no taggeo las paredes normales, simplemente no etiqueto
dataset = remove_categories(dataset, ['algae', 'major_crack', 'peeling', 'spalling', 'normal'])

In [8]:
count_instance_labels(dataset)

{'minor_crack': 1753, 'stain': 4290}

In [9]:
dataset

Dataset
	size=3965
	source_path=None
	media_type=<class 'datumaro.components.media.Image'>
	ann_types={<AnnotationType.polygon: 4>}
	annotated_items_count=1143
	annotations_count=6043
subsets
	default: # of items=3965, # of annotated items=1143, # of annotations=6043
infos
	categories
	label: ['minor_crack', 'stain']
	points: []

In [10]:
# He movido la celda de split al final porque es lo último que se debería hacer, cuando ya no cambian las imágenes seleccionadas
# Separate in train and validation
splits = [("train", 0.8), ("val", 0.2)]
task = splitter.SplitTask.segmentation.name

resplitted = dataset.transform("split", task=task, splits=splits)

In [11]:
# Export the dataset
resplitted.export("/datassd/home/plazaro/detrex_ITA/detectron2/datasets/db3_stain_lcrack", "coco", save_media=True, segmentation_mode="polygons", 
                  allow_attributes=False)

In [12]:
resplitted

Dataset
	size=3965
	source_path=/datassd/home/plazaro/detrex_ITA/detectron2/datasets/db3_stain_lcrack
	media_type=<class 'datumaro.components.media.Image'>
	ann_types={<AnnotationType.polygon: 4>}
	annotated_items_count=1143
	annotations_count=6043
subsets
	train: # of items=3172, # of annotated items=1077, # of annotations=4834
	val: # of items=793, # of annotated items=66, # of annotations=1209
infos
	categories
	label: ['minor_crack', 'stain']
	points: []

In [15]:
len(count_instance_labels(resplitted))

1