In [1]:
import datumaro

In [4]:
from datumaro.components.dataset import Dataset

In [6]:
from texttable import Texttable

def get_dataset_info(dataset):
    """Gather headline facts about ``dataset`` into a flat list.

    Args:
        dataset: Object exposing ``_data``, ``_source_path``, ``media_type()``,
            ``get_annotated_items()``, ``get_annotations()``,
            ``get_subset_info()``, ``get_infos()`` and ``get_categories_info()``.

    Returns:
        list: Eight entries, in this order: length of ``_data``, the source
        path, the media type, the annotated-items value, the annotations
        value, and the subset / infos / categories descriptions, each joined
        into a single newline-separated string.
    """
    item_count = len(dataset._data)
    source_path = dataset._source_path
    media_type = dataset.media_type()
    annotated_items = dataset.get_annotated_items()
    annotations = dataset.get_annotations()

    # The three *_info getters are joined with newlines, so each must yield
    # an iterable of strings.
    subset_text = "\n".join(dataset.get_subset_info())
    infos_text = "\n".join(dataset.get_infos())
    categories_text = "\n".join(dataset.get_categories_info())

    return [
        item_count,
        source_path,
        media_type,
        annotated_items,
        annotations,
        subset_text,
        infos_text,
        categories_text,
    ]


# Locations of the two datasets under comparison (source vs. target).
src_path = "/home/jeom/datum-ws/datumaro/data/cifar10"
tgt_path = "/home/jeom/datum-ws/datumaro/data/mnist"

# Load both datasets, source first.
src_dataset, tgt_dataset = [
    Dataset.import_from(path) for path in (src_path, tgt_path)
]

# Sorted subset names of each dataset.
src_subsets, tgt_subsets = [
    sorted(loaded.subsets().keys()) for loaded in (src_dataset, tgt_dataset)
]

def create_comparison_table(src_dataset, tgt_dataset):
    """Render a side-by-side text table comparing two datasets.

    Every row shows one property of the source dataset next to the same
    property of the target dataset: import path, data format, item count,
    subset names, subset sizes, category names and annotation type.

    Args:
        src_dataset: Source dataset (middle column).
        tgt_dataset: Target dataset (right column).

    Returns:
        str: The table as produced by ``Texttable.draw()``.

    Note:
        The "import path" row reads the notebook-level ``src_path`` and
        ``tgt_path`` variables; all other rows are derived from the two
        arguments only.
    """

    def describe(dataset):
        """Return one table column of values for ``dataset``, in row order."""
        subset_names = sorted(dataset.subsets().keys())
        categories = dataset.categories()
        if categories:
            # Only the first categories entry is reported.
            annotation_type, first_categories = next(iter(categories.items()))
            category_names = [item.name for item in first_categories.items]
            annotation_types = [annotation_type]
        else:
            # A dataset without categories gets empty cells instead of an
            # IndexError.
            category_names = []
            annotation_types = []
        return [
            dataset.format,
            len(dataset),
            subset_names,
            [len(dataset.get_subset(name)) for name in subset_names],
            category_names,
            annotation_types,
        ]

    # Must stay in the same order as the list returned by describe().
    row_labels = [
        "data format",
        "# of data",
        "subset",
        "subset size",
        "categories",
        "annotation type",
    ]

    rows = [
        ["Datum compare", "src_dataset", "tgt_dataset"],
        ["import path", src_path, tgt_path],
    ]
    # Both columns come from the same helper, so the target column can never
    # accidentally show source values.
    rows.extend(
        [label, src_value, tgt_value]
        for label, src_value, tgt_value in zip(
            row_labels, describe(src_dataset), describe(tgt_dataset)
        )
    )

    table = Texttable()
    table.set_cols_align(["l", "c", "c"])
    table.set_cols_valign(["m", "m", "m"])
    table.add_rows(rows)
    return table.draw()

# Draw the source-vs-target comparison table and write it to the cell output.
print(create_comparison_table(src_dataset, tgt_dataset))


+-----------------+------------------------------+-----------------------------+
|  Datum compare  |         src_dataset          |         tgt_dataset         |
| import path     |      /home/jeom/datum-       |      /home/jeom/datum-      |
|                 |   ws/datumaro/data/cifar10   |   ws/datumaro/data/mnist    |
+-----------------+------------------------------+-----------------------------+
| data format     |            cifar             |            mnist            |
+-----------------+------------------------------+-----------------------------+
| # of data       |              50              |             20              |
+-----------------+------------------------------+-----------------------------+
|                 |       ['data_batch_1',       |                             |
|                 |       'data_batch_2',        |                             |
| subset          |       'data_batch_3',        |      ['test', 'train']      |
|                 |       'd

In [35]:
from datumaro.components.operations import compute_image_statistics, compute_ann_statistics, match_classes
# Plain-text report for the source dataset. Each section prints a heading,
# the value, and a blank separator line.
print("data path", src_path, "", sep="\n")
print("data format", src_dataset.format, "", sep="\n")
print("image", compute_image_statistics(src_dataset), "", sep="\n")
print("ann", compute_ann_statistics(src_dataset), "", sep="\n")
# match_classes yields three values, apparently (matched, source-only,
# target-only) class names; confirm against the datumaro documentation.
print("class", match_classes(src_dataset.categories(), tgt_dataset.categories()), sep="\n")

data path
/home/jeom/datum-ws/datumaro/data/cifar10

data format
cifar

image
{'dataset': {'images count': 50, 'unique images count': 50, 'repeated images count': 0, 'repeated images': []}, 'subsets': {'data_batch_5': {'images count': 10, 'image mean': [113.0396728515626, 120.70520019531253, 127.95635986328122], 'image std': [62.67480968003201, 57.52452272613527, 55.33193741534145]}, 'data_batch_2': {'images count': 10, 'image mean': [133.0299682617188, 140.6387939453124, 148.81463623046875], 'image std': [67.88731946781247, 64.84952813588434, 64.10208104112289]}, 'data_batch_4': {'images count': 10, 'image mean': [117.80895996093749, 125.73950195312493, 130.8768310546875], 'image std': [66.79783557606551, 59.63731010805518, 61.639642862833036]}, 'data_batch_3': {'images count': 10, 'image mean': [130.73168945312506, 136.8297119140625, 136.37158203125003], 'image std': [72.0621149322653, 71.17281956571303, 72.0073115111266]}, 'data_batch_1': {'images count': 10, 'image mean': [128.7067

In [36]:
from datumaro.components.operations import compute_image_statistics, compute_ann_statistics, match_classes
# Plain-text report for the target dataset. Each section prints a heading,
# the value, and a blank separator line.
print("data path", tgt_path, "", sep="\n")
print("data format", tgt_dataset.format, "", sep="\n")
print("image", compute_image_statistics(tgt_dataset), "", sep="\n")
print("ann", compute_ann_statistics(tgt_dataset), "", sep="\n")
# The class comparison is source-vs-target, the same call as in the source
# report. match_classes yields three values, apparently (matched, source-only,
# target-only) class names; confirm against the datumaro documentation.
print("class", match_classes(src_dataset.categories(), tgt_dataset.categories()), sep="\n")

data path
/home/jeom/datum-ws/datumaro/data/mnist

data format
mnist

image
{'dataset': {'images count': 20, 'unique images count': 20, 'repeated images count': 0, 'repeated images': []}, 'subsets': {'test': {'images count': 10, 'image mean': [42.84390943877554, 42.84390943877554, 42.84390943877554], 'image std': [85.20780771019935, 85.20780771019935, 85.20780771019935]}, 'train': {'images count': 10, 'image mean': [29.43263711734697, 29.43263711734697, 29.43263711734697], 'image std': [72.02919302871828, 72.02919302871828, 72.02919302871828]}}}

ann
{'images count': 20, 'annotations count': 20, 'unannotated images count': 0, 'unannotated images': [], 'annotations by type': {'unknown': {'count': 0}, 'label': {'count': 20}, 'mask': {'count': 0}, 'points': {'count': 0}, 'polygon': {'count': 0}, 'polyline': {'count': 0}, 'bbox': {'count': 0}, 'caption': {'count': 0}, 'cuboid_3d': {'count': 0}, 'super_resolution_annotation': {'count': 0}, 'depth_annotation': {'count': 0}, 'ellipse': {'coun

In [63]:
from texttable import Texttable

# NOTE(review): every value in this cell is typed in by hand rather than read
# from the loaded datasets, so the tables will not follow changes to the data.

# Data Path and Format
data_path_format_table = Texttable(max_width=0)
data_path_format_table.set_cols_width([40, 40])
data_path_format_table.header(["Data Info", "Value"])
data_path_format_table.add_row(["Data Path", "/home/jeom/datum-ws/datumaro/data/cifar10"])
data_path_format_table.add_row(["Data Format", "cifar"])
print(data_path_format_table.draw())

# Image Statistics
# NOTE(review): max_width=10 looks ineffective here because explicit column
# widths are set right below; confirm against the texttable documentation.
image_statistics_table = Texttable(max_width=10)
image_statistics_table.set_cols_width([20, 20, 20, 20])
image_statistics_table.header(["Subset", "Images Count", "Image Mean", "Image Std"])
# One row per subset: name, image count, per-channel mean, per-channel std.
for subset_row in (
    ["data_batch_5", 10, [113., 120., 127.], [62., 57., 55.]],
    ["data_batch_2", 10, [133., 140., 148.], [67., 64., 64.]],
    ["data_batch_4", 10, [117., 125., 130.], [66., 59., 61.]],
    ["data_batch_3", 10, [130., 136., 136.], [72., 71., 72.]],
    ["data_batch_1", 10, [128., 140., 140.], [72., 63., 64.]],
):
    image_statistics_table.add_row(subset_row)
print(image_statistics_table.draw())

# Annotation Statistics
# NOTE(review): these counts (20 images / 20 annotations) match the recorded
# output for the target (mnist) dataset, while the tables above describe the
# 50-image cifar10 source; confirm which dataset this row should report.
ann_statistics_table = Texttable()
ann_statistics_table.set_cols_width([20, 20, 20, 20])
ann_statistics_table.header(["Images Count", "Annotations Count", "Unannotated Images Count", "Label Count"])
ann_statistics_table.add_row([20, 20, 0, 20])
print(ann_statistics_table.draw())

# Class Match
class_match_table = Texttable()
# Column widths, as for the other tables. The previous add_row([20, 20, 20])
# call inserted a bogus data row of 20s instead of setting the widths.
class_match_table.set_cols_width([20, 20, 20])
class_match_table.header(["Matched", "Source Classes", "Target Classes"])
class_match_table.add_row(["set()", "{'frog', 'ship', 'automobile', 'cat', 'horse', 'deer', 'truck', 'dog', 'bird', 'airplane'}", "{'7', '6', '3', '1', '9', '0', '8', '2', '4', '5'}"])
print(class_match_table.draw())


+------------------------------------------+------------------------------------------+
|                Data Info                 |                  Value                   |
| Data Path                                | /home/jeom/datum-                        |
|                                          | ws/datumaro/data/cifar10                 |
+------------------------------------------+------------------------------------------+
| Data Format                              | cifar                                    |
+------------------------------------------+------------------------------------------+
+----------------------+----------------------+----------------------+----------------------+
|        Subset        |     Images Count     |      Image Mean      |      Image Std       |
| data_batch_5         | 10                   | [113.0, 120.0,       | [62.0, 57.0, 55.0]   |
|                      |                      | 127.0]               |                      |
+-------

In [64]:
from datumaro.components.shift_analyzer import ShiftAnalyzer

In [65]:
# Analyzer instance whose compute_* methods are called in the following cells
# to measure shift between the source and target datasets.
shift = ShiftAnalyzer()

In [68]:
# Covariate shift between the source and target datasets using the "emd"
# method (presumably earth mover's distance; confirm in the datumaro docs).
# The returned value is shown as the cell output.
shift.compute_covariate_shift([src_dataset, tgt_dataset], method="emd")

0.032947380761906064

In [67]:
# Label shift between the source and target datasets.
# NOTE(review): the warning captured in this cell's output points at an
# anderson_ksamp call, so the returned value is presumably that test's
# p-value; confirm in the datumaro docs.
shift.compute_label_shift([src_dataset, tgt_dataset])

  _, _, pv = anderson_ksamp([labels[0], labels[1]])



0.75