In [2]:
import sys
sys.path.append('/Users/siyuyang/source/repos/GitHub_MSFT/CameraTraps')

In [3]:
import json
import os

from tqdm import tqdm
import azure.cosmos.cosmos_client as cosmos_client

from data_management.cct_json_utils import IndexedJsonDb

# Cosmos database playground

We would like to centralize all COCO Camera Trap (CCT) format json databases containing image metadata and species/bounding box annotations into an instance of a Cosmos DB for management and querying. 

This notebook demonstrates how one CCT database can be inserted into our instance of the Cosmos DB and how image metadata can be listed by queries based on annotation presence and species.

`dataset` is the attribute of each item used for partitioning; `image_id` (not `id`, which is a reserved word) is the attribute that needs to be unique within a partition.

### TODO
- Investigate batch insert https://blog.siliconvalve.com/2018/11/19/bulk-insert-entities-into-cosmos-db-using-python/
- Migrate all CCT databases there
- Create an additional table for dataset metadata

Documentation: https://docs.microsoft.com/en-us/azure/cosmos-db/create-sql-api-python

## Prepare an embedded version of a CCT database
Embed all annotation entries in image entries

In [None]:
# Cosmos DB config
config = {
    'ENDPOINT': os.environ.get('COSMOS_ENDPOINT'),
    'PRIMARYKEY': os.environ.get('COSMOS_KEY'),
    'DATABASE': 'camera-trap',
    'CONTAINER': 'images'
}

# Initialize the Cosmos client
client = cosmos_client.CosmosClient(url_connection=config['ENDPOINT'], auth={
                                    'masterKey': config['PRIMARYKEY']})

In [7]:
dataset_name = 'idfg'

### image DB

In [24]:
cct_json_path = '/Users/siyuyang/OneDrive - Microsoft/AI4Earth/Camera_trap/Databases/databases_201904/idfg/idfg_20190409.json'
cct_json_db = IndexedJsonDb(cct_json_path)
items = cct_json_db.image_id_to_image

In [25]:
# species annotation
num_images_with_more_than_1_species = 0
for image_id, annotations in cct_json_db.image_id_to_annotations.items():
    items[image_id]['annotations'] = {
        'species': []
    }
    if len(annotations) > 1:
        num_images_with_more_than_1_species += 1
    for anno in annotations:
        cat_name = cct_json_db.cat_id_to_name[anno['category_id']]
        items[image_id]['annotations']['species'].append(cat_name)
print('Number of images with more than 1 species: ', num_images_with_more_than_1_species)

Number of images with more than 1 species:  737


In [26]:
len(items)

678870

In [27]:
for i, item in items.items():
    print(item)
    break

{'file_name': 'Beaverhead_elk/AM34/Trip 1/100RECNX/2016-01-12 08-00-00 T.JPG', 'id': 'Beaverhead_elk/AM34/Trip 1/100RECNX/2016-01-12 08-00-00 T.JPG', 'annotations': {'species': ['empty']}, 'datetime': '12-Jan-2016 08:00:00', 'location': 'Beaverhead_elk+AM34'}


### bbox DB

In [28]:
cct_bbox_json_path = '/Users/siyuyang/OneDrive - Microsoft/AI4Earth/Camera_trap/Databases/databases_201904/idfg/idfg_bboxes_20190409.json'
cct_bbox_json_db = IndexedJsonDb(cct_bbox_json_path)

In [30]:
# add any images that are not in the image DB
# also add any fields in the image object that are not present already
num_added = 0
num_amended = 0
for image_id, image_obj in cct_bbox_json_db.image_id_to_image.items():
    if image_id not in items:
        items[image_id] = image_obj
        num_added += 1
        
    for field_name, val in image_obj.items():
        if field_name not in items[image_id]:
            items[image_id][field_name] = val
            num_amended += 1
            
print('Number of images added: ', num_added)
print('Number of images amended: ', num_amended)

Number of images added:  15000
Number of images amended:  83242


In [31]:
len(items)

693870

In [36]:
def round_coord(coord):
    return round(coord, 4)

# add bbox to the annotations field
num_more_than_1_bbox = 0

for image_id, bbox_annotations in cct_bbox_json_db.image_id_to_annotations.items():
    
    # for any newly added images
    if 'annotations' not in items[image_id]:
        items[image_id]['annotations'] = {}
    
    items[image_id]['annotations']['bbox'] = []
    
    if len(bbox_annotations) > 1:
        num_more_than_1_bbox += 1
    
    for bbox_anno in bbox_annotations:  
        item_bbox = {
            'category': cct_bbox_json_db.cat_id_to_name[bbox_anno['category_id']],
            'bbox_abs': bbox_anno['bbox'],
        }
        
        if 'width' in items[image_id]:
            image_w = items[image_id]['width']
            image_h = items[image_id]['height']
            x, y, w, h = bbox_anno['bbox']
            item_bbox['bbox_rel'] = [
                round_coord(x / image_w),
                round_coord(y / image_h),
                round_coord(w / image_w),
                round_coord(h / image_h)
            ]
            
        items[image_id]['annotations']['bbox'].append(item_bbox)
    
print('Number of images with more than one bounding box: ', num_more_than_1_bbox)

Number of images with more than one bounding box:  21255


In [39]:
items_list = list(items.values())

In [84]:
# get rid of any trailing '.JPG'
# insert the 'dataset' attribute used as the partition key
# replace illegal chars (for Cosmos DB) in the id field of the image
# rename the id field (reserved) to image_id
illegal_char_map = {
    '/': '~',
    '\\': '~',
    '?': '__qm__',
    '#': '__pound__'
}


for i in items_list:
    i['id'] = i['id'].split('.JPG')[0].split('.jpg')[0]
    
    for illegal, replacement in illegal_char_map.items():
        i['id'] = i['id'].replace(illegal, replacement)
        
    i['dataset'] = dataset_name
    
    i['image_id'] = i['id']
    del i['id']

In [85]:
num_both_species_bbox = 0

for item in items_list:
    if 'annotations' in item:
        if 'species' in item['annotations'] and 'bbox' in item['annotations']:
            num_both_species_bbox += 1
print('Number of images with both species and bbox annotations: ', num_both_species_bbox)

Number of images with both species and bbox annotations:  39312


In [86]:
items_to_insert = items_list[:100]

In [87]:
len(items_to_insert)

100

## Insert to Cosmos DB

In [88]:
items_to_insert[0]

{'annotations': {'species': ['empty']},
 'dataset': 'idfg',
 'datetime': '12-Jan-2016 08:00:00',
 'file_name': 'Beaverhead_elk/AM34/Trip 1/100RECNX/2016-01-12 08-00-00 T.JPG',
 'image_id': 'Beaverhead_elk~AM34~Trip 1~100RECNX~2016-01-12 08-00-00 T',
 'location': 'Beaverhead_elk+AM34'}

In [93]:
container = 'dbs/camera-trap/colls/images'

In [89]:
item1 = client.CreateItem('dbs/camera-trap/colls/images', items_to_insert[0])

In [90]:
item1 = client.CreateItem('dbs/camera-trap/colls/images', items_to_insert[1])

In [91]:
with open('/Users/siyuyang/Source/temp_data/CameraTrap/cosmos_db/trial_100.json', 'w') as f:
    json.dump(items_to_insert, f, indent=1)

In [110]:
with open('/Users/siyuyang/Source/temp_data/CameraTrap/cosmos_db/idfg_20190409.json', 'w') as f:
    json.dump(items_list, f, indent=1)

You can upload a json of 2MB at a time to the database via Azure Portal.

## Queries

### All images with bounding box annotation

In [111]:
query = {'query': '''
SELECT im
FROM images im
WHERE ARRAY_LENGTH(im.annotations.bbox) > 0
'''}

options = {}
options['enableCrossPartitionQuery'] = True
options['maxItemCount'] = 100

result_iterable = client.QueryItems(container, query, options)
results = []
for item in iter(result_iterable):
    res = item['im']
    results.append(res)

### All images with the specified species

In [108]:
species_requested = 'elk'

query = {'query': '''
SELECT im
FROM images im
WHERE ARRAY_LENGTH(im.annotations.species) > 0 AND ARRAY_CONTAINS(im.annotations.species, "{}")
'''.format(species_requested)}

options = {}
options['enableCrossPartitionQuery'] = True
options['maxItemCount'] = 100

result_iterable = client.QueryItems(container, query, options)
results = []
for item in iter(result_iterable):
    res = item['im']
    results.append(res)