In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell rather than only the
# final one.
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'

# Load the autoreload extension; mode 2 re-imports edited modules before each
# cell runs, so changes to the imported project code are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# Location of the CameraTraps repository checkout, needed so the project
# packages can be imported below. Set the CAMERATRAPS_REPO environment
# variable to point at your own clone; the fallback is the path this notebook
# was originally written against.
camera_traps_repo = os.environ.get(
    'CAMERATRAPS_REPO',
    '/Users/siyuyang/Source/repos/GitHub_MSFT/CameraTraps')

# Guard the append so re-running this cell does not pile up duplicate entries.
if camera_traps_repo not in sys.path:
    sys.path.append(camera_traps_repo)

In [52]:
import io
import json
import os
import posixpath
from collections import Counter
from random import sample

from tqdm import tqdm
import azure.cosmos.cosmos_client as cosmos_client
from azure.storage.blob import BlockBlobService
from IPython.display import display
from PIL import Image

from visualization import visualization_utils, visualize_megadb
from data_management.annotations import annotation_constants

# Query for data

This notebook demonstrates the workflow to compile desired sequences of images by querying metadata and downloading the images stored in blob storage.

`COSMOS_ENDPOINT` and `COSMOS_KEY` must be set as environment variables before running this notebook.

In [6]:
# Cosmos DB config, read from the environment so that credentials stay out of
# the notebook.
config = {
    'ENDPOINT': os.environ.get('COSMOS_ENDPOINT'),
    'PRIMARYKEY': os.environ.get('COSMOS_KEY')
}

# Fail fast with a readable message instead of handing None to the client.
missing_env_vars = [
    name
    for name, value in (('COSMOS_ENDPOINT', config['ENDPOINT']),
                        ('COSMOS_KEY', config['PRIMARYKEY']))
    if not value
]
if missing_env_vars:
    raise RuntimeError(
        'Set these environment variables before running this notebook: '
        + ', '.join(missing_env_vars))

# Initialize the Cosmos client
client = cosmos_client.CosmosClient(url_connection=config['ENDPOINT'], auth={
                                    'masterKey': config['PRIMARYKEY']})

# Each table is addressed as database link + container link.
sequences_table = 'dbs/camera-trap/colls/sequences'
datasets_table = 'dbs/camera-trap/colls/datasets'

# Query options shared by the cells below.
options = {
    'enableCrossPartitionQuery': True
}

## Get the `datasets` table
The `datasets` table records the location and access levels of each dataset.

In [7]:
%%time

query = {'query': '''SELECT * FROM datasets d'''}

result_iterable = client.QueryItems(datasets_table, query, options)

datasets = {i['dataset_name']:{k: v for k, v in i.items() if not k.startswith('_')} for i in iter(result_iterable)}

print('Length of results:', len(datasets))

Length of results: 18
CPU times: user 18.3 ms, sys: 2.9 ms, total: 21.2 ms
Wall time: 450 ms


## Select image entries

Example: the top 10,000 results across all datasets where an image has bounding boxes. Each result is the full sequence entry, which includes the dataset name and the image file names we need in order to plot the labels.

In [57]:
%%time

query = {'query': '''
SELECT TOP 10000 seq
FROM sequences seq JOIN im IN seq.images 
WHERE ARRAY_LENGTH(im.bbox) > 0
'''}

options = {
    'enableCrossPartitionQuery': True
}

result_iterable = client.QueryItems(sequences_table, query, options=options)
# partition_key does not work here?

results = [item['seq'] for item in iter(result_iterable)]

print('Length of results:', len(results))

Length of results: 10000
CPU times: user 902 ms, sys: 79.1 ms, total: 981 ms
Wall time: 16.3 s


In [61]:
# Peek at the last entry of the query results to see what a returned sequence
# looks like.
results[-1]

{'_attachments': 'attachments/',
 '_etag': '"1a00b5f9-0000-0500-0000-5dc5ee0e0000"',
 '_rid': 'WjB+AJ4IhThfWQAAAAAAAA==',
 '_self': 'dbs/WjB+AA==/colls/WjB+AJ4IhTg=/docs/WjB+AJ4IhThfWQAAAAAAAA==/',
 '_ts': 1573252623,
 'class': ['human'],
 'dataset': 'wps_190624',
 'id': '2da4e6e6-f576-4d3e-b549-c1d4d76d8fc4',
 'images': [{'bbox': [{'bbox': [0.8723, 0.2887, 0.1276, 0.6912],
     'category': 'person'}],
   'class': ['__label_unavailable'],
   'file': '377320b9-dbf4-45e3-b606-9d9c9d235fdf.jpg',
   'image_id': '377320b9-dbf4-45e3-b606-9d9c9d235fdf'}],
 'seq_id': 'dummy_ccc4c778a28941dd87fa9c79dba26c3f'}

## Download images and visualize labels

For large batches, download in parallel using `multiprocessing.pool.ThreadPool`; the cell below downloads a small random sample one image at a time.

In [None]:
# Number of query results to draw at random for a visual spot check.
sample_size = 2
# Clamp so that a short result list does not make sample() raise ValueError.
sample_res = sample(results, min(sample_size, len(results)))

for seq in sample_res:
    # Storage details depend only on the dataset, so resolve them once per
    # sequence instead of once per image.
    dataset = seq['dataset']
    dataset_info = datasets[dataset]
    storage_container = dataset_info['container']
    path_prefix = dataset_info['path_prefix']
    blob_service = visualize_megadb.get_blob_service(datasets, dataset)

    for im in seq['images']:
        # Skip images without any bounding box; there is nothing to render.
        if not im.get('bbox'):
            continue

        # Blob names are '/'-separated regardless of the local OS, so join
        # with posixpath rather than os.path.
        blob_name = posixpath.join(path_prefix, im['file'])

        stream = io.BytesIO()
        # Bind the return value so it is not echoed when every expression in
        # a cell is being displayed.
        _ = blob_service.get_blob_to_stream(storage_container, blob_name, stream)
        # The download leaves the cursor at the end of the buffer; rewind
        # before decoding.
        stream.seek(0)
        image = Image.open(stream)

        # NOTE(review): presumably draws the boxes onto `image` in place,
        # since the return value is not used - confirm.
        visualization_utils.render_megadb_bounding_boxes(im['bbox'], image)
        print('from dataset {}'.format(dataset))
        # Explicit display works inside a loop whatever the notebook's
        # ast_node_interactivity setting is.
        display(image)