In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one
# (IPython's default setting is 'last_expr').
InteractiveShell.ast_node_interactivity = 'all'

# autoreload mode 2 re-imports edited modules before code is run, so changes to the
# local project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [38]:
import io
import json
import os
import posixpath
from collections import Counter
from random import sample

import azure.cosmos.cosmos_client as cosmos_client
from azure.storage.blob import BlockBlobService
from IPython.display import display
from PIL import Image
from tqdm import tqdm

from data_management.annotations import annotation_constants
from visualization import visualization_utils

# Query for data

This notebook demonstrates how to compile a desired set of images: first query their metadata in the Cosmos DB instance, then download the matching images from blob storage.

In [4]:
# Cosmos DB config, read from environment variables so that no credentials are stored in the notebook
config = {
    'ENDPOINT': os.environ.get('COSMOS_ENDPOINT'),
    'PRIMARYKEY': os.environ.get('COSMOS_KEY')
}

# os.environ.get() returns None for an unset variable; fail here with an actionable message
# instead of handing None to the client and getting an opaque connection error.
missing_vars = [
    var_name
    for var_name, value in (('COSMOS_ENDPOINT', config['ENDPOINT']),
                            ('COSMOS_KEY', config['PRIMARYKEY']))
    if not value
]
if missing_vars:
    raise RuntimeError('Missing required environment variable(s): ' + ', '.join(missing_vars))

# Initialize the Cosmos client
client = cosmos_client.CosmosClient(url_connection=config['ENDPOINT'],
                                    auth={'masterKey': config['PRIMARYKEY']})

container_link = 'dbs/camera-trap/colls/images'  # database link + container link

In [6]:
# Per-dataset lookup table; later cells read each dataset's 'storage_account',
# 'container' and 'path_prefix' entries from it.
with open('datasets.json') as datasets_file:
    datasets_table = json.load(datasets_file)

# JSON object mapping each storage account name to the access key for that account.
with open('blob_account_keys.json') as account_keys_file:
    blob_account_keys = json.load(account_keys_file)

## Select image entries

Example: the first 1000 images from a given dataset that have at least one bounding box, selecting the file name, dataset, bounding boxes and species so we can plot the labels.

In [47]:
%%time

dataset_name = 'rspb_gola'
# did not have species in many of these items
query = {'query': '''
SELECT TOP 1000 im.file_name, im.dataset, im.annotations.bbox, im.annotations.species
FROM images im
WHERE im.dataset = "{}" AND ARRAY_LENGTH(im.annotations.bbox) > 0
'''.format(dataset_name)}

options = {
    'enableCrossPartitionQuery': True
}

result_iterable = client.QueryItems(container_link, query, options, partition_key='idfg')
# if you want to restrict to one dataset, pass in partition_key=dataset

results = []
for item in iter(result_iterable):
    results.append(item)

print('Length of results:', len(results))

Length of results: 1000
CPU times: user 104 ms, sys: 12.7 ms, total: 117 ms
Wall time: 2.53 s


In [None]:
# Both bare expressions are echoed because the first cell sets ast_node_interactivity to 'all'.
len(results)
# Inspect one arbitrary item; index 77 is only valid when the query returned at least 78 items.
results[77]

## Download images and visualize labels

For large batches, download in parallel using `multiprocessing.pool.ThreadPool`.

In [None]:
sample_size = 2
# random.sample raises ValueError when asked for more items than are available,
# so never request more than the query returned.
sample_res = sample(results, min(sample_size, len(results)))

# Every box is rendered with the generic 'animal' category.
animal_category_id = annotation_constants.bbox_category_name_to_id['animal']

# storage account name -> BlockBlobService, so each account's service is created only once
blob_services = {}

for im in sample_res:
    dataset = im['dataset']
    storage_account = datasets_table[dataset]['storage_account']
    storage_container = datasets_table[dataset]['container']
    path_prefix = datasets_table[dataset]['path_prefix']

    if storage_account not in blob_services:
        print('Creating blob service')
        blob_services[storage_account] = BlockBlobService(account_name=storage_account,
                                                          account_key=blob_account_keys[storage_account])
        print('Created')
    blob_service = blob_services[storage_account]

    # Blob names always use '/' as the separator; os.path.join would insert '\\' on Windows.
    blob_name = posixpath.join(path_prefix, im['file_name'])

    stream = io.BytesIO()
    blob_service.get_blob_to_stream(storage_container, blob_name, stream)
    print('Downloaded')
    image = Image.open(stream)
    print('Opened')

    boxes = [box['bbox_rel'] for box in im['bbox']]
    classes = [animal_category_id] * len(boxes)

    # NOTE(review): presumably draws the boxes onto `image` in place - confirm in visualization_utils.
    visualization_utils.render_iMerit_boxes(boxes, classes, image)
    print('Visualized')

    # A bare `image` expression inside a loop body is never echoed (only top-level
    # expressions of a cell are), so the image has to be displayed explicitly.
    display(image)