In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the result of every expression in a cell rather than only the last one.
# Later cells rely on this to render values without an explicit display call.
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'

# Load the autoreload extension; mode 2 reloads imported modules before each
# execution, so edits to local .py files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# Root of the local CameraTraps repo checkout; it provides the `visualization`
# and `data_management` packages imported in the next cell.
# Set the CAMERATRAPS_REPO environment variable to point at your own checkout;
# the original author's path is kept as the default for backward compatibility.
CAMERATRAPS_REPO = os.environ.get(
    'CAMERATRAPS_REPO',
    '/Users/siyuyang/Source/repos/GitHub_MSFT/CameraTraps'
)

# Guard against duplicates so re-running this cell leaves sys.path unchanged.
if CAMERATRAPS_REPO not in sys.path:
    sys.path.append(CAMERATRAPS_REPO)

In [3]:
import io
import json
import os
import posixpath
from collections import Counter
from random import sample

from azure.cosmos.cosmos_client import CosmosClient
from azure.storage.blob import BlockBlobService
from IPython.display import display
from PIL import Image
from tqdm import tqdm

from data_management.annotations import annotation_constants
from visualization import visualization_utils, visualize_megadb

# Query for data

This notebook demonstrates the workflow to compile desired sequences of images by querying metadata and downloading the images stored in blob storage.

See the programs that Chris prepared here (internal): https://celads.visualstudio.com/CELA%20Data%20Science%20And%20Analytics/_git/DSnA.CameraTrap?path=%2FPython%2Fclientpr.py

The `COSMOS_ENDPOINT` and `COSMOS_KEY` environment variables must be set before running the cells below.

In [6]:
# Create the Cosmos DB client from credentials held in the environment.
# A missing variable raises KeyError here, before any connection is attempted.
url, key = (os.environ[var_name] for var_name in ('COSMOS_ENDPOINT', 'COSMOS_KEY'))
client = CosmosClient(url, credential=key)

In [7]:
# Handles to the 'camera-trap' database and the two containers queried below.
database = client.get_database_client('camera-trap')
container_datasets, container_sequences = (
    database.get_container_client(container_name)
    for container_name in ('datasets', 'sequences')
)

## Store the `datasets` table
The `datasets` table records the location and access levels of each dataset.

In [8]:
%%time

query = '''SELECT * FROM datasets d'''

result_iterable = container_datasets.query_items(query=query, enable_cross_partition_query=True)

datasets = {i['dataset_name']:{k: v for k, v in i.items() if not k.startswith('_')} for i in iter(result_iterable)}

print('Length of results:', len(datasets))

Length of results: 18
CPU times: user 15 ms, sys: 3.1 ms, total: 18.1 ms
Wall time: 403 ms


## Select image entries

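Example: up to 1000 results from a given dataset (chosen via the partition key), one for each image that has bounding boxes. Each result is the full sequence containing that image, which gives us the file names and the dataset we need to plot the labels.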

In [9]:
%%time

result_iterable = container_sequences.query_items(
    query='''
SELECT TOP @top_n seq
FROM sequences seq JOIN im IN seq.images 
WHERE ARRAY_LENGTH(im.bbox) > 0
''',
    parameters=[
        dict(name='@top_n', value=1000)
    ],
    partition_key='wps_190624'
)

CPU times: user 87 µs, sys: 1 µs, total: 88 µs
Wall time: 92.3 µs


In [10]:
# Materialize the query results, keeping only each row's 'seq' payload with
# its underscore-prefixed fields removed.
results = [
    {field: value for field, value in row['seq'].items() if not field.startswith('_')}
    for row in result_iterable
]

In [11]:
# Show the last returned sequence to inspect the structure of a record.
results[-1]

{'images': [{'file': 'ec2906a8-6e78-4351-9c7f-330b9a60f6ec.jpg',
   'image_id': 'ec2906a8-6e78-4351-9c7f-330b9a60f6ec',
   'bbox': [{'category': 'animal', 'bbox': [0.00125, 0.5612, 0.2025, 0.2113]},
    {'category': 'animal', 'bbox': [0.4989, 0.5437, 0.08856, 0.1343]},
    {'category': 'animal', 'bbox': [0.5021, 0.5223, 0.08235, 0.07782]},
    {'category': 'animal', 'bbox': [0.4405, 0.5373, 0.07755, 0.07768]},
    {'category': 'animal', 'bbox': [0.3837, 0.505, 0.0587, 0.07389]},
    {'category': 'animal', 'bbox': [0.3887, 0.5416, 0.05981, 0.1566]},
    {'category': 'animal', 'bbox': [0.4605, 0.549, 0.04557, 0.1481]},
    {'category': 'animal', 'bbox': [0.6711, 0.456, 0.03345, 0.1085]}],
   'class': ['__label_unavailable']}],
 'seq_id': 'dummy_3b960f28c6e54010903d3d02d1dfe695',
 'dataset': 'wps_190624',
 'class': ['human'],
 'id': 'd852d02a-cffd-4a66-bd80-4bfe2c9971ff'}

## Download sample images and visualize labels

For large batches, download in parallel using `multiprocessing.pool.ThreadPool`.

In [None]:
sample_size = 2
# random.sample raises ValueError when asked for more items than are available,
# so cap the request at the number of results actually returned by the query.
sample_res = sample(results, min(sample_size, len(results)))

for seq in sample_res:
    # Storage details depend only on the sequence's dataset, so resolve them
    # once per sequence instead of once per image.
    dataset = seq['dataset']
    storage_container = datasets[dataset]['container']
    path_prefix = datasets[dataset]['path_prefix']
    blob_service = visualize_megadb.get_blob_service(datasets, dataset)

    for im in seq['images']:
        # Only images that have at least one bounding box are worth plotting.
        if not im.get('bbox'):
            continue

        # Blob names always use '/' separators, whatever the local OS is.
        blob_name = posixpath.join(path_prefix, im['file'])

        stream = io.BytesIO()
        # Bind the return value to a throwaway name; presumably this keeps it from
        # being echoed, since ast_node_interactivity is set to 'all' in the first cell.
        _ = blob_service.get_blob_to_stream(storage_container, blob_name, stream)
        image = Image.open(stream)

        visualization_utils.render_megadb_bounding_boxes(im['bbox'], image)
        print('from dataset {}'.format(dataset))
        # Render explicitly so the image shows regardless of the
        # ast_node_interactivity setting.
        display(image)