In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'

# Load the autoreload extension and reload all modules before each execution,
# so that edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

We are using `azure-cosmos` version `4.0.0b5` or later, which has a very different API than older versions.

In [35]:
import json
import os
from collections import Counter

from tqdm import tqdm
from azure.cosmos.cosmos_client import CosmosClient
from azure.cosmos.partition_key import PartitionKey

# Useful queries

Example queries for the MegaDB's `sequences` table. MegaDB is a NoSQL database on Azure Cosmos DB.

Cosmos DB Python SDK (pre-release) documentation: https://azuresdkdocs.blob.core.windows.net/$web/python/azure-cosmos/4.0.0b5/index.html 

Subquery for nested objects: https://docs.microsoft.com/en-us/azure/cosmos-db/sql-query-subquery

## Connect to the Cosmos DB instance

`COSMOS_ENDPOINT` and `COSMOS_KEY` need to be environment variables. 

In [7]:
# Connection settings come from the environment; a missing variable raises KeyError.
url = os.environ['COSMOS_ENDPOINT']
key = os.environ['COSMOS_KEY']

# Client -> database -> one handle per container queried in this notebook.
client = CosmosClient(url, credential=key)
database = client.get_database_client('camera-trap')
container_datasets, container_sequences = (
    database.get_container_client(container_name)
    for container_name in ('datasets', 'sequences')
)

### Get the `datasets` table

which records the location and access levels of each dataset.

In [8]:
%%time

query = '''SELECT * FROM datasets d'''

result_iterable = container_datasets.query_items(query=query, enable_cross_partition_query=True)

datasets = {i['dataset_name']:{k: v for k, v in i.items() if not k.startswith('_')} for i in iter(result_iterable)}

print('Length of results:', len(datasets))

Length of results: 18
CPU times: user 15.7 ms, sys: 2.95 ms, total: 18.7 ms
Wall time: 425 ms


#### List the public datasets

In [9]:
# Names of the datasets whose `access` entry contains 'public'.
public_datasets = [
    entry['dataset_name']
    for entry in datasets.values()
    if 'public' in entry['access']
]
public_datasets

['caltech', 'wcs', 'nacti', 'bellevue_190602']

## Examples

In the examples, we limit the selection to a few entries using the `TOP` keyword. When using the DB to create datasets, remove the `TOP` keyword and its argument.

### What datasets are there that have sequences data?

In [13]:
%%time

result_iterable = container_sequences.query_items(
    query='''
SELECT DISTINCT seq.dataset
FROM sequences seq
''',
    enable_cross_partition_query=True
)

results = [item['dataset'] for item in iter(result_iterable)]

print('Length of results:', len(results))

Length of results: 3
CPU times: user 26.7 ms, sys: 4.1 ms, total: 30.8 ms
Wall time: 11.9 s


In [14]:
# Display the dataset names collected by the DISTINCT query.
results

['peaceparks_201908_humans', 'wps_190624', 'zsl_borneo']

### Image entries in a dataset with class "empty" - demonstrating JOIN
Can use `partition_key` for this query without needing joins

Refer to https://docs.microsoft.com/en-us/azure/cosmos-db/sql-query-join - query is constructed as follows (thanks to Chris Ritchie):
1. Iterate through each document (seq). If the query is not already scoped to a single partition, you can add a WHERE clause such as seq.dataset = 'zsl_borneo': `FROM sequences seq`
2. For each document (seq) expand each child element in the images array (im): `im IN seq.images`
3. Apply a cross product of the root of the item (seq) with each child element (im) that the second step flattened: `JOIN`
4. Project the requested fields of each child element (im): `SELECT`

In [15]:
%%time

dataset_name = 'zsl_borneo'

query = '''
SELECT TOP 10 im.file, im.class, seq.seq_id
FROM sequences seq JOIN im IN seq.images 
WHERE seq.dataset = "{}" 
    AND ARRAY_LENGTH(im.class) > 0 
    AND ARRAY_CONTAINS(im.class, "empty")
'''.format(dataset_name)

# WHERE ARRAY_LENGTH(im.class) > 0 AND ARRAY_CONTAINS(im.class, "empty")

result_iterable = container_sequences.query_items(query, enable_cross_partition_query=True)

results = [item for item in iter(result_iterable)]

print('Length of results:', len(results))

Length of results: 10
CPU times: user 16.6 ms, sys: 2.41 ms, total: 19 ms
Wall time: 323 ms


### All images with bounding box annotation

In [45]:
%%time

query = '''
SELECT TOP 10 im.bbox, im.file
FROM sequences seq JOIN im IN seq.images 
WHERE ARRAY_LENGTH(im.bbox) > 0
'''

result_iterable = container_sequences.query_items(query=query, partition_key='wps_190624')

results = [item for item in iter(result_iterable)]

print('Length of results:', len(results))

Length of results: 10
CPU times: user 5.19 ms, sys: 2.11 ms, total: 7.29 ms
Wall time: 61 ms


### All images with the specified species at the image level

In [26]:
%%time

species_requested = 'horse-tailed squirrel'

query ='''
SELECT TOP 10 im.class, im.file
FROM im IN sequences.images 
WHERE ARRAY_LENGTH(im.class) > 0 AND ARRAY_CONTAINS(im.class, "{}")
'''.format(species_requested)

result_iterable = container_sequences.query_items(query, enable_cross_partition_query=True)

results = [item for item in iter(result_iterable)]

print('Length of results:', len(results))

Length of results: 10
CPU times: user 10.8 ms, sys: 2.63 ms, total: 13.4 ms
Wall time: 177 ms


### Species count where the label is at the sequence level

In [27]:
%%time

query = '''
SELECT TOP 100 seq.class
FROM sequences seq
WHERE ARRAY_LENGTH(seq.class) > 0
'''

result_iterable = container_sequences.query_items(query, enable_cross_partition_query=True)

species = Counter()
for item in iter(result_iterable):
    res = item['class']
    species.update(res)

CPU times: user 16.5 ms, sys: 3.14 ms, total: 19.7 ms
Wall time: 283 ms


In [28]:
# Display the label counts tallied from the sequence-level query.
species

Counter({'human': 100})

### Species count where the label is at the image level

In [29]:
%%time

query = '''
SELECT TOP 100000 im.class
FROM im IN sequences.images 
WHERE ARRAY_LENGTH(im.class) > 0
'''

result_iterable = container_sequences.query_items(query, enable_cross_partition_query=True)

species = Counter()
for item in iter(result_iterable):
    res = item['class']
    species.update(res)

CPU times: user 5.17 s, sys: 573 ms, total: 5.75 s
Wall time: 1min 55s


We can increase the throughput of the database when running large queries to get responses faster.

In [30]:
# Display the label counts tallied from the image-level query.
species

Counter({'__label_unavailable': 88489,
         'red muntjac': 1528,
         'empty': 3473,
         'horse-tailed squirrel': 32,
         'greater mouse-deer': 2711,
         'malay porcupine': 116,
         'southern pig-tailed macaque': 667,
         'human': 810,
         'banded civet': 307,
         'yellow muntjac': 478,
         'thick-spined porcupine': 39,
         'spiny rat': 264,
         'great argus': 286,
         'bearded pig': 271,
         "low's squirrel": 22,
         'sambar deer': 33,
         'lesser mouse-deer': 135,
         'emerald dove': 10,
         'black-capped babbler': 5,
         'malay civet': 94,
         'long-tailed macaque': 23,
         'long-tailed porcupine': 74,
         'blue-headed pitta': 10,
         'yellow-throated marten': 2,
         "bulwer's pheasant": 50,
         'short-tailed mongoose': 40,
         'bornean banded pitta': 9,
         'bornean ground-babbler': 10,
         'banded linsang': 2,
         'plain treeshrew': 3,
    

### Total number of sequence entries in database

In [31]:
%%time

query = '''
SELECT VALUE COUNT(1)
FROM seq
'''

result_iterable = container_sequences.query_items(query, enable_cross_partition_query=True)

for seq_count in iter(result_iterable):
    print(seq_count)

146526
CPU times: user 13 ms, sys: 2.45 ms, total: 15.4 ms
Wall time: 175 ms


### Total number of images in all sequences

In [32]:
%%time

query = '''
SELECT VALUE COUNT(1)
FROM im IN sequences.images 
'''

result_iterable = container_sequences.query_items(query, enable_cross_partition_query=True)

for im_count in iter(result_iterable):
    print(im_count)

596773
CPU times: user 37.8 ms, sys: 5.76 ms, total: 43.6 ms
Wall time: 30.3 s


### List last inserted sequences based on insertion timestamp

In [33]:
%%time

query = '''
SELECT TOP 10 seq.dataset, seq._ts, seq.seq_id
FROM sequences seq
ORDER BY seq._ts DESC
'''

result_iterable = container_sequences.query_items(query, enable_cross_partition_query=True)

results = [item for item in iter(result_iterable)]

CPU times: user 12 ms, sys: 3.14 ms, total: 15.1 ms
Wall time: 176 ms


In [34]:
# Display the sequences returned by the ORDER BY `_ts` DESC query.
results

[{'dataset': 'zsl_borneo', '_ts': 1573253030, 'seq_id': '50869'},
 {'dataset': 'zsl_borneo', '_ts': 1573253030, 'seq_id': '50870'},
 {'dataset': 'zsl_borneo', '_ts': 1573253030, 'seq_id': '50871'},
 {'dataset': 'zsl_borneo', '_ts': 1573253030, 'seq_id': '50872'},
 {'dataset': 'zsl_borneo', '_ts': 1573253030, 'seq_id': '50873'},
 {'dataset': 'zsl_borneo', '_ts': 1573253030, 'seq_id': '50874'},
 {'dataset': 'zsl_borneo', '_ts': 1573253030, 'seq_id': '50875'},
 {'dataset': 'zsl_borneo', '_ts': 1573253030, 'seq_id': '50876'},
 {'dataset': 'zsl_borneo', '_ts': 1573253030, 'seq_id': '50877'},
 {'dataset': 'zsl_borneo', '_ts': 1573253030, 'seq_id': '50878'}]