In [1]:
from itertools import islice
import datacube
from odc import dscache

# Creating Database File

You can use the CLI tool `slurpy` to export a set of products to a file db. If you need more control over what goes into the cache, see below.

## Create new file db
Create new database, deleting any previous files that might have existed.

In [2]:
# File name of the on-disk dataset cache; later cells reuse `db_name`.
db_name = 'sample.db'
# truncate=True: start from a fresh database rather than appending to an old file.
cache = dscache.create_cache(db_name, truncate=True)

## Get some datasets from Datacube

We limit the query to 200 datasets for example purposes. Note that there is an outstanding [issue (#542)](https://github.com/opendatacube/datacube-core/issues/542) with `find_datasets_lazy`: it is not actually "lazy", because the whole SQL query is processed at once.

In [3]:
# Datacube handle configured for the 's2' environment.
dc = datacube.Datacube(env='s2')
# Query at most 200 datasets of a single product; the result is consumed as a stream below.
dss = dc.find_datasets_lazy(product='s2a_nrt_granule', limit=200)

## Write them to file db

The dataset cache provides a convenience method, `.tee`, that accepts a dataset stream as input and yields the same stream as output, while also saving the datasets to the file db.

In [4]:
# Rebinds `dss` to the tee'd stream. NOTE(review): re-running this cell on its own
# would feed the already-wrapped stream back into `.tee`; re-run the query cell first.
dss = cache.tee(dss)

Then you can just iterate over all datasets doing whatever other thing you needed to do.

In [5]:
%%time
for i, ds in enumerate(dss):
    if (i % 10) == 0:
        print('.', end='', flush=True)
print()
print('done')   

....................
done
CPU times: user 169 ms, sys: 5.22 ms, total: 174 ms
Wall time: 200 ms


**NOTE:**

`.tee` assumes that all datasets will be consumed. Internally it breaks up the dataset stream into transactions, and it's not entirely clear what happens if you stop halfway through a transaction and never continue. Eventually the transaction will be garbage collected and the data written to disk, but in the meantime any writes will be blocked. So if you do exit early without consuming the whole stream, you should probably call `del dss` as soon as practical.

## Alternative way

In [6]:
%%time
# Same kind of query as before, this time for the 's2b_nrt_granule' product.
dss = dc.find_datasets_lazy(product='s2b_nrt_granule', limit=200)
cache.bulk_save(dss) # blocks until all are written (in one single transaction)

CPU times: user 134 ms, sys: 597 µs, total: 135 ms
Wall time: 172 ms


In [7]:
# Summary of what the cache holds now: dataset count, products and metadata types.
cache.count, cache.products, cache.metadata

(400,
 {'s2a_nrt_granule': DatasetType(name='s2a_nrt_granule', id_=3),
  's2b_nrt_granule': DatasetType(name='s2b_nrt_granule', id_=4)},
 {'eo': MetadataType(name='eo', id_=1)})

In [8]:
# Done writing: close the cache handle (the file is processed by `dstiler` and reopened below).
cache.close()

# Bin datasets into Albers Tiles

There is a CLI tool, `dstiler`, that goes through all datasets in the file and bins them into various tiling regimes. The default regime is 100k side Albers tiles (same as on NCI), but there are also "native" (for Landsat scenes) and "web" regimes.

In [9]:
!dstiler sample.db

Total bins: 421
[?25lSaving  [####################################]  100%[?25h


# Reading from file db

In [10]:
# Reopen the same file via `open_ro` (read-only access, judging by the name) and show its summary.
cache_ro = dscache.open_ro(db_name)
cache_ro.count, cache_ro.metadata, cache_ro.products

(400,
 {'eo': MetadataType(name='eo', id_=None)},
 {'s2a_nrt_granule': DatasetType(name='s2a_nrt_granule', id_=None),
  's2b_nrt_granule': DatasetType(name='s2b_nrt_granule', id_=None)})

## Stream datasets into RAM: `.get_all()`

In [11]:
# Take only the first 10 datasets from the stream instead of reading all of them.
dss = list(islice(cache_ro.get_all(), 10))
first = dss[0]
type(first), type(first.type), type(first.type.metadata_type)

(datacube.model.Dataset,
 datacube.model.DatasetType,
 datacube.model.MetadataType)

## Access individual dataset by UUID

In [12]:
# Look up a single dataset by its UUID, given here as a string.
cache_ro.get('0146ea8e-8462-4fca-a880-7f4a311441bf')

Dataset <id=0146ea8e-8462-4fca-a880-7f4a311441bf type=s2a_nrt_granule location=s3://dea-public-data/L2/sentinel-2-nrt/S2MSIARD/2019-06-03/S2A_OPER_MSI_ARD_TL_EPAE_20190603T015756_A020605_T55KEB_N02.07/ARD-METADATA.yaml>

## Working with Groups

In [13]:
from odc.dscache.tools.tiling import parse_group_name

# `groups` is indexable; each entry is a (group name, dataset count) pair.
groups = cache_ro.groups()
len(groups), groups[:3]

(421, [('albers/-01_-23', 2), ('albers/-01_-24', 4), ('albers/-01_-25', 4)])

In [14]:
# Pick the second group as the running example for the cells below.
group_name, count = groups[1]
group_name, parse_group_name(group_name)

('albers/-01_-24', ((-1, -24), 'albers'))

### Read all UUIDs for a given group

In [15]:
# Returns only the UUIDs of the datasets in the group, not the dataset documents.
cache_ro.get_group(group_name)

[UUID('3c0b41f9-67ca-4962-83a5-68bc103e960c'),
 UUID('6d07c802-18b5-4b84-ac19-80fea9b60866'),
 UUID('9be56b1f-bc9c-4346-85ca-f4cf6fe4985a'),
 UUID('d357d1c0-7531-44ac-85b3-a5cc1e5b9eb5')]

### Load Dataset documents for a given group

In [16]:
# Collect the group's datasets into a list so the whole result is displayed.
list(cache_ro.stream_group(group_name))

[Dataset <id=3c0b41f9-67ca-4962-83a5-68bc103e960c type=s2b_nrt_granule location=s3://dea-public-data/L2/sentinel-2-nrt/S2MSIARD/2019-06-03/S2B_OPER_MSI_ARD_TL_EPAE_20190603T040916_A011697_T52KHA_N02.07/ARD-METADATA.yaml>,
 Dataset <id=6d07c802-18b5-4b84-ac19-80fea9b60866 type=s2b_nrt_granule location=s3://dea-public-data/L2/sentinel-2-nrt/S2MSIARD/2019-06-03/S2B_OPER_MSI_ARD_TL_EPAE_20190603T040916_A011697_T52KGA_N02.07/ARD-METADATA.yaml>,
 Dataset <id=9be56b1f-bc9c-4346-85ca-f4cf6fe4985a type=s2b_nrt_granule location=s3://dea-public-data/L2/sentinel-2-nrt/S2MSIARD/2019-06-03/S2B_OPER_MSI_ARD_TL_EPAE_20190603T040916_A011697_T52KHB_N02.07/ARD-METADATA.yaml>,
 Dataset <id=d357d1c0-7531-44ac-85b3-a5cc1e5b9eb5 type=s2b_nrt_granule location=s3://dea-public-data/L2/sentinel-2-nrt/S2MSIARD/2019-06-03/S2B_OPER_MSI_ARD_TL_EPAE_20190603T040916_A011697_T52KGB_N02.07/ARD-METADATA.yaml>]

### Interfacing with Datacube's `GridWorkflow.load(..)`

There is a helper class that can construct `datacube.api.grid_workflow.Tile` objects.

In [17]:
from odc.dscache.tools import DcTileExtract

# Helper that builds Tile objects from the datasets stored in the cache.
tiles = DcTileExtract(cache_ro)

# Only the tile index (first element of the parsed group name) is needed here.
# Index the result instead of unpacking into `_`: assigning to `_` would shadow
# IPython's automatic "last output" variable for the rest of the session.
tile_id = parse_group_name(group_name)[0]

tile = tiles(tile_id)

tile_id, tile

((-1, -24), Tile<sources=<xarray.DataArray (time: 4)>
 array([(Dataset <id=9be56b1f-bc9c-4346-85ca-f4cf6fe4985a type=s2b_nrt_granule location=s3://dea-public-data/L2/sentinel-2-nrt/S2MSIARD/2019-06-03/S2B_OPER_MSI_ARD_TL_EPAE_20190603T040916_A011697_T52KHB_N02.07/ARD-METADATA.yaml>,),
        (Dataset <id=d357d1c0-7531-44ac-85b3-a5cc1e5b9eb5 type=s2b_nrt_granule location=s3://dea-public-data/L2/sentinel-2-nrt/S2MSIARD/2019-06-03/S2B_OPER_MSI_ARD_TL_EPAE_20190603T040916_A011697_T52KGB_N02.07/ARD-METADATA.yaml>,),
        (Dataset <id=3c0b41f9-67ca-4962-83a5-68bc103e960c type=s2b_nrt_granule location=s3://dea-public-data/L2/sentinel-2-nrt/S2MSIARD/2019-06-03/S2B_OPER_MSI_ARD_TL_EPAE_20190603T040916_A011697_T52KHA_N02.07/ARD-METADATA.yaml>,),
        (Dataset <id=6d07c802-18b5-4b84-ac19-80fea9b60866 type=s2b_nrt_granule location=s3://dea-public-data/L2/sentinel-2-nrt/S2MSIARD/2019-06-03/S2B_OPER_MSI_ARD_TL_EPAE_20190603T040916_A011697_T52KGA_N02.07/ARD-METADATA.yaml>,)],
       dtype=obje

You can then pass the `tile` object to `datacube.GridWorkflow.load(..)`.

-----------------------------------------------