In [1]:
import uuid
from uuid import UUID
from typing import Sequence

# Arbitrarily chosen, fixed UUID used as the ODC namespace for uuid5 hashing
ODC_NS = UUID('6f34c6f4-13d6-43c0-8e4e-42b6c13203af')

def odc_uuid(algorithm: str,
             algorithm_version: str,
             sources: Sequence[UUID],
             deployment_id: str = '',
             **other_tags
            ):
    """ Generate deterministic UUID for a derived Dataset

    The UUID is a name-based ``uuid5`` in the ``ODC_NS`` namespace. The name
    is built, one item per line, from the algorithm, its version, the
    deployment id, the sorted ``key=value`` tags and the sorted source UUIDs,
    with every item lower-cased. Consequently the result does not depend on
    the order of ``sources`` or of the extra tags, nor on letter case.

    :param algorithm: Name of the algorithm
    :param algorithm_version: Version string of the algorithm
    :param sources: Sequence of input Dataset UUIDs
    :param deployment_id: Some sort of identifier for installation that performs
                          the run, for example Docker image hash, or dea module version on NCI.
    :param **other_tags: Any other identifiers necessary to uniquely identify dataset
    :returns: ``uuid.UUID`` computed with ``uuid.uuid5``
    """
    # Render every extra tag as "key=value"; values go through str()
    tag_lines = []
    for name, value in other_tags.items():
        tag_lines.append('{}={}'.format(name, str(value)))

    # Fixed header first, then tags in sorted order ...
    lines = [str(algorithm), str(algorithm_version), str(deployment_id)]
    lines.extend(sorted(tag_lines))
    # ... then the sources: sorted as given, and only then converted to text
    lines.extend(str(src) for src in sorted(sources))

    # Lower-case each item individually before joining with newlines
    payload = '\n'.join(map(str.lower, lines))
    return uuid.uuid5(ODC_NS, payload)

In [2]:
# Arbitrary example source dataset IDs, used by the cells below
srcs = [UUID(hex_id) for hex_id in (
    'e7b2071a-ad2b-47b6-af95-456d851d126b',
    'd7ea15de-30ce-495b-8fec-aea63f5692a1',
    '34f24ff8-6f55-4e10-a15b-a7064ffd6fff',
    '108c013a-ac60-420a-b1aa-23d1d8672572',
    '8ab5ba88-7fa8-4e35-855f-9160eb18bc49',
)]

### Same sources, different algorithms

In [3]:
# Identical sources, two different algorithm/version pairs
tuple(odc_uuid(name, version, srcs)
      for name, version in [('wofs', '1.3.1'), ('fc', '2.3')])

(UUID('9ae617f5-1c2b-5b2f-9e58-20f724b877c0'),
 UUID('5cea3404-219a-5f68-bd77-3ea55e88f25c'))

### Re-ordered sources should still produce the same UUIDs as the cell above

Matching is also case-insensitive: every component is lower-cased before hashing.

In [4]:
# Sources in reverse order and algorithm names in mixed case
tuple(odc_uuid(name, version, list(reversed(srcs)))
      for name, version in [('WOfS', '1.3.1'), ('FC', '2.3')])

(UUID('9ae617f5-1c2b-5b2f-9e58-20f724b877c0'),
 UUID('5cea3404-219a-5f68-bd77-3ea55e88f25c'))

### Use fewer sources -- different UUIDs

In [5]:
# Only the first two sources are used as lineage
tuple(odc_uuid(name, version, srcs[:2])
      for name, version in [('wofs', '1.3.1'), ('fc', '2.3')])

(UUID('e8e9290c-7db5-5a3c-b599-a5346a32dbba'),
 UUID('6d911349-a359-5cb7-bf8d-b1c6f2cc57d4'))

### Ingest needs extra tags

Ingest splits one dataset into many, so it needs extra parameters beyond lineage and algorithm to fully disambiguate the output datasets.

In [6]:
# Same single source and algorithm; only the extra `cell` tag differs
tuple(odc_uuid('ingest', '1.0.0', srcs[:1], cell=cell)
      for cell in [(-3, 4), (-3, 5)])

(UUID('00bfa5d0-359d-5eeb-be96-160aa7157f8c'),
 UUID('fe7d1f80-c9e9-5066-8900-07bdcf98f9d2'))