# Creating Datasets for Faro Unit Testing

This notebook steps through the process of extracting a small set of data to use for `faro` unit testing.

In [None]:
# Which version of the Stack am I using?
# Shell out to `eups` and keep only the `lsst_distrib` line, so the version is
# recorded alongside the notebook output for provenance.
!eups list -s | grep lsst_distrib

In [None]:
import glob
import os

import lsst.daf.butler as dafButler

## Preliminaries

The steps below use `ci_hsc_gen3`:

(1) Set up [testdata_ci_hsc](https://github.com/lsst/testdata_ci_hsc) following README instructions.

(2) Set up [ci_hsc_gen3](https://github.com/lsst/ci_hsc_gen3) following README instructions.

(3) Run `faro`. First, set up the package.

```
cd repos/metric-pipeline-tasks
setup -k -r .
```

Run the single-band matched catalogs and metrics:

```
pipetask run -j 12 -b "$CI_HSC_GEN3_DIR"/DATA/butler.yaml --register-dataset-types -p pipelines/metrics_pipeline_matched.yaml --output kbechtol/matched -i HSC/runs/ci_hsc
```

Run the multi-band matched catalogs and metrics:

```
pipetask run -j 12 -b "$CI_HSC_GEN3_DIR"/DATA/butler.yaml --register-dataset-types -p pipelines/metrics_pipeline_matched_multi.yaml --output kbechtol/matched_multi -i HSC/runs/ci_hsc
```

## Extract Measurements

Access the `.yaml` files containing the metric results.

In [None]:
# Butler repository of a local ci_hsc_gen3 checkout.
# Prefer $CI_HSC_GEN3_DIR (the variable used by the `pipetask run` commands in
# the Preliminaries section) so the notebook is not tied to one user's home
# directory; fall back to the original local path when it is unset or empty.
ci_hsc_gen3_dir = os.environ.get('CI_HSC_GEN3_DIR') or '/home/kbechtol/DATA/ci_hsc_gen3'
# The trailing '' keeps the trailing path separator the original value had.
repo = os.path.join(ci_hsc_gen3_dir, 'DATA', '')

In [None]:
# Open the butler described by the repository's butler.yaml.
config = os.path.join(repo, 'butler.yaml')
try:
    butler = dafButler.Butler(config=config)
except ValueError as e:
    # Every later cell needs `butler`. Show the message as before, then stop
    # here rather than continuing and failing later with an unrelated NameError.
    print(e)
    raise

In [None]:
# Keep a handle on the butler's registry; the cells below query it for
# collections and dataset types.
registry = butler.registry

In [None]:
# Show every collection returned by the registry, one per line.
for collection in registry.queryCollections():
    print(collection)

In [None]:
# Show every dataset type returned by the registry, one per line.
for dataset_type in registry.queryDatasetTypes():
    print(dataset_type)

In [None]:
# Run directory, relative to the repository root, whose YAML outputs are listed.
run = 'kbechtol/matched/20210127T041304Z'
# Build the location from `repo` instead of repeating the absolute path.
path = os.path.join(repo, run)
# Skymap-plus-run suffix embedded in each file name; it is derived from `run`
# so the timestamp is written in one place only, and stripped for readability.
run_suffix = '_discrete_ci_hsc_' + run.replace('/', '_')

yaml_files = glob.glob(path + "/**/*.yaml", recursive=True)
for yaml_file in yaml_files:
    # Skip any file whose path mentions 'metadata'.
    if 'metadata' in yaml_file:
        continue
    print(os.path.basename(yaml_file.replace(run_suffix, '')))

In [None]:
def getMeasurementFilenames(butler, collections, tract):
    """Pair each stored metric measurement file with an output file name.

    Every registered dataset type whose storage class is named
    ``'MetricValue'`` is queried for ``tract``. Dataset types with no
    matching dataset are skipped; otherwise only the first dataset returned
    is used: it is read to obtain its ``metric_name`` and its URI is looked up.

    Parameters
    ----------
    butler
        Object providing ``registry``, ``get`` and ``getURI``.
    collections
        Collection(s) to search; passed through unchanged to every query.
    tract
        Tract identifier used, together with the skymap name, to constrain
        the dataset query.

    Returns
    -------
    list of tuple
        ``(path, outfile)`` pairs. ``path`` is the ``path`` attribute of the
        dataset URI. ``outfile`` is
        ``<metric_name>_expected<suffix><extension>``, where ``<suffix>`` is
        the piece of the file's base name that follows ``'HSC'`` and precedes
        ``'_discrete'``.

    Notes
    -----
    The original author's note read "Need to filter on tract / patch"; the
    query here constrains on tract and skymap only.
    """
    registry = butler.registry
    skymap_refs = list(registry.queryDatasets('skyMap', collections=collections, findFirst=True))
    skymap = skymap_refs[0].dataId['skymap']

    results = []
    for dataset_type in registry.queryDatasetTypes():
        # Only metric measurements are of interest.
        if dataset_type.storageClass.name != 'MetricValue':
            continue

        data_id = {'tract': tract, 'skymap': skymap}
        refs = list(registry.queryDatasets(dataset_type.name, dataId=data_id, collections=collections))
        if not refs:
            continue

        first_ref = refs[0]
        measurement = butler.get(first_ref, collections=collections)
        uri = butler.getURI(dataset_type.name, first_ref.dataId, collections=collections)

        metric_name = measurement.metric_name
        # Base name up to '_discrete', then whatever follows 'HSC' in it.
        stem = os.path.basename(uri.path).split('_discrete')[0]
        suffix = stem.split('HSC')[1]
        extension = os.path.splitext(uri.path)[1]

        outfile = ''.join([str(metric_name), '_expected', suffix, extension])
        results.append((uri.path, outfile))

    return results

In [None]:
# Output collection of the single-band `pipetask run` shown in Preliminaries.
collections = 'kbechtol/matched'
# The final argument is the tract to query (tract 0).
metric_results_single_band = getMeasurementFilenames(butler, collections, 0)

In [None]:
# Inspect the first (path, outfile) pair as a sanity check.
metric_results_single_band[0]

In [None]:
# Output collection of the multi-band `pipetask run` shown in Preliminaries.
# Note: this rebinds the notebook-level `collections` name.
collections = 'kbechtol/matched_multi'
# The final argument is the tract to query (tract 0).
metric_results_multi_band = getMeasurementFilenames(butler, collections, 0)

## Extract Matched Catalogs

In [None]:
def getMatchedCatalogFilenames(butler, collections, datasettype, tract):
    """Locate a matched-catalog file and derive a shortened output file name.

    The registry is queried for ``datasettype`` datasets in ``tract`` and the
    first dataset returned is used.

    Parameters
    ----------
    butler
        Object providing ``registry`` and ``getURI``.
    collections
        Collection(s) to search; passed through unchanged to every query.
    datasettype
        Name of the dataset type to look up.
    tract
        Tract identifier used, together with the skymap name, to constrain
        the dataset query.

    Returns
    -------
    tuple
        ``(path, outfile)``: ``path`` is the ``path`` attribute of the dataset
        URI and ``outfile`` is the file's base name truncated at
        ``'_discrete'`` with the original extension re-attached.

    Raises
    ------
    IndexError
        If no ``skyMap`` dataset is found, or if no ``datasettype`` dataset
        matches ``tract``. The latter case carries a descriptive message.
    """
    registry = butler.registry
    skymap = list(registry.queryDatasets('skyMap', collections=collections, findFirst=True))[0].dataId['skymap']
    dataid = {'tract': tract, 'skymap': skymap}
    refs = list(registry.queryDatasets(datasettype,
                                       dataId=dataid,
                                       collections=collections))
    if not refs:
        # Same exception type as the bare refs[0] lookup would raise, but
        # with enough context to see which query came back empty.
        raise IndexError('No %r datasets found for tract %r in collections %r'
                         % (datasettype, tract, collections))
    uri = butler.getURI(datasettype, refs[0].dataId, collections=collections)
    # Drop the skymap/run suffix that starts at '_discrete'; keep the extension.
    outfile = os.path.basename(uri.path).split('_discrete')[0] + os.path.splitext(uri.path)[1]
    return (uri.path, outfile)

In [None]:
# Output collection of the single-band `pipetask run` shown in Preliminaries.
# Note: this rebinds the notebook-level `collections` name.
collections = 'kbechtol/matched'
# Look up the 'matchedCatalogTract' dataset for tract 0.
matched_catalog_single_band = getMatchedCatalogFilenames(butler, collections, 'matchedCatalogTract', 0)

In [None]:
# Output collection of the multi-band `pipetask run` shown in Preliminaries.
# Note: this rebinds the notebook-level `collections` name.
collections = 'kbechtol/matched_multi'
# Look up the 'matchedCatalogMulti' dataset for tract 0.
matched_catalog_multi_band = getMatchedCatalogFilenames(butler, collections, 'matchedCatalogMulti', 0)