# Creating Datasets for Faro Unit Testing

This notebook steps through the process of extracting a small set of data to use for `faro` unit testing.

In [None]:
# Which version of the Stack am I using?
!eups list -s | grep lsst_distrib

In [None]:
import glob
import os

import lsst.daf.butler as dafButler

## Preliminaries

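This notebook uses the [ci_hsc_gen3](https://github.com/lsst/ci_hsc_gen3) repository as its data source. To prepare it: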

(1) Set up [testdata_ci_hsc](https://github.com/lsst/testdata_ci_hsc) following README instructions.

(2) Set up [ci_hsc_gen3](https://github.com/lsst/ci_hsc_gen3) following README instructions.

(3) Run `faro`. First, set up the package.

```
cd repos/metric-pipeline-tasks
setup -k -r .
```

Run the single-band matched catalogs and metrics:

```
pipetask run -j 12 -b "$CI_HSC_GEN3_DIR"/DATA/butler.yaml --register-dataset-types -p pipelines/metrics_pipeline_matched.yaml --output kbechtol/matched -i HSC/runs/ci_hsc
```

Run the multi-band matched catalogs and metrics:

```
pipetask run -j 12 -b "$CI_HSC_GEN3_DIR"/DATA/butler.yaml --register-dataset-types -p pipelines/metrics_pipeline_matched_multi.yaml --output kbechtol/matched_multi -i HSC/runs/ci_hsc
```

## Extract Measurements

Access the `.yaml` files containing the metric results.

In [None]:
# This is a local version of ci_hsc_gen3
repo = '/home/kbechtol/DATA/ci_hsc_gen3/DATA/'

In [None]:
# Alternative repository paths (disabled):
#repo = '/project/hsc/gen3repo/rc2v21_0_0_rc1_ssw48/'
#repo = '/project/krughoff/repos/ci_hsc_gen3/DATA/'

# Path to the butler configuration file inside the repository.
config = os.path.join(repo, 'butler.yaml')

# Create the butler. A ValueError is printed instead of raised.
# NOTE: in that case `butler` is left undefined for the cells that follow.
try:
    butler = dafButler.Butler(config=config)
except ValueError as e:
    print(e)

In [None]:
butler

In [None]:
registry = butler.registry

In [None]:
for x in registry.queryCollections():
    print(x)

In [None]:
for x in registry.queryDatasetTypes(): 
    print(x)

In [None]:
# Timestamped run directory under the `kbechtol/matched` output collection.
path = '/home/kbechtol/DATA/ci_hsc_gen3/DATA/kbechtol/matched/20210127T041304Z'
# Run-specific suffix that is stripped from each file name before printing.
run_suffix = '_discrete_ci_hsc_kbechtol_matched_20210127T041304Z'

# Recursively collect every YAML file below the run directory.
yaml_files = glob.glob(path + "/**/*.yaml", recursive=True)
for file in yaml_files:
    # Files with 'metadata' anywhere in their path are skipped.
    if 'metadata' not in file:
        new_name = file.replace(run_suffix, '')
        print(os.path.basename(new_name))

In [None]:
help(butler)

In [None]:
def getMeasurementFilenames(butler, collections):
    """Pair each metric's stored file with a filename for the test data.

    Every registered dataset type whose storage class is named
    ``MetricValue`` is examined. Types with no datasets in ``collections``
    are skipped. For the others, the first dataset returned by the registry
    is loaded, its file path is printed, and an output filename is built as
    ``<metric_name>_expected<tag><ext>``. Here ``<tag>`` is taken from the
    file's base name: the text before ``'_discrete'``, then the piece
    following the first ``'HSC'``. ``<ext>`` is the file extension.

    Parameters
    ----------
    butler
        Object providing ``registry``, ``get`` and ``getURI``.
    collections
        Collection(s) forwarded to the registry and butler calls.

    Returns
    -------
    list of tuple
        One ``(path, outfile)`` pair per metric dataset type that has data.
    """
    reg = butler.registry
    pairs = []
    for dataset_type in reg.queryDatasetTypes():
        if dataset_type.storageClass.name != 'MetricValue':
            continue

        found = list(reg.queryDatasets(dataset_type.name, collections=collections))
        if not found:
            continue

        # Only the first dataset of each metric type is used.
        first = found[0]
        measurement = butler.get(first, collections=collections)
        location = butler.getURI(dataset_type.name, first.dataId, collections=collections)
        print(location.path)

        stem = os.path.basename(location.path).split('_discrete')[0]
        tag = stem.split('HSC')[1]
        extension = os.path.splitext(location.path)[1]
        outfile = str(measurement.metric_name) + '_expected' + tag + extension
        print(outfile)
        print('\n')

        pairs.append((location.path, outfile))

    return pairs

In [None]:
collections = 'kbechtol/matched'
metric_results_single_band = getMeasurementFilenames(butler, collections)

In [None]:
# Path of the first single-band metric result file. `results` is not
# defined at this point, so index the list built in the previous cell.
metric_results_single_band[0][0]

In [None]:
collections = 'kbechtol/matched_multi'
metric_results_multi_band = getMeasurementFilenames(butler, collections)

In [None]:
collections = 'kbechtol/matched'

# Walk every dataset type with the 'MetricValue' storage class and show, for
# the first dataset found in `collections`, the metric name, the stored file
# path and the filename that would be used for the unit-test data.
for x in registry.queryDatasetTypes():
    if x.storageClass.name != 'MetricValue':
        continue

    print(x.name, x.dimensions, 'patch' in x.dimensions)
    refs = list(registry.queryDatasets(x.name, collections=collections))
    print(len(refs))
    if not refs:
        # Nothing of this type in the collection: indexing refs[0] below
        # would raise IndexError, so move on to the next dataset type.
        print('\n')
        continue

    measurement = butler.get(refs[0], collections=collections)
    print(measurement.metric_name)
    uri = butler.getURI(x.name, refs[0].dataId, collections=collections)
    print(uri.path)
    # <metric_name>_expected<piece of the base name after 'HSC'><extension>
    outfile = '%s%s%s%s'%(measurement.metric_name,
                          '_expected',
                          os.path.basename(uri.path).split('_discrete')[0].split('HSC')[1],
                          os.path.splitext(uri.path)[1])
    print(outfile)
    print('\n')

In [None]:
list(registry.queryDatasets('metricvalue_validate_drp_AB1_design', collections=collections))

In [None]:
# Scratch inspection of the values left over from the loop in a previous
# cell: `measurement`, `x` and `uri` are not defined here, so that loop must
# have been run first.
#dir(uri)
#measurement.metric_name
print(measurement.metric_name)
print(x.name)
# Metric name followed by the piece of the file's base name after 'HSC'
# (with everything from '_discrete' onwards removed).
'%s%s'%(measurement.metric_name, os.path.basename(uri.path).split('_discrete')[0].split('HSC')[1])

In [None]:
x.storageClass

## Extract Matched Catalogs

In [None]:
def getMatchedCatalogFilenames(butler, collections, datasettype, tract):
    """Locate a matched-catalog file and derive its test-data filename.

    The skymap name is read from the first ``skyMap`` dataset found in
    ``collections``. The registry is then queried for ``datasettype`` with
    that skymap and the given ``tract``, and the first match is used.

    Parameters
    ----------
    butler
        Object providing ``registry`` and ``getURI``. All registry queries go
        through ``butler.registry`` rather than a notebook-global ``registry``.
    collections
        Collection(s) forwarded to the registry and butler calls.
    datasettype
        Name of the dataset type to look up.
    tract
        Tract value used in the data ID of the query.

    Returns
    -------
    tuple
        ``(path, outfile)``: the stored file path and the output filename,
        which is the file's base name truncated at ``'_discrete'`` plus the
        original extension.

    Raises
    ------
    IndexError
        If no ``skyMap`` dataset, or no dataset of ``datasettype`` for the
        requested tract, is found in ``collections``.
    """
    registry = butler.registry

    skymap_refs = list(registry.queryDatasets('skyMap', collections=collections, findFirst=True))
    if not skymap_refs:
        raise IndexError("no 'skyMap' dataset found in collections %r" % (collections,))
    skymap = skymap_refs[0].dataId['skymap']

    dataid = {'tract': tract, 'skymap': skymap}
    refs = list(registry.queryDatasets(datasettype,
                                       dataId=dataid,
                                       collections=collections))
    if not refs:
        raise IndexError("no %r dataset found for %r in collections %r"
                         % (datasettype, dataid, collections))

    uri = butler.getURI(datasettype, refs[0].dataId, collections=collections)
    print(uri.path)
    # Drop the run-specific part of the name (from '_discrete' onwards) but
    # keep the file extension.
    outfile = os.path.basename(uri.path).split('_discrete')[0] + os.path.splitext(uri.path)[1]
    print(outfile)
    print('\n')
    return (uri.path, outfile)

In [None]:
#list(registry.queryDatasets('skyMap', collections='kbechtol/matched'))[0].dataId['skymap']

In [None]:
#list(registry.queryDatasets('matchedCatalogTract', collections='kbechtol/matched'))

In [None]:
collections = 'kbechtol/matched'
results = getMatchedCatalogFilenames(butler, collections, 'matchedCatalogTract', 0)

In [None]:
collections = 'kbechtol/matched_multi'
getMatchedCatalogFilenames(butler, collections, 'matchedCatalogMulti', 0)

In [None]:
# Load the first 'metricvalue_validate_drp_AM1' dataset found in the
# single-band collection and inspect the resulting measurement object.
collections = 'kbechtol/matched'
am1_refs = list(registry.queryDatasets('metricvalue_validate_drp_AM1', collections=collections))
# Only the first matching dataset is loaded.
am1_measurement = butler.get(am1_refs[0], collections=collections)
print(am1_measurement.metric_name)
#print(am1_measurement.identifier)
# List the attributes available on the measurement.
dir(am1_measurement)

In [None]:
collections = 'kbechtol/matched'
matched_catalog_refs = list(registry.queryDatasets('matchedCatalogTract', collections=collections))

In [None]:
# For each matched-catalog dataset, show its data ID, its URI, and whether
# the file behind that URI exists on disk.
for ref in matched_catalog_refs:
    print(ref.dataId)
    catalog_uri = butler.getURI('matchedCatalogTract', ref.dataId, collections=collections)
    print(catalog_uri)
    # os.path.isfile() needs a path argument; calling it with none raises
    # TypeError.
    print(os.path.isfile(catalog_uri.path))

In [None]:
uri = butler.getURI('matchedCatalogTract', ref.dataId, collections=collections)
uri.path

In [None]:
assert os.path.isfile(uri.path)
assert os.path.exists(uri.path)

In [None]:
dir(uri)