In [1]:
from collections import defaultdict
from itertools import chain
from pathlib import Path
import shutil

from cytoolz import groupby
from cytoolz.curried import get
from gPhoton.pretty import print_stats
from killscreen.monitors import Netstat, Stopwatch
import pandas as pd
import pyarrow as pa
import pyarrow.csv
from pyarrow import parquet

from s3_fuse.mount_s3 import mount_bucket
from s3_fuse.ps1_utils import prune_ps1_catalog, get_ps1_cutouts
from s3_fuse.utilz import make_loaders, sample_table

In [2]:
# 'configuration' -- tunable constants read by the cells below.

# name of the S3 bucket and the local path it is mounted at
# (both are passed to mount_bucket() in the cutout-retrieval cell).
BUCKET = 'nishapur'
S3_ROOT = '/mnt/s3'

# desired cutout side length in degrees (50 / 3600 deg, i.e. 50 arcseconds)
CUTOUT_SIDE_LENGTH = 50 / 3600

# how many (randomly-selected) targets would we like cutouts for?
# NOTE(review): TARGET_COUNT is assigned again (to 80) in the later
# target-sampling cell, so that later value is the one in effect when
# targets are sampled -- confirm which value is intended.
TARGET_COUNT = 10
# which PS1 bands are we looking at? (currently only g and z are staged.)
PS1_BANDS = ("g", "z")
# shall we do GALEX stuff?
DO_GALEX_STUFF = False

# select loaders -- options are "astropy", "fitsio", "greedy_astropy", "greedy_fitsio"
# NOTE: because all the files this particular notebook is looking
# at are RICE-compressed, there is unlikely to be much difference
# between astropy and greedy_astropy -- astropy does not support
# loading individual tiles from a tile-compressed FITS file.
# LOADERS is iterated with .items() as (loader name, loader) pairs in the
# cutout-retrieval cell.
LOADERS = make_loaders("greedy_fitsio", "fitsio")

In [3]:
# Both metadata tables read below live under the same directory of the
# mounted bucket.
metadata_dir = Path(S3_ROOT, "ps1/metadata")

# Catalog of PS1 extragalactic extended objects, including explicit
# assignments to PS1 stack image projection / sky cells and GALEX
# eclipse numbers. A copy is cached in the working directory so that the
# bucket is only hit the first time this cell runs.
catalog_fn = "ps1_extragalactic_skycells_eclipses.parquet"
local_catalog = Path(catalog_fn)
if not local_catalog.exists():
    shutil.copy(metadata_dir / catalog_fn, local_catalog)
catalog = parquet.read_table(catalog_fn)

# Only a subset of the PS1 stack images was staged for this demo (all of
# them at all 5 bands would be > 80 TB). This CSV lists the (randomly
# selected) projection and sky cells that were staged; it is cast to
# compact unsigned integer column types after reading.
test_cell_fn = "ps1_extragalactic_skycells_eclipses_1k_cell_subset.csv"
cell_schema = pa.schema([("proj_cell", pa.uint16()), ("sky_cell", pa.uint8())])
arbitrary_test_cells = pa.csv.read_csv(metadata_dir / test_cell_fn).cast(cell_schema)
small_catalog = prune_ps1_catalog(catalog, arbitrary_test_cells)

# A little pruning on GALEX as well: this is a table of actually-existing
# MIS-like images by eclipse number, excluding eclipses with data currently
# flagged as bad. Only its '0' column is kept.
extant_mislike_table = pd.read_csv(Path(S3_ROOT, "extant_mislike_eclipses.csv"))
extant_mislike = extant_mislike_table['0']

In [6]:
# Number of objects to collect slices for (785510 are available in this
# test set). Note that this rebinds the TARGET_COUNT set in the
# configuration cell.
TARGET_COUNT = 80
# Optional cap on the total number of PS1 source cells, for testing the
# performance effects of denser sampling (1000 total PS1 cells are available
# in this test set). The number of actual images accessed is a factor of
# both the number of cells and the number of bands under consideration.
# If GALEX fusion is taking place, this will also indirectly restrict the
# number of GALEX images. Set to None to use every staged cell.
MAX_CELL_COUNT = 5
if MAX_CELL_COUNT is None:
    test_catalog = small_catalog
else:
    sampled_cells = sample_table(arbitrary_test_cells, k=MAX_CELL_COUNT)
    test_catalog = prune_ps1_catalog(small_catalog, sampled_cells)

# Randomly chosen catalog rows, as a list of plain dicts.
targets = sample_table(test_catalog, k=TARGET_COUNT).to_pylist()

# Distinct (proj_cell, sky_cell) pairs covered by the chosen targets.
stack_key = get(['proj_cell', 'sky_cell'])
ps1_stacks = {stack_key(target) for target in targets}

# Distinct GALEX eclipses listed by any target, restricted to those present
# in the extant MIS-like eclipse table.
extant_values = extant_mislike.values
galex_eclipses = {
    eclipse
    for eclipse in chain.from_iterable(target['galex'] for target in targets)
    if eclipse in extant_values
}

In [7]:
# Retrieve PS1 cutouts (and, optionally, GALEX cutouts + coadds) for every
# sampled target, once per configured loader, printing timing / network
# statistics as we go.

# targets grouped by the (proj_cell, sky_cell) stack image they fall in
ps1_groups = groupby(get(['proj_cell', 'sky_cell']), targets)
# results accumulate across stacks (and across loaders) via dict union
ps1_cutouts = {}
log = {}
for loader_name, loader in LOADERS.items():
    # remount bucket to avoid 'cheating'
    mount_bucket(
        backend="goofys", remount=True, mount_path=S3_ROOT, bucket=BUCKET
    )
    # stat reporter for this loader pass, built from a fresh stopwatch
    # and netstat monitor
    outer_stat = print_stats(Stopwatch(silent=True), Netstat())
    for stack in ps1_stacks:
        image_targets = ps1_groups[stack]
        cutouts, _, stack_log = get_ps1_cutouts(
            image_targets,
            loader,
            PS1_BANDS,
            CUTOUT_SIDE_LENGTH,
            f"{S3_ROOT}/ps1",
            verbose=1
        )
        ps1_cutouts |= cutouts
        log |= stack_log
    print(f"acquired PS1 cutouts,{outer_stat()}")
    if DO_GALEX_STUFF is True:
        # NOTE(review): get_galex_rice_slices, coadd_image_slices, watch and
        # stat are not defined or imported in any cell visible here; unless
        # they are provided elsewhere, enabling DO_GALEX_STUFF will raise
        # NameError. Confirm where these are meant to come from.
        galex_slices = defaultdict(list)
        systems = {}
        for eclipse in galex_eclipses:
            # every target that lists this eclipse among its GALEX eclipses
            eclipse_targets = tuple(filter(lambda t: eclipse in t['galex'], targets))
            slices, system = get_galex_rice_slices(
                eclipse, eclipse_targets, CUTOUT_SIDE_LENGTH, S3_ROOT, watch, stat
            )
            systems[eclipse] = system
            # collect, per object id, one slice from each eclipse covering it
            for k, v in slices.items():
                galex_slices[k].append(v)
        print(f"acquired GALEX cutouts,{outer_stat()}")
        galex_coadds = {}
        # total number of slices across all objects
        slice_count = sum(map(len, galex_slices.values()))
        print(f"...coadding {slice_count} image slices...", end="")
        for obj_id, images in galex_slices.items():
            if not images:
                # nothing to coadd for this object: report it (with the
                # object id actually interpolated) and move on rather than
                # handing an empty list to the coadder.
                print(f"all GALEX images for {obj_id} are bad, skipping")
                continue
            galex_coadds[obj_id] = coadd_image_slices(images, systems)
        print(f"coadded GALEX cutouts,{outer_stat()}")
        


... accessing PS1 stack image(s) w/proj cell, sky cell = 2554, 80 ...
... initializing rings.v3.skycell.2554.080.stk.g.unconv.fits ... 
1.96 s,68.68 MB
init fits object,/mnt/s3/ps1/rings.v3.skycell/2554/080/rings.v3.skycell.2554.080.stk.g.unconv.fits,None
0.02 s,0.0 MB
0.01 s,0.0 MB
0.02 s,0.0 MB
0.02 s,0.0 MB
2.04 total s,68.68 total MB
got 2 cuts,/mnt/s3/ps1/rings.v3.skycell/2554/080/rings.v3.skycell.2554.080.stk.g.unconv.fits,None
... initializing rings.v3.skycell.2554.080.stk.z.unconv.fits ... 
2.01 s,69.24 MB
init fits object,/mnt/s3/ps1/rings.v3.skycell/2554/080/rings.v3.skycell.2554.080.stk.z.unconv.fits,None
0.02 s,0.0 MB
0.01 s,0.0 MB
0.02 s,0.0 MB
0.02 s,0.0 MB
2.08 total s,69.24 total MB
got 2 cuts,/mnt/s3/ps1/rings.v3.skycell/2554/080/rings.v3.skycell.2554.080.stk.z.unconv.fits,None
... accessing PS1 stack image(s) w/proj cell, sky cell = 1464, 9 ...
... initializing rings.v3.skycell.1464.009.stk.g.unconv.fits ... 
1.8 s,67.31 MB
init fits object,/mnt/s3/ps1/rings.v3.skycel

0.05 s,3.44 MB
0.08 s,0.79 MB
0.14 s,2.7 MB
0.05 s,0.76 MB
0.05 s,1.16 MB
0.05 s,2.47 MB
0.09 s,0.76 MB
0.1 s,2.75 MB
0.06 s,3.51 MB
0.08 s,0.56 MB
0.1 s,2.61 MB
0.14 s,2.7 MB
0.08 s,0.3 MB
0.02 s,0.0 MB
0.02 s,0.07 MB
0.02 s,0.08 MB
0.02 s,0.0 MB
0.07 s,0.32 MB
0.05 s,1.16 MB
0.02 s,0.0 MB
0.02 s,0.0 MB
0.05 s,1.8 MB
0.06 s,1.88 MB
0.05 s,0.4 MB
0.09 s,0.75 MB
0.04 s,0.54 MB
0.02 s,0.0 MB
0.11 s,1.23 MB
0.02 s,0.22 MB
0.02 s,0.0 MB
0.05 s,1.26 MB
0.02 s,0.0 MB
0.09 s,1.91 MB
0.02 s,0.0 MB
0.05 s,0.81 MB
0.02 s,0.01 MB
3.47 total s,58.39 total MB
got 44 cuts,/mnt/s3/ps1/rings.v3.skycell/1464/009/rings.v3.skycell.1464.009.stk.z.unconv.fits,None
... accessing PS1 stack image(s) w/proj cell, sky cell = 2048, 95 ...
... initializing rings.v3.skycell.2048.095.stk.g.unconv.fits ... 
0.18 s,0.22 MB
init fits object,/mnt/s3/ps1/rings.v3.skycell/2048/095/rings.v3.skycell.2048.095.stk.g.unconv.fits,None
0.06 s,0.16 MB
0.01 s,0.01 MB
0.16 s,2.52 MB
0.07 s,1.44 MB
0.24 s,3.18 MB
0.11 s,3.04 MB
0.2