In [None]:
"""galex-specific s3-via-FUSE FITS subsetting test and benchmarking notebook"""

import os
from functools import partial
from itertools import product
from typing import Callable

from s3_fuse.galex_utils import (
    pick_galex_eclipses, get_galex_version_path
)
from s3_fuse.handlers import get_cuts_from_files
from s3_fuse.log_goofys import assemble_cut_log
from s3_fuse.utilz import preload_target

### settings

In [None]:
# which GALEX files should the benchmark read?
GALEX_FILE_SELECTIONS = dict(
    band="NUV",
    depth=30,
    eclipses=pick_galex_eclipses(5, "mislike"),
    # either 'movie' or 'image'
    obj="image",
)

# which image plane do we cut from? the plane name is translated into the
# base HDU index that run_test() hands to the cut routine.
IMAGE_PLANE = "cnt"
BASE_HDU_IX = dict(cnt=0, flag=1, edge=2)[IMAGE_PLANE]

# number and geometry of the slices cut out of each file
CUT_SETTINGS = dict(cut_count=1, box_size=100, size_variance=25)

GENERAL_SETTINGS = dict(
    # should the array elements themselves be returned?
    return_cuts=True,
    cut_settings=CUT_SETTINGS,
    # one random seed shared by every case, for strict repeatability
    seed=333,
    # 'shallow' cuts from 3D arrays: when True, a single band is picked at
    # random for each cut; when False, cuts are 3D and span all bands
    shallow=True,
)

# local directory used as the mountpoint (or, when we are not remounting,
# the place where the bucket is already mounted)
S3_ROOT = "/mnt/s3"
# destination for goofys log output; irrelevant when that logging is off
FUSE_LOGFILE = "fuse.log"
# name of the s3 bucket holding the FITS files
BUCKET = "great_bucket_full_of_fits_files"

S3_SETTINGS = dict(
    mount_path=S3_ROOT,
    bucket=BUCKET,
    # remount the bucket on every cycle? (skipping the remount is cheating
    # because of inode linking, etc.)
    remount=True,
    # the chosen FUSE backend must be installed on the system. only s3fs and
    # goofys have been prototyped, though others are likely to work. this
    # does not matter if remount is False and the bucket is already mounted.
    backend="goofys",
    # goofys only: run it in debug mode and scratch its output to a log file.
    verbose=True,
    # goofys emits its debug-mode output on stderr.
    # set both handlers to None to deactivate logging. this will break on
    # verbose goofys output, however, because the test routine won't know
    # to wait for goofys to mount the bucket.
    # note that attaching a stream handler to goofys adds a little overhead
    # even when nothing is written, and that goofys may be noticeably slower
    # while actually writing debug logs (it's very verbose)
    stream_handlers=dict(_out=None, _err=FUSE_LOGFILE),
)

# note: this function is a setting
def make_loaders(*loader_names: str) -> dict[str, Callable]:
    """
    produce a mapping from FITS-loader names to callable load methods.
    currently only three are defined by default: "astropy", "fitsio", and
    "greedy_astropy".

    the backing libraries are imported lazily, so only the libraries for the
    requested loaders need to be installed.

    args:
        *loader_names: names of the loaders to build, in the order they
            should appear in the returned mapping.

    returns:
        dict mapping each requested name to its load callable.

    raises:
        ValueError: if a requested name is not a known loader. (a misspelled
            name would otherwise silently vanish from the benchmark.)
    """
    known_names = ("astropy", "fitsio", "greedy_astropy")
    loaders = {}
    for name in loader_names:
        if name == "astropy":
            import astropy.io.fits
            loaders[name] = astropy.io.fits.open
        elif name == "fitsio":
            import fitsio
            loaders[name] = fitsio.FITS
        # "greedy" version of astropy.io.fits.open, which fully loads a file
        # into memory before doing anything with it. a useful bench reference.
        # note that fitsio.FITS will not accept filelike objects and cannot be
        # wrapped in this way without modifying its C extensions.
        elif name == "greedy_astropy":
            import astropy.io.fits
            loaders[name] = partial(preload_target, astropy.io.fits.open)
        else:
            raise ValueError(
                f"unknown loader name {name!r}; "
                f"known loaders are {', '.join(known_names)}"
            )
    return loaders

# select loaders as defined by the previous function. run_test() benchmarks
# every combination of a loader chosen here and a version chosen below.
# LOADERS = make_loaders("astropy", "greedy_astropy", "fitsio")
LOADERS = make_loaders("astropy")

# what versions of compression do we want to look at? options are
# gz, rice, none
# VERSIONS = ("none", "rice")
VERSIONS = ("none",)

# note: this function is a setting
def is_terrible(loader_name, version) -> bool:
    """
    ignore these cases: checking them is a waste of time.

    a case is skipped only when all of the following hold: we are reading
    'movie' objects, the version is "gz" or "none", the FUSE backend is
    goofys, and the loader is fitsio.

    args:
        loader_name: key of the loader in LOADERS.
        version: compression version name (one of the VERSIONS entries).

    returns:
        True if run_test() should skip this case, False otherwise.
    """
    # always return a real bool rather than falling off the end with None
    return (
        GALEX_FILE_SELECTIONS["obj"] == "movie"
        and version in ("gz", "none")
        and S3_SETTINGS["backend"] == "goofys"
        and loader_name == "fitsio"
    )

### define the benchmark

In [None]:
# helper function
def get_test_paths(eclipses, band, depth, obj, version):
    """
    resolve one file path per eclipse via get_galex_version_path, passing
    S3_ROOT as its final argument. paths are returned in eclipse order.
    """
    paths = []
    for eclipse in eclipses:
        paths.append(
            get_galex_version_path(eclipse, band, depth, obj, version, S3_ROOT)
        )
    return paths
    
def run_test():
    """
    run every (loader, compression version) case of the benchmark.

    cases are the product of LOADERS and VERSIONS; any case that
    is_terrible() flags is skipped. for each remaining case, cuts are made
    with get_cuts_from_files and, if a stderr stream handler is configured,
    the FUSE log file is read and then deleted so that the next case starts
    with a clean log.

    returns:
        results: dict mapping "<loader> <version>" to the 5-tuple
            (cuts, runtime, None, handler_log, fuse_log). fuse_log is None
            when FUSE logging is off or no log file was found.
        logs: dict mapping the same case names to the output of
            assemble_cut_log for that case's result tuple.
    """
    results = {}
    for loader_name, version in product(LOADERS.keys(), VERSIONS):
        case_name = f"{loader_name} {version}"
        print(f"\n****checking {case_name}****")
        if is_terrible(loader_name, version):
            print("case marked as terrible, skipping")
            continue
        paths = get_test_paths(**GALEX_FILE_SELECTIONS, version=version)
        case_kwargs = GENERAL_SETTINGS | {
            "paths": paths,
            "loader": LOADERS[loader_name],
            "cut_settings": CUT_SETTINGS,
            "s3_settings": S3_SETTINGS,
            # rice-compressed files are read one HDU later than the base
            # index -- presumably because the compressed image lives in an
            # extension HDU; TODO confirm
            "hdu_ix": (BASE_HDU_IX + 1) if version == "rice" else BASE_HDU_IX,
        }
        cuts, runtime, handler_log = get_cuts_from_files(**case_kwargs)
        # TODO: sloppy
        fuse_log = None
        if S3_SETTINGS["stream_handlers"]["_err"] is not None:
            try:
                with open(FUSE_LOGFILE) as logstream:
                    fuse_log = logstream.read()
            except FileNotFoundError:
                # a missing log must not discard the results of a case that
                # has already been run; carry on without the FUSE log
                print(f"no FUSE log found at {FUSE_LOGFILE}, continuing")
            else:
                os.remove(FUSE_LOGFILE)
        results[case_name] = cuts, runtime, None, handler_log, fuse_log
    logs = {
        case_name: assemble_cut_log(result)
        for case_name, result in results.items()
    }
    return results, logs

### run the benchmark

In [None]:
# run every configured case. `results` maps each case name to its raw result
# tuple; `logs` maps each case name to the output of assemble_cut_log.
results, logs = run_test()

### look at logs, returned data, etc.

In [None]:
# run_test() stores a 5-element tuple per case:
# (cuts, runtime, None, handler_log, fuse_log). the third slot is always
# None, so it is unpacked into a throwaway name.
cuts, runtime, _reserved, handler_log, fuse_log = results['astropy none']
runtime

In [None]:
# timestamped records of every 'handler' operation (mounting the bucket,
# initializing a FITS object, etc.) live in this log.
# when goofys ran in debug/verbose mode, it additionally holds
# all S3 requests and aliased-through-FUSE http stream reads.
import pandas as pd

# build the frame, order it chronologically, then renumber the rows
log_df = pd.DataFrame(logs["astropy none"])
log_df = log_df.sort_values(by="time")
log_df = log_df.reset_index(drop=True)
log_df