In [None]:
"""
s3-via-FUSE FITS subsetting benchmarking notebook. 
runs against objects generated using benchmark_case_generator.ipynb.
"""
import os
from functools import partial
from itertools import chain, product
from typing import Callable

from s3_fuse.bench_config import CASES
from s3_fuse.handlers import get_cuts_from_files
from s3_fuse.log_goofys import assemble_cut_log
from s3_fuse.utilz import preload_target, Netstat, mb

In [None]:
# number of mock FITS files that were generated for each case.
# TODO: add a listobjects step?
QUANTITY = 10

# how many cuts to take from each file, and how large each cut box should be.
CUT_COUNTS = (1, 3)
BOX_SIZES = (100, 250)

# one entry per (count, size) pairing, keyed "<count>_<size>". each value is
# (cut count, box size, box size / 4); run_benchmark() passes these along as
# cut_count, box_size, and size_variance respectively.
CUT_CASES = {
    f"{count}_{size}": (count, size, size / 4)
    for count, size in product(CUT_COUNTS, BOX_SIZES)
}
    
    
# note: this function is itself a setting -- edit it to change which loaders are available
def make_loaders(*loader_names: str) -> dict[str, Callable]:
    """
    produce a mapping from FITS-loader names to callable load methods.

    recognized names are "astropy", "fitsio", and "greedy_astropy". the
    backing libraries are imported lazily, so only the libraries needed by
    the requested loaders are imported.

    :param loader_names: names of the loaders to construct.
    :return: dict mapping each requested name to its load callable, in the
        order the names were given.
    :raises ValueError: if any requested name is not recognized. all names
        are checked before anything is imported, so that a typo cannot
        silently drop a loader from the benchmark matrix.
    """
    known_names = ("astropy", "fitsio", "greedy_astropy")
    unknown_names = [name for name in loader_names if name not in known_names]
    if unknown_names:
        raise ValueError(
            f"unknown loader name(s): {unknown_names}; "
            f"expected one of {list(known_names)}"
        )
    loaders = {}
    for name in loader_names:
        if name == "astropy":
            import astropy.io.fits
            loaders[name] = astropy.io.fits.open
        elif name == "fitsio":
            import fitsio
            loaders[name] = fitsio.FITS
        # "greedy" version of astropy.io.fits.open, which fully loads a file
        # into memory before doing anything with it. a useful bench reference.
        # note that fitsio.FITS will not accept filelike objects and cannot be
        # wrapped in this way without modifying its C extensions.
        elif name == "greedy_astropy":
            import astropy.io.fits
            loaders[name] = partial(preload_target, astropy.io.fits.open)
    return loaders


# choose which of the loaders known to make_loaders() take part in the run
LOADERS = make_loaders("astropy", "fitsio", "greedy_astropy")

# FUSE backends to benchmark
BACKENDS = ("s3fs", "goofys")

# settings passed along with every benchmark case
GENERAL_SETTINGS = dict(
    # should the cut array elements themselves be returned?
    return_cuts=False,
    # random seed shared across cases, for strict repeatability
    seed=11111,
    # take 'shallow' cuts from 3D arrays? True: pick one band at random for
    # each cut; False: cut in 3D across all bands
    shallow=True,
)


In [None]:
# local directory used as the mountpoint (or, when not remounting, the
# directory where the bucket is already mounted)
S3_ROOT = "/mnt/s3"
# destination for goofys log output; irrelevant unless goofys logging is on
FUSE_LOGFILE = "fuse.log"
# name of the s3 bucket holding the benchmark files
BUCKET = "great_bucket_full_of_fits_files"


S3_SETTINGS = dict(
    mount_path=S3_ROOT,
    bucket=BUCKET,
    # remount the bucket on every cycle? (skipping the remount is cheating
    # because of inode linking, etc.)
    remount=True,
    # goofys only: run goofys in debug mode and scratch its output to a
    # log file.
    verbose=False,
    # goofys emits its debug-mode output on stderr. leave both handlers as
    # None to deactivate logging -- but that will break on verbose goofys
    # output, because the test routine won't know to wait for goofys to
    # mount the bucket. attaching a stream handler to goofys adds a little
    # overhead even when nothing is written, and goofys may be noticeably
    # slower while actually writing debug logs (it's very verbose).
    stream_handlers=dict.fromkeys(("_out", "_err")),
)


# the full benchmark matrix: every combination of
# (cut setting, file case, loader name, backend)
BENCH_CASES = tuple(
    product(
        CUT_CASES.keys(),
        CASES.keys(),
        LOADERS.keys(),
        BACKENDS,
    )
)

### define the benchmark
   

In [None]:
def run_benchmark():
    """
    run every selected case in BENCH_CASES and collect its measurements.

    each case is a (cut setting, file prefix, loader name, backend) tuple.
    cases whose prefix does not contain "rice" (case-insensitively) are
    skipped. for each remaining case, cuts are requested from QUANTITY files
    via get_cuts_from_files(), and the Netstat interval measured across that
    call is compared against QUANTITY times the size of the first file.

    returns a (results, logs) pair:
      * results maps each case tuple to
        (cuts, runtime, stats, handler_log, None), where stats is a dict
        with keys "volume", "rep_size", and "ratio".
      * logs maps each case tuple to assemble_cut_log() of that result.
    """
    cut_kwarg_names = ("cut_count", "box_size", "size_variance")
    results = {}
    netstat = Netstat()
    for case_number, case in enumerate(BENCH_CASES):
        cut_key, prefix, loader_name, backend_name = case
        # TODO: hacky -- the HDU index is inferred from the prefix name
        hdu_ix = 2 if "3hdu" in prefix else 1
        # only prefixes containing "rice" are benchmarked on this run --
        # TODO: should be an is_terrible() sort of function
        if "rice" not in prefix.lower():
            continue

        print(f"\n****case {case_number}: {case}****")
        paths = [
            f"{S3_ROOT}/{prefix}/{prefix}_{file_ix}.fits"
            for file_ix in range(QUANTITY)
        ]
        cut_settings = dict(zip(cut_kwarg_names, CUT_CASES[cut_key]))
        s3_settings = S3_SETTINGS | {"backend": backend_name}
        case_kwargs = GENERAL_SETTINGS | {
            "paths": paths,
            "loader": LOADERS[loader_name],
            "cut_settings": cut_settings,
            "s3_settings": s3_settings,
            "hdu_ix": hdu_ix,
        }
        # bracket the cut operation with netstat updates so that
        # netstat.interval covers just this case
        netstat.update()
        cuts, runtime, handler_log = get_cuts_from_files(**case_kwargs)
        netstat.update()
        # takes the first entry of the interval; could instead specify ens5
        # or eth0 or whatever if there is ambiguity. up to you.
        volume = list(netstat.interval.values())[0]
        # the first file's size stands in for the size of every file
        representative_size = os.stat(paths[0]).st_size
        ratio = volume / (representative_size * QUANTITY)
        megabytes = round(mb(volume), 2)
        percentage = round(ratio*100, 2)
        print(
            f"{megabytes} MB transferred "
            f"({percentage}% of approximate file volume)"
        )
        transfer_stats = {
            "volume": volume,
            "rep_size": representative_size,
            "ratio": ratio,
        }
        results[case] = (cuts, runtime, transfer_stats, handler_log, None)
    logs = {}
    for case_name, result in results.items():
        logs[case_name] = assemble_cut_log(result)
    return results, logs

### run the benchmark

In [None]:
# run the benchmark. `results` maps each executed case to its raw outputs and
# `logs` maps each executed case to the log assembled from those outputs.
results, logs = run_benchmark()