# Integrating pyhydrophone in the processing pipeline


## Some parameters for PBP

We will process the data from station MB05, which was recorded with a SoundTrap, and we'll use pyhydrophone to obtain the calibration information.

Let's start by importing the packages we'll need.

In [None]:
# Import package modules
import xarray as xr
import dask
import pandas as pd
import time
import datetime
import logging
import pyhydrophone as pyhy
from google.cloud.storage import Client as GsClient  # To handle download of `gs:` resources


from pbp.meta_gen.gen_soundtrap import SoundTrapMetadataGenerator
from pbp.logging_helper import create_logger_info, create_logger

from pbp.process_helper import ProcessHelper
from pbp.file_helper import FileHelper

Next, we describe where the data are stored and how they should be processed.

In [None]:
# --- Input audio and temporal metadata ---------------------------------------
# Cloud storage location of the input audio data.
wav_uri = 's3://pacific-sound-mb05'
# First and last day (YYYYMMDD) used for temporal metadata extraction.
start_date = '20220922'
end_date = '20220925'
# Where the generated JSON metadata is stored.
json_base_dir = 'metadata/json'
# Where the xml files downloaded from the SoundTrap are stored.
xml_dir = 'metadata/xml'

# --- Output locations ---------------------------------------------------------
download_dir = 'downloads'
output_dir = 'hmd_output'
output_prefix = 'MB05_'

# --- SoundTrap (ST) description -----------------------------------------------
# Serial number of the recorder.
serial_number = 6715
# The model name needs to match the right model in the ST calibration website:
# https://oceaninstruments.azurewebsites.net
st_model = 'SoundTrap 600 HF'
# Gain setting of the recorder: either 'High' or 'Low'.
st_gain = 'High'
# (min, max) frequencies to subset the result to.
subset_to = (10, 2000)

# --- Metadata written into the NetCDF files -----------------------------------
global_attrs_uri = 'metadata/mb05/globalAttributes_NRS11.yaml'
variable_attrs_uri = 'metadata/mb05/variableAttributes_NRS11.yaml'

In [None]:
# A logger that only logs messages tagged as info to the console (for more verbose logging, see `create_logger`)
# Logger handed to the metadata generator below.
log = create_logger_info(f'soundtrap_{start_date}_{end_date}')

# Convert the start and end dates to datetime objects.
# `datetime` is imported as a module, so the class has to be qualified:
# the module itself has no `strptime` attribute.
start = datetime.datetime.strptime(start_date, "%Y%m%d")
end = datetime.datetime.strptime(end_date, "%Y%m%d")

# Create the metadata generator, restricted to files whose prefix is the
# serial number of the recorder.
meta_gen = SoundTrapMetadataGenerator(
    log=log,
    uri=wav_uri,
    json_base_dir=json_base_dir,
    xml_dir=xml_dir,
    start=start,
    end=end,
    prefixes=[str(serial_number)],
)

# Generate the metadata - this will generate JSON files in the json_base_dir
meta_gen.run()

In [None]:
# Build the pyhydrophone description of the recorder from the parameters
# configured above; its calibration values are printed for inspection.
st = pyhy.SoundTrap(model=st_model,
                    serial_number=serial_number,
                    name=f'{st_model}_{serial_number}',
                    gain_type=st_gain)
print('SoundTrap settings to:')
print('sensitivity: ', st.sensitivity)
print('Vpp: ', st.Vpp)
print('preamp_gain: ', st.preamp_gain)
# Report the gain that was actually requested rather than a fixed 'High'.
print('gain_type: ', st_gain)

# Supporting functions

PBP includes these two main modules that we will be using below:

- `FileHelper`: Facilitates input file reading. It supports reading local files as well as from GCP (`gs://` URIs) and AWS (`s3://` URIs).
- `ProcessHelper`: The main processing module.

We first define a function that takes care of HMB generation for a given date.

Based on that function, we then define another function to dispatch multiple dates in parallel.


## A function to process a given day

Supported by those PBP modules, we define a function that takes care of processing a given day:

In [None]:
def process_date(date: str, gen_netcdf: bool = True) -> "xr.Dataset | None":
    """
    Main function to generate the HMB product for a given day.

    It makes use of supporting elements in PBP in terms of logging,
    file handling, and PyPAM based HMB generation.

    The calibration comes from the module-level SoundTrap object ``st``:
    half of its ``Vpp`` is used as voltage multiplier and the negated
    ``sensitivity`` as flat sensitivity value.

    :param date: Date to process, in YYYYMMDD format.

    :param gen_netcdf:  Allows caller to skip the `.nc` creation here
    and instead save the datasets after all days have been generated
    (see parallel execution below).

    :return: the generated xarray dataset, or ``None`` if the processing
    produced no result for that day.
    """

    log_filename = f"{output_dir}/{output_prefix}{date}.log"

    # One log file per processed date; nothing is echoed to the console.
    logger = create_logger(
        log_filename_and_level=(log_filename, logging.INFO),
        console_level=None,
    )

    # we are only downloading publicly accessible datasets:
    gs_client = GsClient.create_anonymous_client()

    # NOTE(review): FileHelper is given `logger=` while ProcessHelper is given
    # `log=`; confirm both keyword names against the installed pbp version.
    file_helper = FileHelper(
        logger=logger,
        json_base_dir=json_base_dir,
        gs_client=gs_client,
        download_dir=download_dir,
        assume_downloaded_files=True,
        retain_downloaded_files=True,
    )

    process_helper = ProcessHelper(
        # Use the per-date logger so processing messages end up in
        # `log_filename` instead of the shared console logger.
        log=logger,
        file_helper=file_helper,
        output_dir=output_dir,
        output_prefix=output_prefix,
        # Forward the flag so that it really controls the `.nc` creation.
        gen_netcdf=gen_netcdf,
        global_attrs_uri=global_attrs_uri,
        variable_attrs_uri=variable_attrs_uri,
        # For pyhydrophone, Vpp is the voltage peak-to-peak, while for pbp
        # the voltage multiplier is 0 to peak.
        voltage_multiplier=st.Vpp / 2,
        sensitivity_uri=None,
        # Please note the minus (-) sign.
        sensitivity_flat_value=-st.sensitivity,
        subset_to=subset_to,
    )

    ## now, get the HMB result:
    print(f'::: Started processing {date=}    {log_filename=}')
    result = process_helper.process_day(date)

    # Guard clause: do not announce an output when nothing was produced.
    if result is None:
        print(f'::: UNEXPECTED: no segments were processed for {date=}')
        return None

    if gen_netcdf:
        nc_filename = f"{output_dir}/{output_prefix}{date}.nc"
        print(f':::   Ended processing {date=} =>  {nc_filename=}')
    else:
        print(f':::   Ended processing {date=} => (dataset generated in memory)')

    return result.dataset

## A function to process multiple days

We use [Dask](https://examples.dask.org/delayed.html) to dispatch, in parallel, multiple instances of the `process_date` function defined above.

In [6]:
def process_multiple_dates(dates: list[str], gen_netcdf: bool = False) -> list[xr.Dataset]:
    """
    Generates HMB for multiple days in parallel using Dask.
    Returns the resulting HMB datasets.

    :param dates: The dates to process, each in YYYYMMDD format.

    :param gen_netcdf:  Allows caller to skip the `.nc` creation here
    and instead save the datasets after all days have been generated.

    :return: the list of generated datasets, one entry per given date and
    in the same order. An entry is whatever `process_date` returned for
    that date, which is `None` when it produced no dataset.
    """

    @dask.delayed
    def delayed_process_date(date: str):
        return process_date(date, gen_netcdf=gen_netcdf)

    ## To display total elapsed time at the end the processing:
    start_time = time.time()

    ## This will be called by Dask when all dates have completed processing:
    def aggregate(*datasets) -> list[xr.Dataset]:
        elapsed_time = time.time() - start_time
        print(f'===> All {len(datasets)} dates completed. Elapsed time: {elapsed_time:.1f} seconds ({elapsed_time/60:.1f} mins)')
        # `datasets` is the varargs tuple; hand back a real list as promised
        # by the signature and the docstring.
        return list(datasets)

    ## Prepare the processes:
    delayed_processes = [delayed_process_date(date) for date in dates]
    aggregation = dask.delayed(aggregate)(*delayed_processes)

    ## And launch them:
    return aggregation.compute()


In [None]:
# In general, we can use pandas to help us generate the list of dates we want to process
# Build the list of days to process: one YYYYMMDD string per calendar day
# between the configured start and end dates.
date_range = pd.date_range(start=start_date, end=end_date, freq='1D')
dates = [day.strftime("%Y%m%d") for day in date_range]

# Announce what is about to be processed, then run all days in parallel.
print(f"Launching HMB generation for {len(dates)} {dates=}")

# Collect the HMB datasets returned for the requested dates.
generated_datasets = process_multiple_dates(dates, gen_netcdf=True)
print(f"Generated datasets: {len(generated_datasets)}\n")