# fish tracking

TOC:

0. imports
1. configuration
2. paths (input data, outroot)
3. cluster config
4. open tag log and reference model
    1. read tag data and clean
    2. read reference model, clean and subset
5. compute differences
    1. align time
    2. convert model depth to appropriate units (sigma → depth (→ pressure))
    3. subtract tags from aligned model
    4. save
6. regrid to healpix
    1. define healpix grid
    2. create target grid
    3. compute weights
    4. regrid
    5. save
7. construct emission matrix
    1. compute ocean mask
    2. compute normal probability density function from differences and the configured parameters
    3. compute maximum expected model parameter
    4. save
8. additional emission matrices: acoustic ranges
    1. open and read acoustic detections for the selected tag
    2. convert times to UTC
    3. aggregate detections and compute weights
    4. construct detection maps
    5. weighted sum of the detection maps
    6. save
9. estimate the model parameter
    1. select and create estimator instance
    2. create an optimizer using the estimator and the expected parameter range
    3. fit the model to the data to get the model parameter
    4. save
10. compute the state probabilities
    1. use the configured estimator to predict the state probabilities
    2. save
11. compute the tracks
    1. compute mean and mode from the precomputed state probabilities and apply the viterbi algorithm to the emission matrix to get the most probable track
    2. save
12. visualization
    1. plot the emission matrix
    2. plot the state probabilities
    3. plot each of the tracks

In [None]:
import json

import cf_xarray
import dask
import fsspec
import intake
import numba
import numpy as np
import pandas as pd
import pint_xarray
import xarray as xr
from pint_xarray import unit_registry as ureg

In [None]:
from pangeo_fish.hmm.estimator import EagerScoreEstimator
from pangeo_fish.io import open_tag
from pangeo_fish.pdf import combine_emission_pdf

configuration

<div class="alert alert-info">

**Note**: This section contains model parameters only; paths and the cluster are configured in the sections below.
    
</div>

In [None]:
# --- tag and region of interest ---
tag_name = "A19124"
# bounding box used to subset the reference model (lat / lon ranges)
bbox = dict(lat=[45, 51], lon=[-8, 0])

# passed to `diff_z` as `depth_threshold`
relative_depth_threshold = 0.6

# --- healpix regridding ---
nside = 4096
rot = dict(lat=0, lon=0)
# forwarded to the regridder's interpolation options
min_vertices = 3

# --- emission probabilities ---
# standard deviation of the normal pdf applied to the model/tag differences
differences_std = 0.75
# squared to build the covariance of the final position
recapture_std = 0.01

# --- upper bound of the model parameter ---
earth_radius = ureg.Quantity(6371, "km")
maximum_speed = ureg.Quantity(60, "km / day")
adjustment_factor = 5  # for a more fuzzy search
truncate = 4

# --- acoustic detections ---
receiver_buffer = ureg.Quantity(1000, "m")

# --- optimizer (passed as `xtol`) ---
tolerance = 0.01

# --- track decoding ---
track_modes = ["mean", "mode", "viterbi", "viterbi2"]
additional_track_quantities = ["speed", "distance"]

paths

In [None]:
# input data: root directory of the tag files, plus the tag database and the
# acoustic detections tables
tag_root = "./data/"
tag_db_url = "./data/all_pollack_11102023.csv"
tag_detections_url = "./data/pollack_filtered_detections.csv"

# intake catalog describing the reference model
catalog_url = "https://data-taos.ifremer.fr/kerchunk/ref-copernicus.yaml"

# output locations: every intermediate result of this notebook is written
# below `target_root`, the decoded tracks below `tracks_root`
scratch_root = "."
target_root = f"{scratch_root}/{tag_name}"
tracks_root = f"{target_root}/tracks"

cluster config

In [None]:
from distributed import LocalCluster

# start a local dask cluster with default settings and connect a client to it
cluster = LocalCluster()
client = cluster.get_client()
# display the client
client

open tag data

In [None]:
# open the data of the configured tag; later cells index it by group name
# (e.g. "dst", "tagging_events")
tag = open_tag(tag_root, tag_name)
tag

## compare reference model with tag log

In [None]:
from pangeo_fish.cf import bounds_to_bins
from pangeo_fish.dataset_utils import broadcast_variables
from pangeo_fish.diff import diff_z
from pangeo_fish.io import open_copernicus_catalog
from pangeo_fish.tags import adapt_model_time, reshape_by_bins, to_time_slice

drop data outside the reference interval

In [None]:
# time range derived from the tagging events
time_slice = to_time_slice(tag["tagging_events/time"])

# restrict the DST log to that range
tag_log = tag["dst"].ds.sel({"time": time_slice})
tag_log

open and clean reference model

In [None]:
# TODO: use intake to do the merging? Or do the merging directly in `kerchunk`?
# open the intake catalog and build the reference model dataset from it
cat = intake.open_catalog(catalog_url)
model = open_copernicus_catalog(cat)
model

In [None]:
# align model time with tag log
model_subset = model.sel({"time": adapt_model_time(time_slice)})

# subset the data to the region of interest using a bbox
model_subset = model_subset.sel(
    {"lat": slice(*bbox["lat"]), "lon": slice(*bbox["lon"])}
)

# drop data for depth layers that are too unlikely: keep everything up to the
# maximum logged pressure minus the minimum of `XE` in the subset
model_subset = model_subset.sel(
    {
        "depth": slice(
            None,
            (tag_log["pressure"].max() - model_subset["XE"].min()).compute(),
        )
    }
)

# broadcast spatial coordinates to 2D
model_subset = broadcast_variables(
    model_subset, {"lat": "latitude", "lon": "longitude"}
)
model_subset

convert sigma level to depth

In [None]:
# the model's "depth" axis is renamed to "level" and kept in a single chunk
reference_model = model_subset.rename({"depth": "level"}).chunk({"level": -1})

# derive the new "depth" and "bottom" variables by adding `XE` to the level
# and to `H0`, respectively
reference_model = reference_model.assign(
    depth=reference_model["XE"] + reference_model["level"],
    bottom=reference_model["XE"] + reference_model["H0"],
)
reference_model

reshape the tag log

In [None]:
%%time
# group the tag log by the time bins derived from the model's time bounds
reshaped_tag = reshape_by_bins(
    tag_log,
    dim="time",
    bins=bounds_to_bins(
        reference_model.cf.add_bounds(["time"], output_dim="bounds"),
        bounds_dim="bounds",
    ).get("time_bins"),
    bin_dim="bincount",
    other_dim="obs",
)
# one chunk per time step
reshaped_tag = reshaped_tag.chunk({"time": 1})
reshaped_tag

subtract the tag data from the model

In [None]:
%%time
# TODO: rewrite the function to make it composable, using a depth threshold is too sharp.
diff = diff_z(reference_model, reshaped_tag, depth_threshold=relative_depth_threshold)

# remember which tag the differences belong to
diff = diff.assign_attrs(tag_id=tag.attrs["pit_tag_id"])

# carry over `H0` and flag the cells where it is defined as ocean
diff = diff.assign(
    H0=reference_model["H0"],
    ocean_mask=reference_model["H0"].notnull(),
)

# one chunk per time step, spatial dimensions unchunked
diff = diff.chunk({"time": 1, "lat": -1, "lon": -1})
diff

save snapshot to disk

In [None]:
%%time
# write the differences to a zarr store, replacing any existing store (mode="w")
diff.to_zarr(f"{target_root}/diff.zarr", mode="w", consolidated=True)

cleanup

In [None]:
# drop the intermediates of this section; the next section re-opens the saved differences
del tag_log, cat, model, model_subset, reference_model, reshaped_tag, diff

## regrid to healpix

In [None]:
from xarray_healpy import HealpyGridInfo, HealpyRegridder

from pangeo_fish.grid import center_longitude

open the diff and clean

In [None]:
%%time
# re-open the saved differences
ds = xr.open_dataset(f"{target_root}/diff.zarr", engine="zarr", chunks={})

# load the 2D coordinates into memory and merge them back in
ds = ds.merge(ds[["latitude", "longitude"]].compute())

# switch to the positional dimensions `yi` / `xi` and drop the 1D coordinates
ds = ds.swap_dims({"lat": "yi", "lon": "xi"}).drop_vars(["lat", "lon"])
ds

define the target grid

In [None]:
# the grid is parametrized by its level (nside == 2**level); without this check
# a `nside` that is not a power of two would be silently truncated to the next
# lower level and the grid would not match the configured resolution
if nside < 1 or 2 ** int(np.log2(nside)) != nside:
    raise ValueError(f"nside must be a positive power of two, got {nside}")

grid = HealpyGridInfo(level=int(np.log2(nside)), rot=rot)
grid

In [None]:
# build the target grid for `ds`, then apply `center_longitude` with center 0
target_grid = center_longitude(grid.target_grid(ds), 0)
target_grid

compute the interpolation weights

In [None]:
%%time
# only the coordinates and the ocean mask are passed in to set up the
# bilinear regridder; the mask variable and the vertex threshold are forwarded
# as interpolation options
regridder = HealpyRegridder(
    ds[["longitude", "latitude", "ocean_mask"]],
    target_grid,
    method="bilinear",
    interpolation_kwargs=dict(mask="ocean_mask", min_vertices=min_vertices),
)
regridder

regrid

In [None]:
%%time
# apply the regridder to the full dataset
regridded = regridder.regrid_ds(ds)
regridded

reshape to 2D

In [None]:
%%time
# convert the regridded data to 2D, then apply `center_longitude` with center 0
reshaped = center_longitude(grid.to_2d(regridded), 0)
reshaped

save

In [None]:
%%time
# write the regridded differences to a zarr store, replacing any existing
# store (mode="w")
reshaped.to_zarr(
    f"{target_root}/diff-regridded.zarr",
    mode="w",
    consolidated=True,
    compute=True,
)

cleanup

In [None]:
# drop the intermediates of this section; the next section re-opens the saved result
del ds, grid, target_grid, regridder, regridded, reshaped

## construct emission matrix

In [None]:
from pangeo_fish.distributions import create_covariances, normal_at
from pangeo_fish.pdf import normal
from pangeo_fish.utils import temporal_resolution

open data

In [None]:
# re-open the regridded differences written in the previous section
differences = xr.open_dataset(
    f"{target_root}/diff-regridded.zarr",
    engine="zarr",
    chunks={},
)
differences

initial and final position

In [None]:
# coordinates of the grid, loaded into memory
grid = differences[["latitude", "longitude"]].compute()

# positions of the release and the death of the fish
initial_position = tag["tagging_events"].ds.sel({"event_name": "release"})
final_position = tag["tagging_events"].ds.sel({"event_name": "fish_death"})

# normalized normal distribution around the release position
cov = create_covariances(1e-6, coord_names=["latitude", "longitude"])
initial_probability = normal_at(
    grid,
    pos=initial_position,
    cov=cov,
    normalize=True,
    axes=["latitude", "longitude"],
)

# same for the final position, with the configured recapture uncertainty
cov = create_covariances(recapture_std**2, coord_names=["latitude", "longitude"])
final_probability = normal_at(
    grid,
    pos=final_position,
    cov=cov,
    normalize=True,
    axes=["latitude", "longitude"],
)

maximum displacement

In [None]:
# wrap the configured quantities in data arrays so they combine with the dataset
earth_radius_ = xr.DataArray(earth_radius, dims=None)
maximum_speed_ = xr.DataArray(maximum_speed, dims=None).pint.to("km / h")

# time step of the data in hours and grid resolution scaled by the earth radius
timedelta = temporal_resolution(differences["time"]).pint.quantify().pint.to("h")
grid_resolution = earth_radius_ * differences["resolution"].pint.quantify()

# maximum displacement per time step, expressed relative to the grid resolution
max_grid_displacement = (
    maximum_speed_ * timedelta * adjustment_factor / grid_resolution
)

# upper bound of the model parameter
max_sigma = (
    max_grid_displacement.pint.to("dimensionless").pint.magnitude / truncate
)
max_sigma

emission probability matrix

In [None]:
%%time
# normal pdf of the differences (zero mean, configured standard deviation)
emission_pdf = normal(
    differences["diff"], mean=0, std=differences_std, dims=["x", "y"]
).to_dataset(name="pdf")

# attach the initial / final probabilities and the ocean mask
emission_pdf = emission_pdf.assign(
    initial=initial_probability,
    final=final_probability,
    mask=differences["ocean_mask"],
)

# keep the attributes of the differences and record the parameter upper bound
emission_pdf = emission_pdf.assign_attrs(
    {**differences.attrs, "max_sigma": max_sigma}
)
emission_pdf = emission_pdf.chunk()
emission_pdf

save

In [None]:
# write the emission probabilities to a zarr store, replacing any existing store (mode="w")
emission_pdf.to_zarr(f"{target_root}/emission.zarr", mode="w", consolidated=True)

cleanup

In [None]:
# drop the intermediates of this section; the next section re-opens the saved emission matrix
del differences, grid, initial_probability, final_probability, emission_pdf

## additional emission probability matrix from acoustic ranges

In [None]:
from pangeo_fish import acoustic, utils

open data and clean

In [None]:
# re-open the saved emission matrix, with `x` and `y` each kept in a single chunk
emission = xr.open_dataset(
    f"{target_root}/emission.zarr", engine="zarr", chunks={"x": -1, "y": -1}
)

construct the emission probabilities

In [None]:
# compute the acoustic emission probabilities from the tag, the in-memory
# time / cell ids / mask of the emission matrix and the receiver buffer
acoustic_pdf = acoustic.emission_probability(
    tag, emission[["time", "cell_ids", "mask"]].compute(), receiver_buffer
)
# add the acoustic probabilities to the existing emission dataset
combined = emission.merge(acoustic_pdf)
combined

save

In [None]:
# write the combined emission probabilities to a separate zarr store (mode="w" replaces it)
combined.to_zarr(f"{target_root}/emission-acoustic.zarr", mode="w", consolidated=True)

cleanup

In [None]:
# drop the intermediates of this section; later sections re-open the saved store
del emission, acoustic_pdf, combined

## estimate the model parameter

In [None]:
# request a cluster size of 1 for the parameter estimation
# NOTE(review): presumably because the eager estimator works on in-memory data — confirm
cluster.scale(1)

In [None]:
from pangeo_fish.hmm.estimator import EagerScoreEstimator
from pangeo_fish.hmm.optimize import EagerBoundsSearch

open the data

In [None]:
# open the combined emission probabilities
emission = xr.open_dataset(
    f"{target_root}/emission-acoustic.zarr",
    engine="zarr",
    chunks={"x": -1, "y": -1, "time": "auto"},
    inline_array=True,
)
# merge the individual emission pdfs
emission = combine_emission_pdf(emission)
# convert to comment if the emission matrix does *not* fit in memory
emission = emission.compute()
emission

create and configure estimator and optimizer

In [None]:
# estimator with default parameters; the optimizer searches the model
# parameter between 1e-4 and the upper bound stored with the emission matrix
estimator = EagerScoreEstimator()
optimizer = EagerBoundsSearch(
    estimator,
    (1e-4, emission.attrs["max_sigma"]),
    optimizer_kwargs=dict(disp=3, xtol=tolerance),
)

fit the model parameter to the data

In [None]:
%%time
# run the parameter search on the emission matrix
optimized = optimizer.fit(emission)
optimized

save

In [None]:
# serialize the fitted estimator's parameters as JSON
params = optimized.to_dict()
with fsspec.open(f"{target_root}/parameters.json", mode="w") as f:
    f.write(json.dumps(params))

## state probabilities

recreate the estimator

In [None]:
# rebuild the estimator from the parameters stored on disk
with fsspec.open(f"{target_root}/parameters.json", mode="r") as f:
    params = json.loads(f.read())

optimized = EagerScoreEstimator(**params)
optimized

load the data

In [None]:
# open the combined emission probabilities (not loaded into memory here)
emission = xr.open_dataset(
    f"{target_root}/emission-acoustic.zarr",
    engine="zarr",
    chunks={"x": -1, "y": -1, "time": "auto"},
    inline_array=True,
)
# merge the individual emission pdfs
emission = combine_emission_pdf(emission)
emission

predict the state probabilities

In [None]:
%%time
# predict the state probabilities with the restored estimator
states = optimized.predict_proba(emission)
states

save

In [None]:
%%time
# rechunk (automatic along time, `x` and `y` unchunked) and write to a zarr
# store, replacing any existing store (mode="w")
states.chunk({"time": "auto", "x": -1, "y": -1}).to_zarr(
    f"{target_root}/states.zarr", mode="w", consolidated=True
)

cleanup

In [None]:
# drop the state probabilities; the track decoding re-opens them from disk
del states

## track decoding

In [None]:
from pangeo_fish import tracks
from pangeo_fish.hmm.estimator import EagerScoreEstimator

open data

In [None]:
# open the combined emission probabilities
emission = xr.open_dataset(
    f"{target_root}/emission-acoustic.zarr",
    engine="zarr",
    chunks={"x": -1, "y": -1, "time": "auto"},
    inline_array=True,
)
# merge the individual emission pdfs and load the result into memory
emission = combine_emission_pdf(emission)
emission = emission.compute()
emission

In [None]:
# open the saved state probabilities and load them into memory
states = xr.open_dataset(
    f"{target_root}/states.zarr",
    engine="zarr",
    chunks={},
    inline_array=True,
)
states = states.compute()
states

In [None]:
# rebuild the estimator from the parameters stored on disk
with fsspec.open(f"{target_root}/parameters.json", mode="r") as f:
    params = json.loads(f.read())

optimized = EagerScoreEstimator(**params)
optimized

decode tracks

In [None]:
%%time
# decode one track per configured mode
trajectories = optimized.decode(emission, states, mode=track_modes, progress=True)
# add the configured additional quantities to the tracks
trajectories = tracks.additional_quantities(
    trajectories, quantities=additional_track_quantities
)
trajectories

save

In [None]:
from pangeo_fish.io import save_trajectories

In [None]:
# write the decoded tracks below `tracks_root` in parquet format
save_trajectories(trajectories, tracks_root, format="parquet")

cleanup

In [None]:
# drop the data of this section
del emission, states, trajectories