# Load data from Smart-Kages into `movement` datasets
Load all DLC .h5 pose files for each kage and concatenate them
into a single `movement` dataset per kage.

Assign a datetime index across the `time` dimension for easy access.

Save the resulting dataset to a netCDF file.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import xarray as xr
from matplotlib import pyplot as plt
from movement.io import load_poses
from tqdm import tqdm

from smart_kages_movement.io import load_background_frame

## Configuration
Define some global variables and paths.

In [5]:
# Frame rate passed as `fps` when loading the pose files.
FPS = 2  # frames per second
# TODO: verify this spatial calibration value before relying on it.
PIXELS_PER_CM = 10  # pixels per centimetre (need to double-check this)
# Unit string used for timestamps (e.g. in the `datetime64[...]` attribute).
TIME_PRECISION = "ns"

# Configure seaborn for prettier plots
sns.set_context("notebook")
sns.set_style("ticks")

In [6]:
# Directory containing the Smart-Kages data
data_dir = Path("/mnt/Data/Smart-Kages/2024-04-Apr-May-Downs/DB")
# Analysis outputs live in a sibling folder next to the data directory
analysis_dir = data_dir.with_name("movement_analysis")
# csv file generated by the 01 notebook
df_path = analysis_dir.joinpath("all_segments.csv")


# Report (without raising) every expected path that is missing
for path in (data_dir, analysis_dir, df_path):
    if path.exists():
        continue
    print(f"Path does not exist: {path}")

## Load CSV file as a dataframe

We load the dataframe containing the paths to all the DLC .h5 files, which is generated in the 01 notebook.

In [7]:
# Read the segment table: the first three columns form the hierarchical
# index, and the two datetime columns are parsed on load.
df = pd.read_csv(
    filepath_or_buffer=df_path,
    parse_dates=["start_datetime", "end_datetime"],
    dtype={"date": str, "hour": str, "n_frames": int},
    index_col=[0, 1, 2],
)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,start_datetime,end_datetime,n_frames,n_channels,height,width,pose_file_path,video_file_path,timestamps_file_path
kage,date,hour,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
kage1,20240403,9,2024-04-03 09:54:24,2024-04-03 09:59:59.495354911,665,3,376,500,/mnt/Data/Smart-Kages/2024-04-Apr-May-Downs/DB...,/mnt/Data/Smart-Kages/2024-04-Apr-May-Downs/DB...,/mnt/Data/Smart-Kages/2024-04-Apr-May-Downs/DB...
kage1,20240403,10,2024-04-03 10:00:06,2024-04-03 10:59:59.506997932,7183,3,376,500,/mnt/Data/Smart-Kages/2024-04-Apr-May-Downs/DB...,/mnt/Data/Smart-Kages/2024-04-Apr-May-Downs/DB...,/mnt/Data/Smart-Kages/2024-04-Apr-May-Downs/DB...
kage1,20240403,11,2024-04-03 11:01:07,2024-04-03 11:59:59.505496638,7064,1,376,500,/mnt/Data/Smart-Kages/2024-04-Apr-May-Downs/DB...,/mnt/Data/Smart-Kages/2024-04-Apr-May-Downs/DB...,/mnt/Data/Smart-Kages/2024-04-Apr-May-Downs/DB...
kage1,20240403,12,2024-04-03 12:01:08,2024-04-03 12:59:59.510819259,7059,3,376,500,/mnt/Data/Smart-Kages/2024-04-Apr-May-Downs/DB...,/mnt/Data/Smart-Kages/2024-04-Apr-May-Downs/DB...,/mnt/Data/Smart-Kages/2024-04-Apr-May-Downs/DB...
kage1,20240403,13,2024-04-03 13:01:07,2024-04-03 13:59:59.509458847,7058,3,376,500,/mnt/Data/Smart-Kages/2024-04-Apr-May-Downs/DB...,/mnt/Data/Smart-Kages/2024-04-Apr-May-Downs/DB...,/mnt/Data/Smart-Kages/2024-04-Apr-May-Downs/DB...


We see that the dataframe holds various information on each 1-hour segment of the data.

The index is hierarchical, organising the data first by `kage`, then by `date`, and finally by `hour`.

Of special relevance to us here:
- Paths: `pose_file_path`, `video_file_path`, `timestamps_file_path`
- `start_datetime` and `end_datetime` for each segment

## Load all data from a given kage
We will create a function that, given a kage name, will load all the data from the DLC .h5 files and merge them into a single `movement` dataset.

In [8]:
def kage_to_movement_ds(
    df: pd.DataFrame,
    kage: str,
) -> tuple[xr.Dataset, np.ndarray]:
    """Load all poses for a given kage and return an xarray Dataset.

    Every 1-hour segment listed for ``kage`` is loaded from its DeepLabCut
    pose file, re-indexed along ``time`` with the datetime timestamps read
    from the segment's timestamps file, and concatenated in chronological
    order.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing the paths to pose files as well as metadata
        for each 1-hour segment. It must be indexed by
        (``kage``, ``date``, ``hour``) and provide the columns
        ``start_datetime``, ``n_frames``, ``pose_file_path``,
        ``timestamps_file_path`` and ``video_file_path``.
    kage : str
        The name of the kage to process, e.g., "kage1", "kage2", etc.

    Returns
    -------
    xr.Dataset
        An xarray Dataset containing the poses for the specified kage,
        with time coordinates assigned based on the timestamps files.
    np.ndarray
        A background image (numpy array) loaded from the video of the
        middle segment of the kage, used for visualization purposes.

    Raises
    ------
    AssertionError
        If the number of tracked timepoints in a segment differs from its
        ``n_frames`` entry, or if the concatenated timestamps are not
        monotonically increasing.

    Notes
    -----
    The returned Dataset will have two time coordinates:
    - ``time``: the primary time coordinate based on datetime timestamps.
    - ``seconds_elapsed``: the secondary time coordinate representing
       seconds elapsed since the start of the kage.

    If a segment starts more than 1 second after the previous segment
    ended, the first time point of that segment is set to NaN to mark
    the data gap.
    """

    def _is_monotonic_increasing(arr):
        """Check if a 1D array is monotonically increasing (ties allowed)."""
        return (arr[1:] >= arr[:-1]).all()

    print(f"Processing kage: {kage}")
    # Order segments by their actual start time. Sorting the (date, hour)
    # index is not reliably chronological: if the hours are stored as
    # non-zero-padded strings, "10" sorts before "9".
    df_kage = df.loc[kage].sort_values("start_datetime", kind="stable")
    n_days = df_kage.index.get_level_values("date").nunique()
    print(f"Number of days: {n_days}")
    n_segments = df_kage.shape[0]
    print(f"Number of 1-hour segments: {n_segments}")

    kage_start_datetime = pd.Timestamp(
        df_kage["start_datetime"].iloc[0], unit=TIME_PRECISION
    )

    ds_segments = []  # List of xarray Datasets for each 1-hour segment

    # Maximum pause between consecutive segments that is not treated as a gap
    gap_tolerance = pd.Timedelta("1s")
    previous_segment_end = kage_start_datetime

    # Iterate over rows directly: one pass over the table, no repeated
    # label-based lookups on the (date, hour) index.
    for segment in df_kage.itertuples(index=True):
        date, hour = segment.Index

        # Load the pose data for the current 1-hour segment
        poses = load_poses.from_file(
            segment.pose_file_path,
            source_software="DeepLabCut",
            fps=FPS,
        )

        # Assert that length of tracks matches the number of video frames
        n_frames = segment.n_frames
        assert poses.sizes["time"] == n_frames, (
            f"Number of tracked timepoints ({poses.sizes['time']}) does not "
            f"match the number of frames ({n_frames}) for {date} at {hour}!"
        )

        # Load datetime timestamps from file (in ISO format)
        timestamps = (
            pd.read_csv(
                segment.timestamps_file_path,
                header=None,
                index_col=False,
                parse_dates=[0],
            )
            .iloc[:, 0]
            .values
        )

        # assign time coordinates to the actual datetime timestamps
        poses = poses.assign_coords(time=timestamps)
        poses.attrs["time_unit"] = f"datetime64[{TIME_PRECISION}]"

        # If more than 1 second has passed since the previous segment ended
        # Mark the first time point as NaN to indicate a data gap.
        if pd.Timestamp(timestamps[0]) - previous_segment_end > gap_tolerance:
            poses.loc[{"time": timestamps[0]}] = np.nan

        previous_segment_end = timestamps[-1]

        # add to list of loaded segments
        ds_segments.append(poses)

    # Combine all segments into a single xarray Dataset
    ds_kage = xr.concat(ds_segments, dim="time")
    ds_kage.attrs["kage"] = kage
    ds_kage.attrs["kage_start_datetime"] = kage_start_datetime.isoformat()

    # Ensure the concatenated timestamps are monotonic increasing
    assert _is_monotonic_increasing(ds_kage.time.values), (
        f"Combined timestamps for {kage} are not monotonic increasing!"
    )

    # Assign secondary time coordinate as seconds elapsed since kage start
    seconds_since_kage_start = (
        ds_kage.time.data - np.datetime64(kage_start_datetime)
    ) / pd.Timedelta("1s")
    ds_kage = ds_kage.assign_coords(
        seconds_elapsed=("time", seconds_since_kage_start)
    )

    # load image to use as background frame (taken from the middle segment)
    video_path = df_kage.iloc[n_segments // 2]["video_file_path"]
    background_img = load_background_frame(
        video_path=video_path, i=0, n_average=100
    )
    print(f"Loaded background image for {kage} from {video_path} \n")

    return ds_kage, background_img

## Save tracking data from each kage to netCDF files

We first create a combined `movement` dataset for each kage.
We also assign a background image to each kage, which is used for visualisation purposes.

Then, we save the combined `movement` datasets for each kage to a netCDF file.
We also save the background image for each kage in the same directory.

In [12]:
# When False, kages whose netCDF file already exists are skipped.
OVERWRITE_EXISTING = False  # Set this to True to overwrite existing files

In [14]:
for kage in tqdm(df.index.get_level_values("kage").unique()):
    # Per-kage output folder and the two files written into it.
    kage_dir = analysis_dir / kage
    kage_dir.mkdir(parents=True, exist_ok=True)
    ds_file_path = kage_dir / f"{kage}.nc"
    img_file_path = kage_dir / f"{kage}_background.png"

    # Guard clause: leave an existing netCDF file untouched unless
    # overwriting was explicitly requested.
    if not OVERWRITE_EXISTING and ds_file_path.exists():
        print(
            f"Found existing netCDF file for {kage} at {ds_file_path}. "
            f"Will not overwrite."
        )
        continue

    # Build the combined movement dataset and background image
    ds, img = kage_to_movement_ds(df, kage)
    print(f"Converted data from {kage} to a 'movement' xarray Dataset.")

    # Write the dataset to disk as NetCDF4
    ds.to_netcdf(ds_file_path, mode="w", engine="netcdf4", format="NETCDF4")
    print(f"Dataset for {kage} saved to {ds_file_path}.")

    # Write the background image next to the dataset
    plt.imsave(img_file_path, img)
    print(f"Background image for {kage} saved to {img_file_path}.\n")

    # Drop the references so memory can be reclaimed before the next kage
    del ds, img

100%|██████████| 33/33 [00:00<00:00, 11081.83it/s]

Found existing netCDF file for kage1 at /mnt/Data/Smart-Kages/2024-04-Apr-May-Downs/movement_analysis/kage1/kage1.nc. Will not overwrite.
Found existing netCDF file for kage10 at /mnt/Data/Smart-Kages/2024-04-Apr-May-Downs/movement_analysis/kage10/kage10.nc. Will not overwrite.
Found existing netCDF file for kage11 at /mnt/Data/Smart-Kages/2024-04-Apr-May-Downs/movement_analysis/kage11/kage11.nc. Will not overwrite.
Found existing netCDF file for kage12 at /mnt/Data/Smart-Kages/2024-04-Apr-May-Downs/movement_analysis/kage12/kage12.nc. Will not overwrite.
Found existing netCDF file for kage13 at /mnt/Data/Smart-Kages/2024-04-Apr-May-Downs/movement_analysis/kage13/kage13.nc. Will not overwrite.
Found existing netCDF file for kage14 at /mnt/Data/Smart-Kages/2024-04-Apr-May-Downs/movement_analysis/kage14/kage14.nc. Will not overwrite.
Found existing netCDF file for kage15 at /mnt/Data/Smart-Kages/2024-04-Apr-May-Downs/movement_analysis/kage15/kage15.nc. Will not overwrite.
Found existing n


