# Load and preprocess data using `movement`
## Imports

In [8]:
import os
from pathlib import Path

from matplotlib import pyplot as plt

from movement.io import load_poses, save_poses
from movement.utils.reports import report_nan_values

# Use functions from utils.py, located within the same directory
from utils import reshape_loaded_ds, clean_data

## Define and create paths
First, let's locate the data folder on the current machine and check that it contains the expected subfolders.

In [9]:
# Replace this path with the location of the data folder on your system
data_folder = Path("/Users/nsirmpilatze/Data/in2research2024")
# Fail early, with a readable message, if the folder is missing
assert data_folder.is_dir(), f"Data folder not found: {data_folder}"
print(f"Data will be loaded from {data_folder}")

# The following resident mouse IDs must be present as subfolders in the data folder
resident_ids = ["SB019", "SB021"]
# The loop variable is deliberately not called `id`: that would shadow the
# builtin `id()` for every later cell of the notebook.
for mouse_id in resident_ids:
    assert (data_folder / mouse_id).is_dir(), (
        f"Subfolder {mouse_id} not found in {data_folder}"
    )
    print(f"Subfolder {mouse_id} has been found")

Data will be loaded from /Users/nsirmpilatze/Data/in2research2024
Subfolder SB019 has been found
Subfolder SB021 has been found


Now let's create subfolders for saving cleaned data, plots and reports.

In [10]:
# Output locations, all direct children of the data folder
clean_data_folder = data_folder / "clean_data"
plot_folder = data_folder / "plots"
report_folder = data_folder / "reports"

# Create each folder in turn (no error if it already exists) and report
# what will be stored in it.
for out_dir, announcement in (
    (clean_data_folder, f"Cleaned data will be saved in {clean_data_folder}"),
    (plot_folder, f"Plots will be saved in {plot_folder}"),
    (report_folder, f"Reports will be saved in {report_folder}"),
):
    out_dir.mkdir(exist_ok=True)
    print(announcement)

Cleaned data will be saved in /Users/nsirmpilatze/Data/in2research2024/clean_data
Plots will be saved in /Users/nsirmpilatze/Data/in2research2024/plots
Reports will be saved in /Users/nsirmpilatze/Data/in2research2024/reports


## Define metadata

Define the name of the DeepLabCut pose-tracking file (`.csv`) for each mouse pair, and the corresponding time interval during which both mice were present in the arena. The times are given in seconds.

In [5]:
# Metadata for each mouse pair, keyed by "<resident ID>_<intruder ID>".
# Every entry holds:
#   file_name - name of the pose file, located in the resident's subfolder
#   time_on   - start of the interval of interest (seconds)
#   time_off  - end of the interval of interest (seconds)
data = {
    "SB019_female4": {
        "file_name": "220719_SB019_FM001_female4_2022-07-19-181533DLC_resnet50_shanice_allNov29shuffle1_196000_filtered.csv",
        "time_on": 30,
        "time_off": 330,
    },
    "SB019_male2": {
        "file_name": "220719_SB019_FM001_male2_2022-07-19-172457DLC_resnet50_shanice_allNov29shuffle1_196000_filtered.csv",
        "time_on": 29,
        "time_off": 323,
    },
    "SB021_female2": {
        "file_name": "220804_SB021_FM001_female2_2022-08-04-223620DLC_resnet50_shanice_allNov29shuffle1_196000_filtered.csv",
        "time_on": 40,
        "time_off": 350,
    },
    "SB021_male1": {
        "file_name": "220804_SB021_FM001_male1_2022-08-04-215616DLC_resnet50_shanice_allNov29shuffle1_196000_filtered.csv",
        "time_on": 31,
        "time_off": 334,
    },
}

Define the names of the tracked individuals and keypoints.

In [6]:
# Labels of the two tracked animals in each recording
individuals = ["resident", "intruder"]

# Labels of the tracked body parts.
# NOTE(review): the order presumably has to match what `reshape_loaded_ds`
# expects for the loaded file - confirm before reordering.
keypoint_names = [
    "nose", "leftear", "rightear", "butt", "neck",
    "lefthip", "righthip", "leftshoulder", "rightshoulder", "lowerback",
]

## Clean the data using a pipeline

In [7]:
def _save_keypoint_plot(data_array, keypoint, png_path, **line_kwargs):
    """Plot one keypoint's time series as lines, save it to disk and close the figure.

    Parameters
    ----------
    data_array : xarray.DataArray
        Array to plot, e.g. ``ds["position"]`` or ``ds["confidence"]``. It is
        indexed along ``keypoints`` and plotted against ``time`` with one
        line colour per entry of ``individuals``.
    keypoint : str
        Label of the keypoint to select along the ``keypoints`` dimension.
    png_path : pathlib.Path or str
        Destination file of the saved figure.
    **line_kwargs
        Extra keyword arguments forwarded to ``plot.line`` (e.g.
        ``row="space"`` to draw one panel per spatial coordinate).
    """
    data_array.sel(keypoints=keypoint).plot.line(
        x="time", hue="individuals", aspect=5, size=2.5, **line_kwargs
    )
    plt.savefig(png_path)
    # Close immediately: the calling loop creates many figures
    plt.close()


for pair_name, pair_dict in data.items():
    # Pair names follow the "<resident ID>_<intruder ID>" pattern;
    # the input file lives in the resident's subfolder.
    resident_id, intruder_id = pair_name.split("_")
    file_path = data_folder / resident_id / pair_dict["file_name"]
    assert file_path.is_file(), f"Input file not found: {file_path}"
    print(f"Processing data for {resident_id} and {intruder_id}")

    # Load the data
    ds_raw = load_poses.from_dlc_file(file_path, fps=50)
    print("Data has been loaded successfully.")

    # Reshape the data into a multi-individual dataset
    ds = reshape_loaded_ds(ds_raw, individuals, keypoint_names)
    print(f"Data has been reshaped successfully into a dataset with two individuals: {individuals}")

    # Select the interval of interest (both bounds are given in seconds)
    ds = ds.sel(time=slice(pair_dict["time_on"], pair_dict["time_off"]))
    print(f"Selected interval from {pair_dict['time_on']} to {pair_dict['time_off']} seconds.")

    # Clean the data with a combination of confidence thresholding, interpolation, and smoothing
    ds_clean = clean_data(
        ds,
        confidence_threshold=0.8,
        interp_max_gap=25,
        smooth_window_size=7,
        smooth_min_periods=2,
    )

    # Save the cleaned data to a new CSV file. Any file left over from a
    # previous run is deleted first (presumably because the save function
    # does not overwrite existing files - TODO confirm).
    clean_file_path = clean_data_folder / f"{pair_name}_clean.csv"
    clean_file_path.unlink(missing_ok=True)
    save_poses.to_dlc_file(
        ds_clean, clean_file_path, split_individuals=False,
    )

    # Generate a report on the number of NaN values in the cleaned dataset
    # and save it to a text file
    nan_report = report_nan_values(ds_clean["position"], f"clean data for {pair_name}")
    nan_report_path = report_folder / f"{pair_name}_clean_nan_report.txt"
    nan_report_path.write_text(nan_report, encoding="utf-8")
    print("Filtered data and saved cleaned dataset.")

    # Generate time series plots for all keypoints
    plot_prefix = f"{resident_id}_{intruder_id}"
    for kpt_to_plot in ds.keypoints.values:
        # Position before cleaning, one panel per spatial coordinate
        _save_keypoint_plot(
            ds["position"],
            kpt_to_plot,
            plot_folder / f"{plot_prefix}_{kpt_to_plot}_position_raw_plot.png",
            row="space",
        )
        # Position after cleaning, one panel per spatial coordinate
        _save_keypoint_plot(
            ds_clean["position"],
            kpt_to_plot,
            plot_folder / f"{plot_prefix}_{kpt_to_plot}_position_clean_plot.png",
            row="space",
        )
        # Confidence values of the cleaned dataset
        _save_keypoint_plot(
            ds_clean["confidence"],
            kpt_to_plot,
            plot_folder / f"{plot_prefix}_{kpt_to_plot}_confidence_plot.png",
        )
    print("Diagnostic plots have been generated and saved to disk.")

    print(f"Finished processing data for {resident_id} and {intruder_id}.\n")

Processing data for SB019 and female4
Data has been loaded successfully.
Data has been reshaped successfully into a dataset with two individuals: ['resident', 'intruder']
Selected interval from 30 to 330 seconds.


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


Filtered data and saved cleaned dataset.
Diagnostic plots have been generated and saved to disk.
Finished processing data for SB019 and female4.

Processing data for SB019 and male2
Data has been loaded successfully.
Data has been reshaped successfully into a dataset with two individuals: ['resident', 'intruder']
Selected interval from 29 to 323 seconds.
Filtered data and saved cleaned dataset.
Diagnostic plots have been generated and saved to disk.
Finished processing data for SB019 and male2.

Processing data for SB021 and female2
Data has been loaded successfully.
Data has been reshaped successfully into a dataset with two individuals: ['resident', 'intruder']
Selected interval from 40 to 350 seconds.
Filtered data and saved cleaned dataset.
Diagnostic plots have been generated and saved to disk.
Finished processing data for SB021 and female2.

Processing data for SB021 and male1
Data has been loaded successfully.
Data has been reshaped successfully into a dataset with two individua