# Parse data structure into a pandas DataFrame
Parse data paths from the Smart-Kages folder structure and store them in a pandas DataFrame.

Also, load time adjustments to help estimate start and end times for each 1-hour segment.

In [5]:
from pathlib import Path

import numpy as np
import pandas as pd
import sleap_io as sio

from smart_kages_movement.datetime import find_segment_overlaps
from smart_kages_movement.io import (
    adjust_start_datetimes,
    parse_data_into_df,
)

## Configuration
Define some global variables and paths.

In [8]:
# Acquisition rate shared by every video in the dataset.
FPS = 2  # Frames per second for all videos

# Root of the raw data: one sub-folder per kage lives directly underneath.
data_dir = Path.home().joinpath(
    "UCL Dropbox",
    "Loukia Katsouri",
    "DataProtocolsEquipment",
    "SmartKages",
    "1.Analysis_DS_Apr-May2024",
    "RawData_300525",
)
assert data_dir.exists(), f"Data directory {data_dir} does not exist."

# Processed outputs go into a dedicated sub-folder of 'data_dir';
# it is created on first use and reused on subsequent runs.
save_dir = data_dir / "movement_analysis"
save_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# List the direct children of the data directory (files and folders alike)
# as a quick sanity check that we are pointing at the right place.
print(f"{data_dir} contains the following files:")
for entry_name in (entry.name for entry in data_dir.iterdir()):
    print(entry_name)

/Users/loukia/UCL Dropbox/Loukia Katsouri/DataProtocolsEquipment/SmartKages/1.Analysis_DS_Apr-May2024/RawData_300525 contains the following files:
kage7
kage25
kage22
kage14
kage9
kage13
kage12
kage15
kage8
Csv files with Tasks
kage23
kage1
kage6
kage24
.DS_Store
kage30
kage31
movement_analysis
kage21
kage3
kage4
kage26
kage19
kage10
kage17
kage28
kage16
kage29
kage11
kage5
kage27
kage18
kage20
kage2
kage33
previousaKages
kage32


## Aggregate segment paths into a single dataframe

The data is stored per Smart-Kage, in folders named `kageN`, e.g. `kage1`, `kage2`, etc.

Each Smart-Kage folder contains:
- daily videos are stored in `videos/YYYY/MM/DD/`, split into 1-hour segments. Each 1-hour segment is an `.mp4` file named `kageN_YYYYMMDD_HHMMSS.mp4`.
- corresponding DeepLabCut (DLC) predictions are stored in `analysis/dlc_output/YYYY/MM/DD/`. Each 1-hour `.h5` file therein is prefixed with `kageN_YYYYMMDD_HHMMSS`.

Let's parse the relevant parts of the data structure into a single dataframe.

In [10]:
# Collect the per-segment file paths found under data_dir into one dataframe.
# Judging by the displayed output, rows are indexed by (kage, date, hour) and
# hold the filename-derived start_datetime plus the pose and video file paths.
df = parse_data_into_df(data_dir)
# Preview the first few rows.
df.head()

Found 33 kage directories:  kage1 kage10 kage11 kage12 kage13 kage14 kage15 kage16 kage17 kage18 kage19 kage2 kage20 kage21 kage22 kage23 kage24 kage25 kage26 kage27 kage28 kage29 kage3 kage30 kage31 kage32 kage33 kage4 kage5 kage6 kage7 kage8 kage9
Found a total of 25658 .h5 pose files output by DLC.


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,start_datetime,pose_file_path,video_file_path
kage,date,hour,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
kage1,20240403,9,2024-04-03 09:54:20,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...
kage1,20240403,10,2024-04-03 10:00:02,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...
kage1,20240403,11,2024-04-03 11:01:03,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...
kage1,20240403,12,2024-04-03 12:01:04,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...
kage1,20240403,13,2024-04-03 13:01:03,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...


## Add some video metadata
We read each video's `n_frames`, `height`, `width`, and `n_channels` from the video file itself, using `sleap-io`.

These metadata are added as columns to the dataframe.

In [11]:
# Names of the metadata columns derived from each video's shape.
shape_columns = ["n_frames", "height", "width", "n_channels"]

# Collect one (n_frames, height, width, n_channels) record per segment, in the
# same order as the rows of `df`, and build the dataframe once at the end
# (much cheaper than writing individual cells with `.loc` inside the loop).
shape_records = []
for video_path in df["video_file_path"]:
    video = sio.load_video(video_path)  # Lazy-load the video using sleap_io
    try:
        shape = video.shape
        # A shape without a 4th entry is treated as a single-channel video.
        n_channels = shape[3] if len(shape) > 3 else 1
        shape_records.append((shape[0], shape[1], shape[2], n_channels))
    finally:
        # Always release the video handle, even if reading the shape failed.
        video.close()

video_shapes = pd.DataFrame(
    shape_records,
    index=df.index,
    columns=shape_columns,
).astype(int)

# Concatenate the video shapes with the original DataFrame.
# Any shape columns left over from a previous run of this cell are dropped
# first, so re-running the cell does not create duplicate columns.
df = pd.concat(
    [df.drop(columns=shape_columns, errors="ignore"), video_shapes],
    axis=1,
)

Let's see if there are any videos where `n_channels` is not 3, which would indicate a non-RGB video.

In [12]:
# Keep only the segments whose video does not have exactly 3 channels.
df_non_rgb = df[df["n_channels"] != 3]
print(f"Found {len(df_non_rgb)} non-RGB videos.")
# Display the whole selection (pandas truncates long frames on its own).
df_non_rgb

Founcd 25 non-RGB videos.


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,start_datetime,pose_file_path,video_file_path,n_frames,height,width,n_channels
kage,date,hour,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
kage1,20240403,11,2024-04-03 11:01:03,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,7064,376,500,1
kage1,20240404,4,2024-04-04 04:01:04,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,7023,376,500,1
kage1,20240404,6,2024-04-04 06:01:03,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,7019,376,500,1
kage18,20240403,11,2024-04-03 11:15:38,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,5303,376,500,1
kage2,20240508,18,2024-05-08 18:00:01,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,7191,376,500,1
kage2,20240508,19,2024-05-08 19:00:02,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,7189,376,500,1
kage2,20240508,20,2024-05-08 20:00:02,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,7189,376,500,1
kage2,20240509,10,2024-05-09 10:00:02,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,7189,376,500,1
kage2,20240509,11,2024-05-09 11:00:02,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,7182,376,500,1
kage2,20240509,13,2024-05-09 13:00:01,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,7191,376,500,1


## Adjust datetimes for each segment
Above, we derived the start datetime of each video from its filename.
However, this is usually not the exact time when the video started recording.

We can find the exact start time for each video using the `adjustments.txt` file.
There is one such file per day, stored in `kageN/videos/YYYY/MM/DD/`.

`adjustments.txt` contains one row per video file, formatted as `video_file:H,M,S`.
For example, this could be `kage1_20240420_000002.mp4:0,0,6`.

The hours, minutes, and seconds represent the offset relative to time `00:00:00` of that day.
Negative values are possible, e.g. `kage3_20240425_070002.mp4:7,-1,21`
*probably* means the video started at `06:59:21` on that day.

In [13]:
# Replace the filename-derived start datetimes with the adjusted ones.
df = adjust_start_datetimes(df)

# Sanity check: within each kage, consecutive segments must never start
# earlier than the one before them (kages are visited in order of appearance).
no_gap = pd.Timedelta(0)
for kage, kage_segments in df.groupby(level="kage", sort=False):
    start_gaps = np.diff(kage_segments["start_datetime"].values)
    assert np.all(start_gaps >= no_gap), (
        f"Start datetimes for kage {kage} are not monotonic increasing."
    )

df.head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,start_datetime,pose_file_path,video_file_path,n_frames,height,width,n_channels
kage,date,hour,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
kage1,20240403,9,2024-04-03 09:54:24,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,665,376,500,3
kage1,20240403,10,2024-04-03 10:00:06,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,7183,376,500,3
kage1,20240403,11,2024-04-03 11:01:07,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,7064,376,500,1
kage1,20240403,12,2024-04-03 12:01:08,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,7059,376,500,3
kage1,20240403,13,2024-04-03 13:01:07,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,7058,376,500,3
kage1,20240403,14,2024-04-03 14:01:07,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,7062,376,500,3
kage1,20240403,15,2024-04-03 15:01:08,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,7061,376,500,3
kage1,20240403,16,2024-04-03 16:01:07,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,7061,376,500,3
kage1,20240403,17,2024-04-03 17:01:07,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,7060,376,500,3
kage1,20240403,18,2024-04-03 18:01:07,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,7065,376,500,3


Based on the `n_frames` and the `fps` we can estimate each segment's `duration`.

Adding the `duration` to the `start_datetime`, we can get an estimate for `end_datetime`.

In [14]:
# Estimate each segment's duration from its frame count. FPS (frames per
# second) is the constant defined in the configuration cell; it is reused here
# rather than redefined so the two values cannot drift apart.
df["duration"] = df["n_frames"] * pd.Timedelta(1 / FPS, "sec")

# Ensure that no segment lasts longer than an hour
# (checked element-wise so that an empty dataframe passes trivially).
assert (df["duration"] <= pd.Timedelta(1, "hour")).all(), (
    "Some segments last longer than an hour, which is unexpected."
)
# Calculate end datetime based on start datetime and duration
df["end_datetime"] = df["start_datetime"] + df["duration"]

# Reorder columns for readability: timing first, then video shape, then paths
df = df[
    [
        "start_datetime",
        "end_datetime",
        "duration",
        "n_frames",
        "n_channels",
        "height",
        "width",
        "pose_file_path",
        "video_file_path",
    ]
]
df.head(500)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,start_datetime,end_datetime,duration,n_frames,n_channels,height,width,pose_file_path,video_file_path
kage,date,hour,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
kage1,20240403,09,2024-04-03 09:54:24,2024-04-03 09:59:56.500,0 days 00:05:32.500000,665,3,376,500,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...
kage1,20240403,10,2024-04-03 10:00:06,2024-04-03 10:59:57.500,0 days 00:59:51.500000,7183,3,376,500,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...
kage1,20240403,11,2024-04-03 11:01:07,2024-04-03 11:59:59.000,0 days 00:58:52,7064,1,376,500,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...
kage1,20240403,12,2024-04-03 12:01:08,2024-04-03 12:59:57.500,0 days 00:58:49.500000,7059,3,376,500,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...
kage1,20240403,13,2024-04-03 13:01:07,2024-04-03 13:59:56.000,0 days 00:58:49,7058,3,376,500,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...
kage1,...,...,...,...,...,...,...,...,...,...,...
kage1,20240428,08,2024-04-28 08:00:06,2024-04-28 09:00:00.000,0 days 00:59:54,7188,3,376,500,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...
kage1,20240428,09,2024-04-28 09:00:06,2024-04-28 10:00:00.500,0 days 00:59:54.500000,7189,3,376,500,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...
kage1,20240428,10,2024-04-28 10:00:06,2024-04-28 11:00:00.500,0 days 00:59:54.500000,7189,3,376,500,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...
kage1,20240428,11,2024-04-28 11:00:06,2024-04-28 12:00:00.500,0 days 00:59:54.500000,7189,3,376,500,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...,/Users/loukia/UCL Dropbox/Loukia Katsouri/Data...


Let's find out if there are any overlapping segments based on the estimated `start_datetime` and `end_datetime`.

In [15]:
# Check every segment for overlaps. The previous debugging slice
# (`df[1:26000]`) skipped the first row and would silently ignore any rows
# beyond the 26000th, so the full dataframe is passed instead.
overlaps = find_segment_overlaps(df)
overlaps

Found 2 overlapping segments.


Unnamed: 0,segment_A,segment_B,end_A,start_B,overlap_duration
0,"(kage3, 20240425, 06)","(kage3, 20240425, 07)",2024-04-25 06:59:52.500,2024-04-25 06:59:21,00:00:31.500000
1,"(kage3, 20240504, 06)","(kage3, 20240504, 07)",2024-05-04 06:59:39.000,2024-05-04 06:59:21,00:00:18


## Save the dataframes as CSV files
Finally, we save the `df`, `overlaps`, and `df_non_rgb` dataframes to CSV files for later use.

In [16]:
# Destination files, all inside the processed-data folder.
df_file_path, overlaps_file_path, df_non_rgb_file_path = (
    save_dir / file_name
    for file_name in (
        "all_segments.csv",
        "segment_overlaps.csv",
        "non_rgb_segments.csv",
    )
)

# Write each table to its CSV file (non-RGB subset, full table, overlaps).
for table, csv_path in (
    (df_non_rgb, df_non_rgb_file_path),
    (df, df_file_path),
    (overlaps, overlaps_file_path),
):
    table.to_csv(csv_path)

# Report where everything ended up.
print(f"Dataframe saved to {df_file_path}")
print(f"Overlaps saved to {overlaps_file_path}")
print(f"Non-RGB segments saved to {df_non_rgb_file_path}")

Dataframe saved to /Users/loukia/UCL Dropbox/Loukia Katsouri/DataProtocolsEquipment/SmartKages/1.Analysis_DS_Apr-May2024/RawData_300525/movement_analysis/all_segments.csv
Overlaps saved to /Users/loukia/UCL Dropbox/Loukia Katsouri/DataProtocolsEquipment/SmartKages/1.Analysis_DS_Apr-May2024/RawData_300525/movement_analysis/segment_overlaps.csv
Non-RGB segments saved to /Users/loukia/UCL Dropbox/Loukia Katsouri/DataProtocolsEquipment/SmartKages/1.Analysis_DS_Apr-May2024/RawData_300525/movement_analysis/non_rgb_segments.csv
