# Build Datasets

### Handle imports

In [1]:
import sys
import os
import time

# The notebook is expected to run from the "notebooks" folder; the folder one level
# up is made importable so the "stream_pose_ml" imports below can resolve.
working_directory = os.getcwd()
parent_directory = os.path.normpath(os.path.join(working_directory, os.pardir))

# Register the folder only once so re-running this cell does not grow sys.path.
if sys.path.count(parent_directory) == 0:
    sys.path.append(parent_directory)

import stream_pose_ml.jobs.process_videos_job as pv
import stream_pose_ml.jobs.build_and_format_dataset_job as data_builder 



### I/O File Locations

All paths below are relative to this notebook's directory, `./stream_pose_ml/notebooks`.

In [2]:
# Input: directory containing the source video files
source_videos_directory = "../../data/source_videos"

# Input: directory containing the Dataloop annotations corresponding to each video
source_annotations_directory = "../../data/source_annotations"

# Output: base directory for sequence data (each run writes into its own subfolder)
sequence_data_directory = "../../data/sequences"

# Output: base directory for keypoint data (each run writes into its own subfolder)
keypoints_data_directory = "../../data/keypoints"

# Output: directory the generated dataset CSV files are written to
merged_annotation_output_directory = "../../data/annotated_videos"

### Generate Keypoint and Sequence Data

In [3]:
# Name of the per-run subfolder shared by the keypoint and sequence outputs.
# To start a fresh run without overwriting earlier output, use a timestamped name:
# folder = f"run-preproccessed-{time.time_ns()}"
# The name below is pinned to a fixed run folder so the dataset cells further down
# can point at the same data through `sequence_path`.
folder = "run-preproccessed-1697924524790004000"
keypoints_path = f"{keypoints_data_directory}/{folder}"
sequence_path = f"{sequence_data_directory}/{folder}"

In [11]:
# Process the source videos, writing keypoint and sequence data into the run
# folders defined above.
video_job = pv.ProcessVideosJob()
video_job.process_videos(
    # input and output locations
    src_videos_path=source_videos_directory,
    output_sequence_data_path=sequence_path,
    output_keypoints_data_path=keypoints_path,
    # persist both kinds of output
    write_serialized_sequence_to_file=True,
    write_keypoints_to_file=True,
    # processing options
    preprocess_video=True,
    configuration={},
    limit=None,
    return_output=False,
)

Successfully wrote ../../data/sequences/run-preproccessed-1697924524790004000/IKF_8.27_1230pm_BW_Front_P5.webm_sequence.json.
1/86 completed: IKF_8.27_1230pm_BW_Front_P5.webm.
Successfully wrote ../../data/sequences/run-preproccessed-1697924524790004000/IKF_8.27_11am_BW_Front_P4.webm_sequence.json.
2/86 completed: IKF_8.27_11am_BW_Front_P4.webm.
Successfully wrote ../../data/sequences/run-preproccessed-1697924524790004000/IRC_9.11_2pm_BW_Front_P63.webm_sequence.json.
3/86 completed: IRC_9.11_2pm_BW_Front_P63.webm.
Successfully wrote ../../data/sequences/run-preproccessed-1697924524790004000/IKF_8.31_1030am_BW_Front_P26.webm_sequence.json.
4/86 completed: IKF_8.31_1030am_BW_Front_P26.webm.
Successfully wrote ../../data/sequences/run-preproccessed-1697924524790004000/IKF_8.31_12pm_BW_Front_P30.webm_sequence.json.
5/86 completed: IKF_8.31_12pm_BW_Front_P30.webm.
Successfully wrote ../../data/sequences/run-preproccessed-1697924524790004000/IKF_9.3_230pm_BW_Front_P47.webm_sequence.json.
6/8

[matroska,webm @ 0x56d484400] File ended prematurely


Successfully wrote ../../data/sequences/run-preproccessed-1697924524790004000/WLR_9.1.1pm_BW_Front_P42.webm_sequence.json.
65/86 completed: WLR_9.1.1pm_BW_Front_P42.webm.
Successfully wrote ../../data/sequences/run-preproccessed-1697924524790004000/IKF_8.27_230pm_BW_Front2_P10.webm_sequence.json.
66/86 completed: IKF_8.27_230pm_BW_Front2_P10.webm.
Successfully wrote ../../data/sequences/run-preproccessed-1697924524790004000/IKF_9.3_230pm_BW_Front_P49.webm_sequence.json.
67/86 completed: IKF_9.3_230pm_BW_Front_P49.webm.
Successfully wrote ../../data/sequences/run-preproccessed-1697924524790004000/IKF_8.27_11am_BW_Front3_P1.webm_sequence.json.
68/86 completed: IKF_8.27_11am_BW_Front3_P1.webm.
Successfully wrote ../../data/sequences/run-preproccessed-1697924524790004000/IKF_8.31_1030am_BW_Front_P28.webm_sequence.json.
69/86 completed: IKF_8.31_1030am_BW_Front_P28.webm.
Successfully wrote ../../data/sequences/run-preproccessed-1697924524790004000/IRC_9.11_2pm_BW_Front2_P62.webm_sequence.js

[matroska,webm @ 0x5b3551600] File ended prematurely at pos. 524288 (0x80000)


Successfully wrote ../../data/sequences/run-preproccessed-1697924524790004000/WLR_9.8_10am_BW_Front_P54.webm_sequence.json.
73/86 completed: WLR_9.8_10am_BW_Front_P54.webm.
Successfully wrote ../../data/sequences/run-preproccessed-1697924524790004000/WLR_9.1_230pm_BW_Front_P44.webm_sequence.json.
74/86 completed: WLR_9.1_230pm_BW_Front_P44.webm.
Successfully wrote ../../data/sequences/run-preproccessed-1697924524790004000/WLR_9.8_1130am_BW_Front_P40.webm_sequence.json.
75/86 completed: WLR_9.8_1130am_BW_Front_P40.webm.
Successfully wrote ../../data/sequences/run-preproccessed-1697924524790004000/IKF_8.27_11am_BW_Front2_P1.webm_sequence.json.
76/86 completed: IKF_8.27_11am_BW_Front2_P1.webm.
Successfully wrote ../../data/sequences/run-preproccessed-1697924524790004000/IRC_9.11_2pm_BW_Front2_P63.webm_sequence.json.
77/86 completed: IRC_9.11_2pm_BW_Front2_P63.webm.
Successfully wrote ../../data/sequences/run-preproccessed-1697924524790004000/IKF_8.27_230pm_BW_Front4_P9.webm_sequence.json.

{'keypoints_path': '../../data/keypoints/run-preproccessed-1697924524790004000',
 'sequence_path': '../../data/sequences/run-preproccessed-1697924524790004000'}

# Datasets from Keypoints + Sequence Data

### Generate Dataset

TODO enumerate all the options here... 


#### Labels
```
step_type
weight_transfer_type
```
#### Segmentation Strategy
```
flatten_into_columns
flatten_on_example
split_on_label
window
none
```


### Split on Step Type, pooled temporal dynamics with angles and distances

In [4]:
# Read sequences from the run folder defined in the "Generate Keypoint and Sequence Data"
# section. A cell-specific name is used on purpose: rebinding the base
# `sequence_data_directory` setting would make `sequence_path` resolve to a doubly
# nested folder if the run-folder cell were executed again.
# NOTE(review): the last recorded run of this cell stopped with
# KeyError: 'nose_neck_to_plumb_line.angle_2d_degrees' - presumably this run's sequence
# data lacks the plumb-line angle fields; confirm before relying on the output.
run_sequence_directory = sequence_path

db = data_builder.BuildAndFormatDatasetJob()
dataset = db.build_dataset_from_data_files(
    annotations_data_directory=source_annotations_directory,
    sequence_data_directory=run_sequence_directory,
    limit=None,
)

# Split on step type, pooling frame data per clip, with angles and distances included.
formatted_dataset = db.format_dataset(
    dataset=dataset,
    pool_frame_data_by_clip=True,
    decimal_precision=4,
    include_unlabeled_data=True,
    include_angles=True,
    include_distances=True,
    include_normalized=True,
    segmentation_strategy="split_on_label",
    segmentation_splitter_label="step_type",
    segmentation_window=None,
    segmentation_window_label="weight_transfer_type",
)

db.write_dataset_to_csv(
    csv_location=merged_annotation_output_directory,
    formatted_dataset=formatted_dataset,
    filename="pooled_angles_distances_full_examples"
)

KeyError for key: nose_neck_to_plumb_line.angle_2d_degrees
KeyError for key: neck_mid_hip_to_plumb_line.angle_2d_degrees
KeyError for key: neck_right_shoulder_to_plumb_line.angle_2d_degrees
KeyError for key: neck_left_shoulder_to_plumb_line.angle_2d_degrees
KeyError for key: right_shoulder_right_elbow_to_plumb_line.angle_2d_degrees
KeyError for key: right_elbow_right_wrist_to_plumb_line.angle_2d_degrees
KeyError for key: left_shoulder_left_elbow_to_plumb_line.angle_2d_degrees
KeyError for key: mid_hip_right_hip_to_plumb_line.angle_2d_degrees
KeyError for key: right_hip_right_knee_to_plumb_line.angle_2d_degrees
KeyError for key: right_knee_right_ankle_to_plumb_line.angle_2d_degrees
KeyError for key: mid_hip_left_hip_to_plumb_line.angle_2d_degrees
KeyError for key: left_hip_left_knee_to_plumb_line.angle_2d_degrees
KeyError for key: left_knee_left_ankle_to_plumb_line.angle_2d_degrees
KeyError for key: nose_right_eye_to_plumb_line.angle_2d_degrees
KeyError for key: right_eye_right_ear_to_p

KeyError: 'nose_neck_to_plumb_line.angle_2d_degrees'

### Split on Step Type, pooled temporal dynamics with angles and distances, only the last 10 frames

In [None]:
# Sequence run generated on 3/24/23 with new normalized values.
# Kept in a cell-specific name so the base `sequence_data_directory` setting (used to
# derive `sequence_path` in the run-folder cell) is not rebound by this cell.
sequence_run_directory = "../../data/sequences/run-1679706020787139000"

db = data_builder.BuildAndFormatDatasetJob()
dataset = db.build_dataset_from_data_files(
    annotations_data_directory=source_annotations_directory,
    sequence_data_directory=sequence_run_directory,
    limit=None,
)

# Split on step type with pooled frame data, using a segmentation window of 10.
formatted_dataset = db.format_dataset(
    dataset=dataset,
    pool_frame_data_by_clip=True,
    decimal_precision=4,
    include_unlabeled_data=True,
    include_angles=True,
    include_distances=True,
    include_normalized=True,
    segmentation_strategy="split_on_label",
    segmentation_splitter_label="step_type",
    segmentation_window=10,
    segmentation_window_label="weight_transfer_type",
)

db.write_dataset_to_csv(
    csv_location=merged_annotation_output_directory,
    formatted_dataset=formatted_dataset,
    filename="pooled_angles_distances_last_10_frames"
)

### Flatten columns over 10 frame window on step type (arbitrary start / end)

In [None]:
# Sequence run generated on 3/24/23 with new normalized values.
# Kept in a cell-specific name so the base `sequence_data_directory` setting (used to
# derive `sequence_path` in the run-folder cell) is not rebound by this cell.
sequence_run_directory = "../../data/sequences/run-1679706020787139000"

db = data_builder.BuildAndFormatDatasetJob()
dataset = db.build_dataset_from_data_files(
    annotations_data_directory=source_annotations_directory,
    sequence_data_directory=sequence_run_directory,
    limit=None,
)

# This cell was previously an exact copy of the pooled "last 10 frames" cell: it used
# the "split_on_label" strategy and wrote to the same filename, so it overwrote that
# dataset instead of producing the flattened one this section describes.
# NOTE(review): "flatten_into_columns" is taken from the "Segmentation Strategy" list
# in this notebook, and pooling is switched off to match the other flatten cells;
# confirm both are what this section intends.
formatted_dataset = db.format_dataset(
    dataset=dataset,
    pool_frame_data_by_clip=False,
    decimal_precision=4,
    include_unlabeled_data=True,
    include_angles=True,
    include_distances=True,
    include_normalized=True,
    segmentation_strategy="flatten_into_columns",
    segmentation_splitter_label="step_type",
    segmentation_window=10,
    segmentation_window_label="weight_transfer_type",
)

# Distinct filename so the pooled "last 10 frames" dataset is not overwritten.
db.write_dataset_to_csv(
    csv_location=merged_annotation_output_directory,
    formatted_dataset=formatted_dataset,
    filename="flatten_into_columns_10_frames"
)

### Flatten on a 10 frame window based on complete training examples (the end of the example will flatten the previous 10 frames into a training row)

In [8]:
# Sequence run generated on 3/24/23 with new normalized values.
# Kept in a cell-specific name so the base `sequence_data_directory` setting (used to
# derive `sequence_path` in the run-folder cell) is not rebound by this cell.
sequence_run_directory = "../../data/sequences/run-1679706020787139000"

db = data_builder.BuildAndFormatDatasetJob()
dataset = db.build_dataset_from_data_files(
    annotations_data_directory=source_annotations_directory,
    sequence_data_directory=sequence_run_directory,
    limit=None,
)

# Flatten on complete examples with a 10 frame window; frame data is not pooled.
formatted_dataset = db.format_dataset(
    dataset=dataset,
    pool_frame_data_by_clip=False,
    decimal_precision=4,
    include_unlabeled_data=True,
    include_angles=True,
    include_distances=True,
    include_normalized=True,
    segmentation_strategy="flatten_on_example",
    segmentation_splitter_label="step_type",
    segmentation_window=10,
    segmentation_window_label="weight_transfer_type",
)

db.write_dataset_to_csv(
    csv_location=merged_annotation_output_directory,
    formatted_dataset=formatted_dataset,
    filename="flatten_on_example_10_frames_2"
)

True

### Flatten on a 25 frame window based on complete training examples (the end of the example will flatten the previous 25 frames into a training row)

In [None]:
# Preprocessed sequence run used for this dataset.
# (Alternative: the run generated on 3/24/23 with new normalized values,
# "../../data/sequences/run-1679706020787139000".)
# Kept in a cell-specific name so the base `sequence_data_directory` setting (used to
# derive `sequence_path` in the run-folder cell) is not rebound by this cell.
sequence_run_directory = "../../data/sequences/run-preproccessed-1680117203184086000"

db = data_builder.BuildAndFormatDatasetJob()
dataset = db.build_dataset_from_data_files(
    annotations_data_directory=source_annotations_directory,
    sequence_data_directory=sequence_run_directory,
    limit=None,
)

# Flatten on complete examples with a 25 frame window; frame data is not pooled.
formatted_dataset = db.format_dataset(
    dataset=dataset,
    pool_frame_data_by_clip=False,
    decimal_precision=4,
    include_unlabeled_data=True,
    include_angles=True,
    include_distances=True,
    include_normalized=True,
    segmentation_strategy="flatten_on_example",
    segmentation_splitter_label="step_type",
    segmentation_window=25,
    segmentation_window_label="weight_transfer_type",
)

db.write_dataset_to_csv(
    csv_location=merged_annotation_output_directory,
    formatted_dataset=formatted_dataset,
    filename="preprocessed_flatten_on_example_25_frames_2"
)

### Dataset with all frames as rows

In [None]:
# Preprocessed sequence run used for this dataset. Kept in a cell-specific name so the
# base `sequence_data_directory` setting (used to derive `sequence_path` in the
# run-folder cell) is not rebound by this cell.
sequence_run_directory = "../../data/sequences/run-preproccessed-1680117203184086000"

db = data_builder.BuildAndFormatDatasetJob()
dataset = db.build_dataset_from_data_files(
    annotations_data_directory=source_annotations_directory,
    sequence_data_directory=sequence_run_directory,
    limit=None,
)

# No segmentation: every frame becomes a row, with angles and distances included.
formatted_dataset = db.format_dataset(
    dataset=dataset,
    pool_frame_data_by_clip=False,
    decimal_precision=4,
    include_unlabeled_data=True,
    include_angles=True,
    include_distances=True,
    include_normalized=True,
    segmentation_strategy="none",
    segmentation_splitter_label="step_type",
    segmentation_window=25,
    segmentation_window_label="weight_transfer_type",
)

# Formatting options are given to format_dataset above. The writer is called with the
# same three arguments as in every other cell of this notebook; the stray formatting
# keywords that were passed here (with values contradicting the ones above) are removed.
# The filename carries no ".csv" suffix, matching the other cells.
# NOTE(review): assumes the writer adds the extension itself - confirm.
db.write_dataset_to_csv(
    csv_location=merged_annotation_output_directory,
    formatted_dataset=formatted_dataset,
    filename="preprocessed_all_rows-5-9-23",
)

### Dataset with raw x, y, z joint data, with one frame per row

In [3]:
# Preprocessed sequence run used for this dataset. Kept in a cell-specific name so the
# base `sequence_data_directory` setting (used to derive `sequence_path` in the
# run-folder cell) is not rebound by this cell.
sequence_run_directory = "../../data/sequences/run-preproccessed-1680117203184086000"

db = data_builder.BuildAndFormatDatasetJob()
dataset = db.build_dataset_from_data_files(
    annotations_data_directory=source_annotations_directory,
    sequence_data_directory=sequence_run_directory,
    limit=None,
)

# Raw joint data (including the z axis) with no segmentation; angles, distances and
# normalized values are switched off.
formatted_dataset = db.format_dataset(
    dataset=dataset,
    pool_frame_data_by_clip=False,
    decimal_precision=4,
    include_unlabeled_data=True,
    include_joints=True,
    include_z_axis=True,
    include_angles=False,
    include_distances=False,
    include_normalized=False,
    segmentation_strategy="none",
)

db.write_dataset_to_csv(
    csv_location=merged_annotation_output_directory,
    formatted_dataset=formatted_dataset,
    filename="preprocessed_frame_joint_data-6-19-23"
)

True