# Build Datasets

### Handle imports

In [1]:
import sys
import os
import time
# Reloader
import importlib

# The job modules imported below come from the "pose_parser" package. Assuming the kernel's
# working directory is the "notebooks" folder, that package is found one level up, so the
# parent directory is registered on the import path (only once, even if this cell is re-run).
parent_directory = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

already_importable = parent_directory in sys.path
if not already_importable:
    sys.path.append(parent_directory)

import pose_parser.jobs.process_videos_job as pv
import pose_parser.jobs.build_and_format_dataset_job as data_builder 



2023-03-24 19:21:11.600262: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### I/O File Locations

All paths below are relative to this notebook's directory, `./pose_parser/notebooks`.

In [2]:
# ---- Inputs ----

# Directory containing the video files
source_videos_directory = "../../source_videos"

# Directory containing the Dataloop annotations corresponding to each video
source_annotations_directory = "../../source_annotations"

# ---- Outputs ----

# Directory that receives keypoint data
keypoints_data_directory = "../../data/keypoints"

# Directory that receives sequence data
sequence_data_directory = "../../data/sequences"

# Directory that receives the datasets
merged_annotation_output_directory = "../../data/annotated_videos"

### Generate Keypoint and Sequence Data

In [6]:
# Pick up any edits made to the job module since it was first imported.
importlib.reload(pv)

# Every run writes into its own nanosecond-stamped folder so earlier output is never overwritten.
folder = f"run-{time.time_ns()}"
sequence_path = f"{sequence_data_directory}/{folder}"
keypoints_path = f"{keypoints_data_directory}/{folder}"

# Keyword arguments for the job, collected in one place so they are easy to scan and tweak.
process_videos_options = {
    "src_videos_path": source_videos_directory,
    "output_keypoints_data_path": keypoints_path,
    "output_sequence_data_path": sequence_path,
    "write_keypoints_to_file": True,
    "write_serialized_sequence_to_file": True,
    "limit": None,
    "configuration": {},
    "preprocess_video": True,
    "return_output": False,
}

# The call is the cell's last expression, so whatever it returns is shown as the cell output.
video_job = pv.ProcessVideosJob()
video_job.process_videos(**process_videos_options)

Successfully wrote ../../data/sequences/run-1679706020787139000/IKF_8.27_1230pm_BW_Front_P5.webm_sequence.json.
1/86 completed: IKF_8.27_1230pm_BW_Front_P5.webm.
Successfully wrote ../../data/sequences/run-1679706020787139000/IKF_8.27_11am_BW_Front_P4.webm_sequence.json.
2/86 completed: IKF_8.27_11am_BW_Front_P4.webm.
Successfully wrote ../../data/sequences/run-1679706020787139000/IRC_9.11_2pm_BW_Front_P63.webm_sequence.json.
3/86 completed: IRC_9.11_2pm_BW_Front_P63.webm.
Successfully wrote ../../data/sequences/run-1679706020787139000/IKF_8.31_1030am_BW_Front_P26.webm_sequence.json.
4/86 completed: IKF_8.31_1030am_BW_Front_P26.webm.
Successfully wrote ../../data/sequences/run-1679706020787139000/IKF_8.31_12pm_BW_Front_P30.webm_sequence.json.
5/86 completed: IKF_8.31_12pm_BW_Front_P30.webm.
Successfully wrote ../../data/sequences/run-1679706020787139000/IKF_9.3_230pm_BW_Front_P47.webm_sequence.json.
6/86 completed: IKF_9.3_230pm_BW_Front_P47.webm.
Successfully wrote ../../data/sequence

[matroska,webm @ 0x7f7ecba35dc0] File ended prematurely


Successfully wrote ../../data/sequences/run-1679706020787139000/WLR_9.1.1pm_BW_Front_P42.webm_sequence.json.
65/86 completed: WLR_9.1.1pm_BW_Front_P42.webm.
Successfully wrote ../../data/sequences/run-1679706020787139000/IKF_8.27_230pm_BW_Front2_P10.webm_sequence.json.
66/86 completed: IKF_8.27_230pm_BW_Front2_P10.webm.
Successfully wrote ../../data/sequences/run-1679706020787139000/IKF_9.3_230pm_BW_Front_P49.webm_sequence.json.
67/86 completed: IKF_9.3_230pm_BW_Front_P49.webm.
Successfully wrote ../../data/sequences/run-1679706020787139000/IKF_8.27_11am_BW_Front3_P1.webm_sequence.json.
68/86 completed: IKF_8.27_11am_BW_Front3_P1.webm.
Successfully wrote ../../data/sequences/run-1679706020787139000/IKF_8.31_1030am_BW_Front_P28.webm_sequence.json.
69/86 completed: IKF_8.31_1030am_BW_Front_P28.webm.
Successfully wrote ../../data/sequences/run-1679706020787139000/IRC_9.11_2pm_BW_Front2_P62.webm_sequence.json.
70/86 completed: IRC_9.11_2pm_BW_Front2_P62.webm.
Successfully wrote ../../data/

[matroska,webm @ 0x7f7e5a233180] File ended prematurely at pos. 524288 (0x80000)


Successfully wrote ../../data/sequences/run-1679706020787139000/WLR_9.8_10am_BW_Front_P54.webm_sequence.json.
73/86 completed: WLR_9.8_10am_BW_Front_P54.webm.
Successfully wrote ../../data/sequences/run-1679706020787139000/WLR_9.1_230pm_BW_Front_P44.webm_sequence.json.
74/86 completed: WLR_9.1_230pm_BW_Front_P44.webm.
Successfully wrote ../../data/sequences/run-1679706020787139000/WLR_9.8_1130am_BW_Front_P40.webm_sequence.json.
75/86 completed: WLR_9.8_1130am_BW_Front_P40.webm.
Successfully wrote ../../data/sequences/run-1679706020787139000/IKF_8.27_11am_BW_Front2_P1.webm_sequence.json.
76/86 completed: IKF_8.27_11am_BW_Front2_P1.webm.
Successfully wrote ../../data/sequences/run-1679706020787139000/IRC_9.11_2pm_BW_Front2_P63.webm_sequence.json.
77/86 completed: IRC_9.11_2pm_BW_Front2_P63.webm.
Successfully wrote ../../data/sequences/run-1679706020787139000/IKF_8.27_230pm_BW_Front4_P9.webm_sequence.json.
78/86 completed: IKF_8.27_230pm_BW_Front4_P9.webm.
Successfully wrote ../../data/se

{'keypoints_path': '../../data/keypoints/run-1679706020787139000',
 'sequence_path': '../../data/sequences/run-1679706020787139000'}

### Generate Dataset

**TODO:** Enumerate all of the dataset-formatting options here.


#### Labels
```
step_type
weight_transfer_type
```
#### Segmentation Strategy
```
flatten_into_columns
split_on_label
window
none
```


In [3]:
# Pick up any edits made to the dataset-builder module since it was first imported.
importlib.reload(data_builder)

# Sequences generated on 3/24/23 with the new normalized values.
# The pinned run gets its own name on purpose: rebinding `sequence_data_directory` here would
# change the base directory that the "Generate Keypoint and Sequence Data" cell builds its run
# folder from, so re-running that cell afterwards would nest the new run inside this old one.
sequence_run_folder = "run-1679706020787139000"
sequence_run_directory = f"{sequence_data_directory}/{sequence_run_folder}"

db = data_builder.BuildAndFormatDatasetJob()

# Step 1: build the raw dataset from the annotation files and the serialized sequences.
dataset = db.build_dataset_from_data_files(
    annotations_data_directory=source_annotations_directory,
    sequence_data_directory=sequence_run_directory,
    limit=None,
)

# Step 2: format the dataset. The labels and segmentation strategies are listed in the
# markdown above this cell.
# NOTE(review): segmentation_window / segmentation_window_label are passed even though the
# strategy is "split_on_label" — presumably they only matter for the "window" strategy; confirm.
formatted_dataset = db.format_dataset(
    dataset=dataset,
    pool_frame_data_by_clip=True,
    decimal_precision=4,
    include_unlabeled_data=True,
    include_angles=True,
    include_distances=True,
    include_normalized=True,
    segmentation_strategy="split_on_label",
    segmentation_splitter_label="step_type",
    segmentation_window=10,
    segmentation_window_label="weight_transfer_type",
)

# Step 3: write the formatted dataset as CSV. The call is the cell's last expression, so its
# return value is shown as the cell output.
db.write_dataset_to_csv(
    csv_location=merged_annotation_output_directory,
    formatted_dataset=formatted_dataset,
)

True