# Build Datasets

### Handle imports

In [1]:
# Standard library. importlib is used by later cells to hot-reload the job modules.
import importlib
import os
import sys
import time

# The "pose_parser" package is resolved from the directory one level above the
# current working directory (the notebook is expected to run from "notebooks").
parent_directory = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Register that directory on the import path only once, so re-running this
# cell does not append duplicates.
if parent_directory not in sys.path:
    sys.path.append(parent_directory)

# Project job modules, aliased for the cells below.
import pose_parser.jobs.process_videos_job as pv
import pose_parser.jobs.build_and_format_dataset_job as data_builder



2023-06-19 13:09:07.330066: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### I/O File Locations

All paths below are relative to this notebook's directory, `./pose_parser/notebooks`.

In [2]:
# --- Inputs ---------------------------------------------------------------

# Directory holding the source video files.
source_videos_directory = "../../source_videos"

# Directory holding the Dataloop annotations that correspond to each video.
source_annotations_directory = "../../source_annotations"

# --- Outputs --------------------------------------------------------------

# Directory that keypoint data is written to.
keypoints_data_directory = "../../data/keypoints"

# Directory that sequence data is written to.
sequence_data_directory = "../../data/sequences"

# Directory that the generated dataset CSVs are written to.
merged_annotation_output_directory = "../../data/annotated_videos"

### Generate Keypoint and Sequence Data

In [None]:
# Re-import the job module so recent edits to it are picked up without a kernel restart.
importlib.reload(pv)

# A nanosecond timestamp in the folder name keeps earlier runs from being overwritten.
folder = f"run-preproccessed-{time.time_ns()}"
sequence_path = f"{sequence_data_directory}/{folder}"
keypoints_path = f"{keypoints_data_directory}/{folder}"

pv.ProcessVideosJob().process_videos(
    # Input videos and processing options.
    src_videos_path=source_videos_directory,
    limit=None,
    configuration={},
    preprocess_video=True,
    # Output locations, and which artifacts get written to them.
    output_keypoints_data_path=keypoints_path,
    write_keypoints_to_file=True,
    output_sequence_data_path=sequence_path,
    write_serialized_sequence_to_file=True,
    # Results are written to disk; nothing is requested back in memory.
    return_output=False,
)

# Datasets from Keypoints + Sequence Data

### Generate Dataset

TODO enumerate all the options here... 


#### Labels
```
step_type
weight_transfer_type
```
#### Segmentation Strategy
```
flatten_into_columns
flatten_on_example
split_on_label
window
none
```


### Split on Step Type, pooled temporal dynamics with angles and distances

In [None]:
# Re-import the job module so recent edits to it are picked up.
importlib.reload(data_builder)

# This cell reads whatever `sequence_data_directory` currently holds; it does
# not set it. The sequence run generated on 3/24/23 with new normalized values
# lives at "../../data/sequences/run-1679706020787139000".

db = data_builder.BuildAndFormatDatasetJob()

# Build the dataset from the annotation and sequence directories.
dataset = db.build_dataset_from_data_files(
    sequence_data_directory=sequence_data_directory,
    annotations_data_directory=source_annotations_directory,
    limit=None,
)

formatted_dataset = db.format_dataset(
    dataset=dataset,
    # Segmentation: split on the step_type label, no frame window.
    segmentation_strategy="split_on_label",
    segmentation_splitter_label="step_type",
    segmentation_window=None,
    segmentation_window_label="weight_transfer_type",
    # Row shaping.
    pool_frame_data_by_clip=True,
    include_unlabeled_data=True,
    decimal_precision=4,
    # Feature groups to include.
    include_angles=True,
    include_distances=True,
    include_normalized=True,
)

# Kept as the last expression so the writer's return value is shown as cell output.
db.write_dataset_to_csv(
    formatted_dataset=formatted_dataset,
    csv_location=merged_annotation_output_directory,
    filename="pooled_angles_distances_full_examples",
)

### Split on Step Type, pooled temporal dynamics with angles and distances, only the last 10 frames

In [None]:
# Re-import the job module so recent edits to it are picked up.
importlib.reload(data_builder)

# Sequence run generated on 3/24/23 with new normalized values.
sequence_data_directory = "../../data/sequences/run-1679706020787139000"

db = data_builder.BuildAndFormatDatasetJob()

# Build the dataset from the annotation and sequence directories.
dataset = db.build_dataset_from_data_files(
    sequence_data_directory=sequence_data_directory,
    annotations_data_directory=source_annotations_directory,
    limit=None,
)

formatted_dataset = db.format_dataset(
    dataset=dataset,
    # Segmentation: split on the step_type label with a window of 10.
    segmentation_strategy="split_on_label",
    segmentation_splitter_label="step_type",
    segmentation_window=10,
    segmentation_window_label="weight_transfer_type",
    # Row shaping.
    pool_frame_data_by_clip=True,
    include_unlabeled_data=True,
    decimal_precision=4,
    # Feature groups to include.
    include_angles=True,
    include_distances=True,
    include_normalized=True,
)

# Kept as the last expression so the writer's return value is shown as cell output.
db.write_dataset_to_csv(
    formatted_dataset=formatted_dataset,
    csv_location=merged_annotation_output_directory,
    filename="pooled_angles_distances_last_10_frames",
)

### Flatten columns over 10 frame window on step type (arbitrary start / end)

In [None]:
# Re-import the job module so recent edits to it are picked up.
importlib.reload(data_builder)

# Sequence run generated on 3/24/23 with new normalized values.
sequence_data_directory = "../../data/sequences/run-1679706020787139000"

db = data_builder.BuildAndFormatDatasetJob()

# Build the dataset from the annotation and sequence directories.
dataset = db.build_dataset_from_data_files(
    annotations_data_directory=source_annotations_directory,
    sequence_data_directory=sequence_data_directory,
    limit=None,
)

# This cell used to be an exact copy of the "last 10 frames" pooled cell
# (split_on_label + pooling + the same output filename), so it never produced
# the flattened dataset its heading describes and silently overwrote the
# pooled CSV. It now uses the "flatten_into_columns" strategy listed under
# "Segmentation Strategy" above.
formatted_dataset = db.format_dataset(
    dataset=dataset,
    # NOTE(review): pooling is switched off to match the other flatten cells
    # in this notebook - confirm this is what flatten_into_columns expects.
    pool_frame_data_by_clip=False,
    decimal_precision=4,
    include_unlabeled_data=True,
    include_angles=True,
    include_distances=True,
    include_normalized=True,
    segmentation_strategy="flatten_into_columns",
    segmentation_splitter_label="step_type",
    segmentation_window=10,
    segmentation_window_label="weight_transfer_type",
)

# Distinct filename so this output no longer clobbers
# "pooled_angles_distances_last_10_frames".
db.write_dataset_to_csv(
    csv_location=merged_annotation_output_directory,
    formatted_dataset=formatted_dataset,
    filename="flatten_into_columns_10_frames",
)

### Flatten on a 10 frame window based on complete training examples (the end of the example will flatten the previous 10 frames into a training row)

In [None]:
# Re-import the job module so recent edits to it are picked up.
importlib.reload(data_builder)

# Sequence run generated on 3/24/23 with new normalized values.
sequence_data_directory = "../../data/sequences/run-1679706020787139000"

db = data_builder.BuildAndFormatDatasetJob()

# Build the dataset from the annotation and sequence directories.
dataset = db.build_dataset_from_data_files(
    sequence_data_directory=sequence_data_directory,
    annotations_data_directory=source_annotations_directory,
    limit=None,
)

formatted_dataset = db.format_dataset(
    dataset=dataset,
    # Segmentation: flatten on each example with a window of 10.
    segmentation_strategy="flatten_on_example",
    segmentation_splitter_label="step_type",
    segmentation_window=10,
    segmentation_window_label="weight_transfer_type",
    # Row shaping: no pooling by clip.
    pool_frame_data_by_clip=False,
    include_unlabeled_data=True,
    decimal_precision=4,
    # Feature groups to include.
    include_angles=True,
    include_distances=True,
    include_normalized=True,
)

# Kept as the last expression so the writer's return value is shown as cell output.
db.write_dataset_to_csv(
    formatted_dataset=formatted_dataset,
    csv_location=merged_annotation_output_directory,
    filename="flatten_on_example_10_frames_2",
)

### Flatten on a 25 frame window based on complete training examples (the end of the example will flatten the previous 25 frames into a training row)

In [None]:
# Re-import the job module so recent edits to it are picked up.
importlib.reload(data_builder)

# Uses the preprocessed sequence run. The earlier 3/24/23 run with new
# normalized values is "../../data/sequences/run-1679706020787139000".
sequence_data_directory = "../../data/sequences/run-preproccessed-1680117203184086000"

db = data_builder.BuildAndFormatDatasetJob()

# Build the dataset from the annotation and sequence directories.
dataset = db.build_dataset_from_data_files(
    sequence_data_directory=sequence_data_directory,
    annotations_data_directory=source_annotations_directory,
    limit=None,
)

formatted_dataset = db.format_dataset(
    dataset=dataset,
    # Segmentation: flatten on each example with a window of 25.
    segmentation_strategy="flatten_on_example",
    segmentation_splitter_label="step_type",
    segmentation_window=25,
    segmentation_window_label="weight_transfer_type",
    # Row shaping: no pooling by clip.
    pool_frame_data_by_clip=False,
    include_unlabeled_data=True,
    decimal_precision=4,
    # Feature groups to include.
    include_angles=True,
    include_distances=True,
    include_normalized=True,
)

# Kept as the last expression so the writer's return value is shown as cell output.
db.write_dataset_to_csv(
    formatted_dataset=formatted_dataset,
    csv_location=merged_annotation_output_directory,
    filename="preprocessed_flatten_on_example_25_frames_2",
)

### Dataset with all frames as rows

In [None]:
# Preprocessed sequence run used as the input for this dataset.
sequence_data_directory = "../../data/sequences/run-preproccessed-1680117203184086000"

db = data_builder.BuildAndFormatDatasetJob()

# Build the dataset from the annotation and sequence directories.
dataset = db.build_dataset_from_data_files(
    annotations_data_directory=source_annotations_directory,
    sequence_data_directory=sequence_data_directory,
    limit=None,
)

# segmentation_strategy="none" with pooling off: the heading describes this as
# one row per frame. The splitter/window arguments are passed through unchanged.
formatted_dataset = db.format_dataset(
    dataset=dataset,
    pool_frame_data_by_clip=False,
    decimal_precision=4,
    include_unlabeled_data=True,
    include_angles=True,
    include_distances=True,
    include_normalized=True,
    segmentation_strategy="none",
    segmentation_splitter_label="step_type",
    segmentation_window=25,
    segmentation_window_label="weight_transfer_type",
)

# The writer is called with only csv_location / formatted_dataset / filename,
# like every other cell in this notebook. The formatting options that were
# previously pasted into this call (pool_frame_data_by_clip, include_*,
# segmentation_strategy, ...) belong to format_dataset, contradicted the
# values used above, and would be rejected as unexpected keyword arguments.
# NOTE(review): this is the only filename that carries a ".csv" suffix; if the
# writer appends the extension itself the file ends up as "*.csv.csv" - confirm.
db.write_dataset_to_csv(
    csv_location=merged_annotation_output_directory,
    formatted_dataset=formatted_dataset,
    filename="preprocessed_all_rows-5-9-23.csv",
)

### Dataset with raw x, y, z joint data, with one frame per row

In [3]:
# Re-import the job module so recent edits to it are picked up.
importlib.reload(data_builder)

# Preprocessed sequence run used as the input for this dataset.
sequence_data_directory = "../../data/sequences/run-preproccessed-1680117203184086000"

db = data_builder.BuildAndFormatDatasetJob()

# Build the dataset from the annotation and sequence directories.
dataset = db.build_dataset_from_data_files(
    sequence_data_directory=sequence_data_directory,
    annotations_data_directory=source_annotations_directory,
    limit=None,
)

formatted_dataset = db.format_dataset(
    dataset=dataset,
    # No segmentation and no pooling by clip.
    segmentation_strategy="none",
    pool_frame_data_by_clip=False,
    include_unlabeled_data=True,
    decimal_precision=4,
    # Joint data including the z axis is switched on...
    include_joints=True,
    include_z_axis=True,
    # ...and the angle, distance and normalized feature groups are switched off.
    include_angles=False,
    include_distances=False,
    include_normalized=False,
)

# Kept as the last expression so the writer's return value is shown as cell output.
db.write_dataset_to_csv(
    formatted_dataset=formatted_dataset,
    csv_location=merged_annotation_output_directory,
    filename="preprocessed_frame_joint_data-6-19-23",
)

True