# 03 – Build Dataset for LSTM Training

In this notebook, we load the keypoints extracted by YOLOv7-Pose from each frame of each penalty video, aggregate them into one temporal sequence per video, and assign each sequence a class label (`g`/`m`/`d`).

 Input: keypoint `.txt` files from `yolov7/runs/pose/<video_name>/labels/`

 Labels: from `penalty_labels.csv`

 Output: NumPy arrays (X, y) ready for LSTM training

In [None]:
import os
import numpy as np
import pandas as pd
from pathlib import Path

# Paths. The defaults are the original local layout; set the
# PENALTY_YOLO_RUNS_DIR / PENALTY_LABELS_CSV environment variables to run the
# notebook on another machine without editing this cell.
yolo_runs_dir = os.environ.get(
    "PENALTY_YOLO_RUNS_DIR",
    r"D:/malo/Documents/yolov7/runs/pose",
)
labels_csv = os.environ.get(
    "PENALTY_LABELS_CSV",
    r"D:/malo/Documents/projets/penalty_prediction/penalty_dataset/penalty_labels.csv",
)

# Load the labels and fail early if a column indexed by the later cells
# ('filename', 'label') is absent, instead of failing mid-loop.
df_labels = pd.read_csv(labels_csv)
missing_columns = {"filename", "label"} - set(df_labels.columns)
if missing_columns:
    raise KeyError(f"Colonnes manquantes dans {labels_csv} : {sorted(missing_columns)}")
df_labels.head()

In [None]:
def _frame_sort_key(txt_path):
    """Return a sort key that orders label files by their trailing frame number."""
    stem = Path(txt_path).stem
    prefix = stem.rstrip("0123456789")
    digits = stem[len(prefix):]
    # Files without a numeric suffix fall back to plain name ordering.
    return (prefix, int(digits)) if digits else (stem, -1)


def load_keypoints_from_folder(label_folder):
    """Load the per-frame keypoints of one video from its ``labels`` folder.

    Each ``*.txt`` file is treated as one frame. Only the first line of a file
    is read, and its first whitespace-separated token (the class id) is
    dropped; the remaining tokens are parsed as floats.
    NOTE(review): when several detections are written per frame, only the
    first one is kept — confirm that it is the intended subject.

    Files are ordered by the integer at the end of their name rather than
    lexicographically, so that frame indices without zero padding
    (``clip_2.txt`` vs ``clip_10.txt``) stay in temporal order.

    Args:
        label_folder: Path (str or Path) of the folder holding the ``.txt`` files.

    Returns:
        np.ndarray of shape (timesteps, features); an empty array when no
        frame yields any value.

    Raises:
        ValueError: If a token on a first line cannot be parsed as a float.
    """
    keypoints_seq = []
    txt_files = sorted(Path(label_folder).glob("*.txt"), key=_frame_sort_key)

    for txt_file in txt_files:
        with open(txt_file, 'r') as f:
            first_line = f.readline()

        values = first_line.split()[1:]  # skip the class id
        if not values:
            # Empty file or blank first line: no detection for this frame.
            # Appending an empty row would make the final array ragged.
            continue
        keypoints_seq.append([float(v) for v in values])

    return np.array(keypoints_seq)  # shape: (timesteps, features)

In [None]:
# Build the full dataset: one keypoint sequence and one label per video.
X_list, y_list = [], []

for _, row in df_labels.iterrows():
    filename = row['filename'].replace(".mp4", "")
    label = row['label']
    label_folder = os.path.join(yolo_runs_dir, filename, "labels")

    # isdir (not exists): a stray file named "labels" is not a usable folder.
    if not os.path.isdir(label_folder):
        print(f"Dossier manquant pour {filename}, ignoré.")
        continue

    seq = load_keypoints_from_folder(label_folder)
    if len(seq) == 0:
        # Report the drop so a video never disappears from the dataset silently.
        print(f"Aucun keypoint trouvé pour {filename}, ignoré.")
        continue

    X_list.append(seq)
    y_list.append(label)

print(f"{len(X_list)} séquences chargées sur {len(df_labels)} vidéos listées.")

In [None]:
# Bring every sequence to the length of the longest one (padding appended at
# the end) and encode the class labels as integers.
from tensorflow.keras.preprocessing.sequence import pad_sequences

label_map = {'g': 0, 'm': 1, 'd': 2}

# Fail with an explicit message instead of a cryptic reduction error raised
# from inside pad_sequences when nothing was loaded.
if not X_list:
    raise ValueError("Aucune séquence chargée : impossible de construire le dataset.")

# Surface every unexpected label at once rather than a bare KeyError on the first.
unknown_labels = sorted({str(lbl) for lbl in y_list if lbl not in label_map})
if unknown_labels:
    raise KeyError(f"Labels inconnus (attendus : {list(label_map)}) : {unknown_labels}")

X = pad_sequences(X_list, padding='post', dtype='float32')
y = np.array([label_map[lbl] for lbl in y_list])

print("Dataset prêt :")
print("X shape:", X.shape)  # (n_samples, timesteps, features)
print("y shape:", y.shape)