# 01 – Data Preparation: Cutting and Labeling Penalty Videos

In this notebook, we extract individual penalty kick videos from long match footage (e.g. 'all Ronaldo penalties') using a timecode file and generate labeled clips like `penalty_ronaldo_12_g.mp4`, `penalty_messi_03_d.mp4`, etc.

Input: all videos in the `not treated/` folder (named like `penaltys_ronaldo.mp4`, `penaltys_messi.mp4`, etc.)
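Timecodes: one `.txt` file per video, in the same folder and with the same base name (e.g. `penaltys_ronaldo.txt`); each line is `start end direction`, with timecodes written as `HH:MM:SS:mmm`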
Output: clips saved in `treated/` with standardized names, plus a `penalty_labels.csv` file listing each clip and its label

In [None]:
import os
import subprocess
import pandas as pd

# Base paths.
# The dataset root can be overridden with the PENALTY_DATASET_DIR environment
# variable so the notebook runs on other machines; when the variable is not
# set, the original hard-coded location is used.
base_path = os.environ.get(
    "PENALTY_DATASET_DIR",
    r"D:/malo/Documents/projets/penalty_prediction/penalty_dataset",
)
# Folder holding the long source videos and their timecode files.
not_treated_dir = os.path.join(base_path, "not treated")
# Folder receiving the extracted clips; created up front if missing.
treated_dir = os.path.join(base_path, "treated")
os.makedirs(treated_dir, exist_ok=True)

In [None]:
def read_timecodes(txt_file_path):
    """Parse a timecode file into a list of ``(start, end, direction)`` tuples.

    A line is kept only when it splits into exactly three
    whitespace-separated fields and its first field contains exactly three
    colons. Every other line (blank lines, notes, headers) is silently
    skipped.

    The file is decoded as UTF-8 with an optional byte-order mark, so a BOM
    written by a text editor does not end up glued to the first timecode.
    Undecodable bytes are replaced instead of raising, since only the plain
    timecode fields matter here.

    Args:
        txt_file_path: Path of the text file to read.

    Returns:
        A list of ``(start, end, direction)`` string tuples, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    timecodes = []
    with open(txt_file_path, "r", encoding="utf-8-sig", errors="replace") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) == 3 and parts[0].count(":") == 3:
                start, end, direction = parts
                timecodes.append((start, end, direction))
    return timecodes

def timecode_to_seconds(tc):
    """Convert a colon-separated ``H:M:S:ms`` timecode into seconds.

    The string is split on ``":"`` into four integer fields: hours, minutes,
    seconds and milliseconds. The milliseconds field is divided by 1000 and
    added to the whole-second total.

    Args:
        tc: Timecode string with exactly four integer fields.

    Returns:
        The position in seconds, as a float.

    Raises:
        ValueError: If the string does not hold exactly four integer fields.
    """
    hours, minutes, seconds, millis = (int(field) for field in tc.split(":"))
    whole_seconds = hours * 3600 + minutes * 60 + seconds
    return whole_seconds + millis / 1000

In [None]:
# Process every long video found in not_treated_dir: cut one clip per
# timecode entry with ffmpeg and record a (filename, label) row per clip.
VIDEO_PREFIX = "penaltys_"
VIDEO_EXT = ".mp4"
# Offsets (in seconds) added to the start and end timecodes before cutting.
# NOTE(review): presumably compensates for how the timecodes were marked — confirm.
START_OFFSET_SEC = 0.2
END_OFFSET_SEC = 0.8

all_labels = []

# sorted() makes the processing order (and the CSV row order) deterministic;
# os.listdir() returns entries in arbitrary order.
for video_file in sorted(os.listdir(not_treated_dir)):
    if not (video_file.startswith(VIDEO_PREFIX) and video_file.endswith(VIDEO_EXT)):
        continue

    video_path = os.path.join(not_treated_dir, video_file)
    # Strip the prefix/extension by position rather than with str.replace(),
    # which would also rewrite occurrences in the middle of the name or in
    # the directory part of the path.
    stem = video_file[:-len(VIDEO_EXT)]
    player = stem[len(VIDEO_PREFIX):]
    txt_path = os.path.join(not_treated_dir, stem + ".txt")

    if not os.path.exists(txt_path):
        print(f"Pas de fichier timecode pour {video_file}, ignoré.")
        continue

    timecodes = read_timecodes(txt_path)
    print(f"🎞️  {len(timecodes)} penaltys trouvés pour {player}")

    for idx, (start, end, direction) in enumerate(timecodes):
        start_sec = timecode_to_seconds(start) + START_OFFSET_SEC
        end_sec = timecode_to_seconds(end) + END_OFFSET_SEC

        output_filename = f"penalty_{player}_{idx+1:02d}_{direction}.mp4"
        output_path = os.path.join(treated_dir, output_filename)

        cmd = [
            "ffmpeg", "-i", video_path,
            # Fixed-point formatting avoids float repr noise in the arguments.
            "-ss", f"{start_sec:.3f}", "-to", f"{end_sec:.3f}",
            "-c:v", "libx264", "-c:a", "aac", "-strict", "experimental",
            "-y", output_path
        ]
        # Keep stderr so a failure can be reported instead of silently lost.
        result = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)

        if result.returncode != 0:
            # Do not label a clip that ffmpeg failed to produce; otherwise the
            # CSV would reference a missing or corrupt file.
            stderr_lines = result.stderr.decode(errors="replace").strip().splitlines()
            reason = stderr_lines[-1] if stderr_lines else "erreur inconnue"
            print(f"Échec ffmpeg pour {output_filename} (code {result.returncode}) : {reason}")
            continue

        all_labels.append({"filename": output_filename, "label": direction})

In [None]:
# Save the labels as a CSV file.
# Explicit columns keep the header row even when no clip was produced, so the
# resulting file can still be read back with pd.read_csv.
df = pd.DataFrame(all_labels, columns=["filename", "label"])
csv_path = os.path.join(base_path, "penalty_labels.csv")
df.to_csv(csv_path, index=False)
print("CSV des labels généré :", csv_path)
# Last expression of the cell: show a preview of the label table.
df.head()