In [1]:
import h5py
import random


In [2]:

def make_small_h5(input_path, output_path, sample_size=500, group_name="dataset2", seed=42):
    """Write a reduced HDF5 file holding a random subset of the clips of one group.

    The output file contains a single group named ``group_name`` with the
    sub-groups ``keypoints`` and ``embeddings`` (and ``labels`` when the input
    group has one). Clip names are taken from the input ``keypoints`` sub-group.

    Args:
        input_path: Path of the source HDF5 file (opened read-only).
        output_path: Path of the HDF5 file to create; an existing file is overwritten.
        sample_size: Maximum number of clips to keep. If the group has fewer
            clips, all of them are kept. ``None`` keeps every clip.
        group_name: Name of the top-level group to sample from.
        seed: Seed for the shuffle, so the same seed selects the same clips.

    Raises:
        ValueError: If ``sample_size`` is negative.
        KeyError: If the group, one of its required sub-groups, or a selected
            clip's embedding/label is missing from the input file.
    """
    if sample_size is not None and sample_size < 0:
        # A negative slice bound would silently drop clips from the tail instead.
        raise ValueError(f"sample_size must be non-negative, got {sample_size}")

    # Private generator: same sequence as random.seed(seed) + random.shuffle,
    # but without altering the global RNG state used by the rest of the notebook.
    rng = random.Random(seed)

    with h5py.File(input_path, "r") as fin:
        # Resolve everything on the input side first, so a wrong group name
        # fails before the output file is created or truncated.
        fin_group = fin[group_name]
        src_keypoints = fin_group["keypoints"]
        src_embeddings = fin_group["embeddings"]
        src_labels = fin_group["labels"] if "labels" in fin_group else None

        clips = list(src_keypoints.keys())
        rng.shuffle(clips)
        selected = clips[:sample_size]

        with h5py.File(output_path, "w") as fout:
            fout_group = fout.create_group(group_name)
            fout_k = fout_group.create_group("keypoints")
            fout_e = fout_group.create_group("embeddings")
            fout_l = fout_group.create_group("labels") if src_labels is not None else None

            for clip in selected:
                # `[()]` reads the whole dataset whatever its shape (scalar
                # datasets included), unlike `[:]`.
                fout_k.create_dataset(clip, data=src_keypoints[clip][()], compression="gzip")
                fout_e.create_dataset(clip, data=src_embeddings[clip][()])
                if fout_l is not None:
                    fout_l.create_dataset(clip, data=src_labels[clip][()])

    # Report the number of clips actually written, not the number requested.
    print(f"✅ Guardado {len(selected)} clips en '{output_path}'")

# Ejemplo de uso:
# make_small_h5("dataset_grande.h5", "dataset_pequeño.h5", sample_size=300)


In [3]:
def make_small_h5_ratio(input_path, output_path, sample_ratio=0.05, group_name="dataset2", seed=42):
    """Write a reduced HDF5 file keeping a random fraction of the clips of one group.

    The output file contains a single group named ``group_name`` with the
    sub-groups ``keypoints`` and ``embeddings`` (and ``labels`` when the input
    group has one). Clip names are taken from the input ``keypoints`` sub-group.

    Args:
        input_path: Path of the source HDF5 file (opened read-only).
        output_path: Path of the HDF5 file to create; an existing file is overwritten.
        sample_ratio: Fraction of clips to keep, in ``[0, 1]`` (e.g. ``0.05``
            for 5%). This is a fraction, not a percentage. At least one clip
            is kept whenever the group is not empty.
        group_name: Name of the top-level group to sample from.
        seed: Seed for the sampling, so the same seed selects the same clips.

    Raises:
        ValueError: If ``sample_ratio`` is outside ``[0, 1]``.
        KeyError: If the group, one of its required sub-groups, or a selected
            clip's embedding/label is missing from the input file.
    """
    if not 0 <= sample_ratio <= 1:
        raise ValueError(
            f"sample_ratio must be a fraction between 0 and 1, got {sample_ratio}"
        )

    # Private generator: same sequence as random.seed(seed) + random.sample,
    # but without altering the global RNG state used by the rest of the notebook.
    rng = random.Random(seed)

    with h5py.File(input_path, "r") as fin:
        # Resolve everything on the input side first, so a wrong group name
        # fails before the output file is created or truncated.
        fin_group = fin[group_name]
        src_keypoints = fin_group["keypoints"]
        src_embeddings = fin_group["embeddings"]
        src_labels = fin_group["labels"] if "labels" in fin_group else None

        clips = list(src_keypoints.keys())
        total = len(clips)
        # Keep at least one clip, but never ask for more than exist: an empty
        # group yields n == 0 instead of an error from the sampler.
        n = min(total, max(1, int(total * sample_ratio)))

        print(f"➡️  Tomando {n} de {total} clips ({sample_ratio*100:.1f}%)")

        selected = rng.sample(clips, n)

        with h5py.File(output_path, "w") as fout:
            fout_group = fout.create_group(group_name)
            fout_k = fout_group.create_group("keypoints")
            fout_e = fout_group.create_group("embeddings")
            fout_l = fout_group.create_group("labels") if src_labels is not None else None

            for clip in selected:
                # `[()]` reads the whole dataset whatever its shape (scalar
                # datasets included), unlike `[:]`.
                fout_k.create_dataset(clip, data=src_keypoints[clip][()], compression="gzip")
                fout_e.create_dataset(clip, data=src_embeddings[clip][()])
                if fout_l is not None:
                    fout_l.create_dataset(clip, data=src_labels[clip][()])

    print(f"✅ Guardado en '{output_path}'")


In [4]:

# Source HDF5 file to sample from.
data_path = "../../../../data/dataset.hdf5"
# NOTE: despite its name this value is a percentage (25 -> 25%). It is divided
# by 100 in the call below and also used as the suffix of the output file name.
sample_ratio = 25
output_path = f"../../../../data/dataset_small{sample_ratio}.hdf5"

# Fixed-size alternative; `sample_size` is not defined in this cell and must be
# set before enabling this call.
#make_small_h5(data_path, output_path, sample_size=sample_size, group_name="dataset2", seed=42)
make_small_h5_ratio(data_path, output_path, sample_ratio=sample_ratio/100, group_name="dataset2", seed=42)


➡️  Tomando 2114 de 8459 clips (25.0%)


✅ Guardado en '../../../../data/dataset_small25.hdf5'


# clean dataset 

In [None]:
def fix_keypoinys(keypoints, start=117):
    """Return a copy of ``keypoints`` without the first ``start`` entries of axis 1.

    Args:
        keypoints: 3-D torch tensor. The caller-side docstring in this notebook
            describes it as (T, J, 2) — TODO confirm the axis meaning.
        start: Index along axis 1 from which entries are kept. The default of
            117 is the value previously hard-coded here; which landmarks it
            discards is not documented in this notebook — TODO confirm.

    Returns:
        A new tensor equal to ``keypoints[:, start:, :]``. It is cloned so
        that later in-place edits do not touch the input tensor.

    Raises:
        ValueError: If ``keypoints`` is not 3-dimensional.
    """
    # Explicit rank check replacing the former unused `T, N, _ = shape` unpacking.
    if len(keypoints.shape) != 3:
        raise ValueError(
            f"keypoints must be 3-dimensional, got shape {tuple(keypoints.shape)}"
        )
    return keypoints[:, start:, :].clone()


In [10]:
from torch import as_tensor
import torch

def clean_fn(keypoints_np):
    """Adapter that lets `fix_keypoinys` be applied to array-like keypoints.

    The input is converted to a float32 torch tensor, passed through
    `fix_keypoinys`, and the result is handed back as a NumPy array.
    """
    tensor = torch.as_tensor(keypoints_np, dtype=torch.float32)
    return fix_keypoinys(tensor).numpy()


In [None]:
def make_clean_h5_all_groups(input_path, output_path, clean_keypoints_fn=None):
    """Copy every top-level group of an HDF5 file, optionally cleaning the keypoints.

    For each top-level group of the input file, the output file receives a
    group of the same name with the sub-groups ``keypoints`` and ``embeddings``
    (and ``labels`` when the input group has one). Clip names are taken from
    the input ``keypoints`` sub-group.

    Args:
        input_path: Path of the source HDF5 file (opened read-only).
        output_path: Path of the HDF5 file to create; an existing file is overwritten.
        clean_keypoints_fn: Optional callable applied to each clip's keypoints
            array before it is written. The original author describes its
            input as an array of shape (T, J, 2). When ``None``, keypoints are
            copied unchanged.

    Raises:
        KeyError: If a group lacks ``keypoints``/``embeddings`` or a clip's
            embedding/label is missing.
    """
    with h5py.File(input_path, "r") as fin, h5py.File(output_path, "w") as fout:
        group_names = list(fin.keys())
        print(f"📁 Grupos encontrados: {group_names}")

        for group_name in group_names:
            print(f"\n➡️ Procesando grupo '{group_name}'...")
            fin_group = fin[group_name]
            src_keypoints = fin_group["keypoints"]
            src_embeddings = fin_group["embeddings"]
            src_labels = fin_group["labels"] if "labels" in fin_group else None

            fout_group = fout.create_group(group_name)
            fout_k = fout_group.create_group("keypoints")
            fout_e = fout_group.create_group("embeddings")
            fout_l = fout_group.create_group("labels") if src_labels is not None else None

            clips = list(src_keypoints.keys())
            print(f"  🧩 {len(clips)} clips")

            for clip in clips:
                # `[()]` reads the whole dataset whatever its shape (scalar
                # datasets included), unlike `[:]`.
                keypoints = src_keypoints[clip][()]
                if clean_keypoints_fn is not None:
                    keypoints = clean_keypoints_fn(keypoints)

                fout_k.create_dataset(clip, data=keypoints, compression="gzip")
                fout_e.create_dataset(clip, data=src_embeddings[clip][()])
                if fout_l is not None:
                    fout_l.create_dataset(clip, data=src_labels[clip][()])

            # One summary line per group instead of one (malformed) line per
            # clip, which flooded the cell output with thousands of lines.
            if clean_keypoints_fn is not None:
                print(f"  🔧 Keypoints limpiados para {len(clips)} clips")

    print(f"\n✅ Dataset limpio guardado en: {output_path}")


In [None]:

make_clean_h5_all_groups(
    input_path="../../../../data/dataset1.hdf5",
    output_path="../../../../data/dataset_clean.hdf5",
    clean_keypoints_fn=clean_fn
)


📁 Grupos encontrados: ['dataset1', 'dataset2', 'dataset3', 'dataset4', 'dataset5', 'dataset6', 'dataset7']

➡️ Procesando grupo 'dataset1'...
  🧩 3200 clips
