In [1]:
import os
import json
import h5py
import numpy as np

import robomimic
import robomimic.utils.file_utils as FileUtils
import robomimic.utils.env_utils as EnvUtils
import robomimic.utils.obs_utils as ObsUtils
import imageio
import tqdm
from robomimic.utils.file_utils import create_hdf5_filter_key

In [2]:
dataset_path = "/home/carl_lab/data_franka/imgsd_demo/demo.hdf5"

f_org = h5py.File(dataset_path, "r")
demos = list(f_org["data"].keys())

In [3]:
lengths=[]
for demo_name in demos:
    demo=f_org['data'][demo_name]
    num_samples=demo.attrs['num_samples']
    lengths.append(num_samples)

lengths=np.array(lengths)

print('Number of demos: ', len(demos))
print('Max length: ', np.max(lengths))
print('Min length: ', np.min(lengths))
print('Mean length: ', np.mean(lengths))

Number of demos:  82
Max length:  422
Min length:  156
Mean length:  251.4512195121951


In [33]:
import random
mask_keys = list(f_org['mask'].keys())
    
# Create a dictionary to store selected demonstrations
selected_demos = []

# Total demonstrations needed
target_count = 60

# Proportional allocation
demo_counts = {key: len(f_org['mask'][key]) for key in mask_keys}
total_demos = sum(demo_counts.values())
demo_quota = {key: round(target_count * count / total_demos) for key, count in demo_counts.items()}

# Collect demonstrations
for key in mask_keys:
    # Get all demonstrations from this mask
    all_demos = list(f_org['mask'][key])
    
    # Randomly sample required number of demonstrations
    sampled_demos = random.sample(all_demos, min(demo_quota[key], len(all_demos)))
    
    # Append to selected demonstrations
    selected_demos.extend(sampled_demos)
    # Print statements for debugging
    print(f"Processing mask: {key}")
    print(f"Total demos in mask '{key}': {len(all_demos)}")
    print(f"Sampling {len(sampled_demos)} demos from mask '{key}'.")


Processing mask: akash_100
Total demos in mask 'akash_100': 35
Sampling 12 demos from mask 'akash_100'.
Processing mask: marzan_73
Total demos in mask 'marzan_73': 51
Sampling 17 demos from mask 'marzan_73'.
Processing mask: ola_114
Total demos in mask 'ola_114': 94
Sampling 31 demos from mask 'ola_114'.


In [10]:
demos = selected_demos

In [11]:
dataset_path_sub = dataset_path
dataset_path_sub = dataset_path_sub.replace(".hdf5", "")+"_sub60.hdf5"
dataset_path_sub

'/home/carl_lab/data_franka/combined_oma_good_180_sub60.hdf5'

In [12]:
f_sub = h5py.File(dataset_path_sub, "w")

In [13]:
f_sub.create_group("data")
f_sub.create_group("mask")

<HDF5 group "/mask" (0 members)>

In [14]:
def copy_group(src_group, dest_group):
    # Copy attributes
    for attr_name, attr_value in src_group.attrs.items():
        dest_group.attrs[attr_name] = attr_value

    for key, item in src_group.items():
        if isinstance(item, h5py.Group):
            # Create a new group in the destination and recursively copy contents
            new_group = dest_group.create_group(key)
            copy_group(item, new_group)
        elif isinstance(item, h5py.Dataset):
            # Copy datasets
            dataset = dest_group.create_dataset(key, data=item[...])
            # Copy attributes for the dataset
            for attr_name, attr_value in item.attrs.items():
                dataset.attrs[attr_name] = attr_value

In [15]:
def copy_demos(f_src, f_dest, demos2copy, next_id):
    new_demo_names=[]
    i=next_id
    for demo_name in tqdm.tqdm(demos2copy):
        demo=f_src['data'][demo_name]  

        demo_name_new="demo_"+str(i) 
        new_demo=f_dest["data"].create_group(demo_name_new) 
        copy_group(demo, new_demo) 

        new_demo_names.append(demo_name_new)
        i = i+1

    return i , new_demo_names


In [16]:
next_id , new_demo_names=copy_demos(f_src=f_org, f_dest=f_sub, demos2copy=demos, next_id=0)

100%|██████████| 60/60 [00:49<00:00,  1.22it/s]


In [19]:
f_sub['mask'].keys()
mask_keys = f_sub['mask'].keys()
    
# Iterate through each key and count the number of demos
demo_counts = {key: len(f_sub['mask'][key]) for key in mask_keys}

# Print the results
for key, count in demo_counts.items():
    print(f"Mask '{key}' contains {count} demonstrations.")

In [20]:
f_org['mask'].keys()
mask_keys = f_org['mask'].keys()
    
# Iterate through each key and count the number of demos
demo_counts = {key: len(f_org['mask'][key]) for key in mask_keys}

# Print the results
for key, count in demo_counts.items():
    print(f"Mask '{key}' contains {count} demonstrations.")

Mask 'akash_100' contains 35 demonstrations.
Mask 'marzan_73' contains 51 demonstrations.
Mask 'ola_114' contains 94 demonstrations.


In [21]:
f_org.close()
f_sub.close()

check if decreasing size worked

In [22]:
dataset_path_base="/home/carl_lab/data_franka/combined_oma_good_180_sub60.hdf5"

In [23]:
f_base = h5py.File(dataset_path_base, "r")
demos = list(f_base["data"].keys())

lengths=[]
demos_minmax={}
for demo_name in demos:
    demo=f_base['data'][demo_name]
    num_samples=demo.attrs['num_samples']
    lengths.append(num_samples)

    action=f_base['data'][demo_name]['actions']
    action=np.array(action) 
    demos_minmax[demo_name] = (np.min(action, axis=0), np.max(action, axis=0))


lengths=np.array(lengths)

print('Number of demos: ', len(demos))
print('Max length: ', np.max(lengths))
print('Min length: ', np.min(lengths))
print('Mean length: ', np.mean(lengths))
print('Median length: ', np.median(lengths))
print('') 

Number of demos:  60
Max length:  328
Min length:  93
Mean length:  179.48333333333332
Median length:  158.5



In [24]:
f_base['mask'].keys()
mask_keys = f_base['mask'].keys()
    
# Iterate through each key and count the number of demos
demo_counts = {key: len(f_base['mask'][key]) for key in mask_keys}

# Print the results
for key, count in demo_counts.items():
    print(f"Mask '{key}' contains {count} demonstrations.")

In [26]:
f_base['mask'].keys()

<KeysViewHDF5 []>

In [25]:
demo_names = demos[0]
demo = f_base['data'][demo_names]
actions = f_base['data'][demo_names]['actions'][:]
actions.shape

(154, 7)