In [1]:
# Essential imports
import os
import sys
import time
import pandas as pd
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin
import requests
from huggingface_hub import HfApi

# The lerobot checkout lives outside this project, so it is put on sys.path by
# hand. Both candidate locations are resolved relative to the directory the
# kernel was started in: first two levels up, then one level up.
current_dir = Path.cwd()
lerobot_path = current_dir.parent.parent / "lerobot"

if not lerobot_path.exists():
    print(f"Warning: lerobot path not found at {lerobot_path}")
    # Fall back to a sibling of the current directory's parent
    alt_path = current_dir.parent / "lerobot"
    if not alt_path.exists():
        print(f"Alternative path also not found: {alt_path}")
    else:
        sys.path.insert(0, str(alt_path))
        print(f"Found lerobot at alternative path: {alt_path}")
else:
    # Prepend so this checkout wins over any other lerobot on the path
    sys.path.insert(0, str(lerobot_path))
    print(f"Added lerobot path: {lerobot_path}")

# LeRobot specific imports  
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata

  from .autonotebook import tqdm as notebook_tqdm


Added lerobot path: /Users/michelmeyer/Library/CloudStorage/Dropbox/Dev/lerobot


In [2]:
def get_lerobot_datasets():
    """Return the sorted ids of all datasets listed under the ``lerobot`` author.

    The listing comes from ``HfApi.list_datasets`` with no result limit.
    If the listing fails, the error is printed and an empty list is
    returned instead of raising.
    """
    hub = HfApi()

    try:
        listing = hub.list_datasets(author="lerobot", limit=None)
        names = []
        for entry in listing:
            names.append(entry.id)
        names.sort()
        return names
    except Exception as e:
        # Best effort: report the failure and let the caller see "no datasets"
        print(f"Error fetching datasets: {e}")
        return []

print("Discovering LeRobot datasets...")
available_datasets = get_lerobot_datasets()

# Preview only the head of the listing; the remainder is reported as a count.
print(f"Found {len(available_datasets)} datasets:")
for position, dataset_name in enumerate(available_datasets[:10], start=1):
    print(f"  {position:2d}. {dataset_name}")

remaining = len(available_datasets) - 10
if remaining > 0:
    print(f"  ... and {remaining} more")

# Group datasets by "family": the first underscore-separated token of the
# repo name once the "lerobot/" prefix is stripped.
print(f"\nDataset Family Analysis:")
families = {}
for repo_id in available_datasets:
    family = repo_id.replace("lerobot/", "").split('_')[0]
    if family in families:
        families[family] += 1
    else:
        families[family] = 1

# Largest families first; ties keep their first-seen order (stable sort).
sorted_families = sorted(families.items(), key=lambda item: -item[1])
for family, count in sorted_families[:10]:
    print(f"  {family}: {count} dataset(s)")

Discovering LeRobot datasets...
Found 103 datasets:
   1. lerobot/aloha_mobile_cabinet
   2. lerobot/aloha_mobile_chair
   3. lerobot/aloha_mobile_elevator
   4. lerobot/aloha_mobile_shrimp
   5. lerobot/aloha_mobile_wash_pan
   6. lerobot/aloha_mobile_wipe_wine
   7. lerobot/aloha_sim_insertion_human
   8. lerobot/aloha_sim_insertion_human_image
   9. lerobot/aloha_sim_insertion_scripted
  10. lerobot/aloha_sim_insertion_scripted_image
  ... and 93 more

Dataset Family Analysis:
  aloha: 29 dataset(s)
  berkeley: 8 dataset(s)
  xarm: 8 dataset(s)
  utokyo: 5 dataset(s)
  koch: 4 dataset(s)
  libero: 4 dataset(s)
  unitreeh1: 4 dataset(s)
  austin: 3 dataset(s)
  cmu: 3 dataset(s)
  dlr: 3 dataset(s)


In [3]:
def get_dataset_metadata(dataset_name, delay=1.0):
    """Collect summary metadata for one dataset without loading the full dataset.

    Args:
        dataset_name: Repo id of the dataset, e.g. ``"lerobot/pusht"``.
        delay: Seconds to sleep before building the metadata object (simple
            rate limiting). Defaults to 1.0, the value that used to be
            hard-coded; pass 0 or None to skip the pause, e.g. when the
            caller already throttles its own loop.

    Returns:
        A dict with the keys ``dataset_name``, ``category``, ``num_episodes``,
        ``total_samples``, ``fps``, ``robot_type``, ``camera_keys``,
        ``features`` and ``has_images``, in that order.

        If anything fails, the error is printed and a placeholder row is
        returned instead of raising: ``num_episodes``, ``total_samples``,
        ``fps`` and ``robot_type`` hold the string ``'Error'``, the two list
        fields are empty and ``has_images`` is False. Other cells filter on
        that ``'Error'`` sentinel, so it is kept as is.
    """
    # Fields derived from the name alone; shared by the success and error rows.
    base = {
        'dataset_name': dataset_name,
        'category': dataset_name.replace("lerobot/", "").split('_')[0],
    }

    try:
        # Rate limiting
        if delay and delay > 0:
            time.sleep(delay)

        metadata = LeRobotDatasetMetadata(dataset_name)
        camera_keys = metadata.camera_keys

        # The whole literal is evaluated before returning, so a failing
        # attribute falls through to the placeholder row below rather than
        # producing a half-filled record.
        return {
            **base,
            'num_episodes': metadata.total_episodes,
            'total_samples': metadata.total_frames,
            'fps': metadata.fps,
            'robot_type': metadata.robot_type,
            'camera_keys': camera_keys,
            'features': list(metadata.features.keys()),
            'has_images': len(camera_keys) > 0,
        }

    except Exception as e:
        print(f"    Error: {e}")
        return {
            **base,
            'num_episodes': 'Error',
            'total_samples': 'Error',
            'fps': 'Error',
            'robot_type': 'Error',
            'camera_keys': [],
            'features': [],
            'has_images': False,
        }

In [4]:
# Start this cell from a clean slate: wipe the visible output and drop any
# results left over from an earlier run.
import sys
from IPython.display import clear_output

clear_output(wait=True)

# Forget stale results so nothing from a previous run can leak into this one
for stale_name in ('dataset_metrics', 'df_metrics'):
    globals().pop(stale_name, None)

print("=== FRESH START ===")
print("Collecting metadata from all datasets...")
print("This may take a few minutes...")

# Every discovered dataset is processed; results accumulate as one dict per dataset
datasets_to_process = available_datasets
dataset_metrics = []

for done, dataset_name in enumerate(datasets_to_process, start=1):
    print(f"Processing {done}/{len(datasets_to_process)}: {dataset_name}")

    metadata = get_dataset_metadata(dataset_name)
    if metadata:
        dataset_metrics.append(metadata)

    # Extra pause after every fifth dataset, on top of the per-call delay
    if done % 5 == 0:
        print(f"-------- Taking a break...")
        time.sleep(2)

# One row per dataset, including the placeholder rows for failed loads
df_metrics = pd.DataFrame(dataset_metrics)

print(f"\n=== PROCESSING COMPLETE ===")
print(f"Collected metadata for {len(df_metrics)} datasets")
print(f"Successfully processed: {sum(1 for entry in dataset_metrics if entry['num_episodes'] != 'Error')}")

=== FRESH START ===
Collecting metadata from all datasets...
This may take a few minutes...
Processing 1/103: lerobot/aloha_mobile_cabinet
Processing 2/103: lerobot/aloha_mobile_chair
Processing 3/103: lerobot/aloha_mobile_elevator
Processing 4/103: lerobot/aloha_mobile_shrimp
Processing 5/103: lerobot/aloha_mobile_wash_pan
-------- Taking a break...
Processing 6/103: lerobot/aloha_mobile_wipe_wine
Processing 7/103: lerobot/aloha_sim_insertion_human
Processing 8/103: lerobot/aloha_sim_insertion_human_image
Processing 9/103: lerobot/aloha_sim_insertion_scripted
Processing 10/103: lerobot/aloha_sim_insertion_scripted_image
-------- Taking a break...
Processing 11/103: lerobot/aloha_sim_transfer_cube_human
Processing 12/103: lerobot/aloha_sim_transfer_cube_human_image
Processing 13/103: lerobot/aloha_sim_transfer_cube_scripted
Processing 14/103: lerobot/aloha_sim_transfer_cube_scripted_image
Processing 15/103: lerobot/aloha_static_battery
    Error: 
The dataset you requested (lerobot/alo

The dataset you requested (lerobot/berkeley_cable_routing) is in 2.0 format.
While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
stats instead of per-episode stats. Update your dataset stats to the new format using this command:
```
python lerobot/common/datasets/v21/convert_dataset_v20_to_v21.py --repo-id=lerobot/berkeley_cable_routing
```

If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).



-------- Taking a break...
Processing 36/103: lerobot/berkeley_fanuc_manipulation
Processing 37/103: lerobot/berkeley_gnm_cory_hall


The dataset you requested (lerobot/berkeley_gnm_cory_hall) is in 2.0 format.
While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
stats instead of per-episode stats. Update your dataset stats to the new format using this command:
```
python lerobot/common/datasets/v21/convert_dataset_v20_to_v21.py --repo-id=lerobot/berkeley_gnm_cory_hall
```

If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).



Processing 38/103: lerobot/berkeley_gnm_recon


The dataset you requested (lerobot/berkeley_gnm_recon) is in 2.0 format.
While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
stats instead of per-episode stats. Update your dataset stats to the new format using this command:
```
python lerobot/common/datasets/v21/convert_dataset_v20_to_v21.py --repo-id=lerobot/berkeley_gnm_recon
```

If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).



Processing 39/103: lerobot/berkeley_gnm_sac_son


The dataset you requested (lerobot/berkeley_gnm_sac_son) is in 2.0 format.
While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
stats instead of per-episode stats. Update your dataset stats to the new format using this command:
```
python lerobot/common/datasets/v21/convert_dataset_v20_to_v21.py --repo-id=lerobot/berkeley_gnm_sac_son
```

If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).



Processing 40/103: lerobot/berkeley_mvp
-------- Taking a break...
Processing 41/103: lerobot/berkeley_rpt
Processing 42/103: lerobot/cmu_franka_exploration_dataset
Processing 43/103: lerobot/cmu_play_fusion
Processing 44/103: lerobot/cmu_stretch
Processing 45/103: lerobot/columbia_cairlab_pusht_real
-------- Taking a break...
Processing 46/103: lerobot/conq_hose_manipulation


The dataset you requested (lerobot/conq_hose_manipulation) is in 2.0 format.
While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
stats instead of per-episode stats. Update your dataset stats to the new format using this command:
```
python lerobot/common/datasets/v21/convert_dataset_v20_to_v21.py --repo-id=lerobot/conq_hose_manipulation
```

If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).



Processing 47/103: lerobot/dlr_edan_shared_control
Processing 48/103: lerobot/dlr_sara_grid_clamp
Processing 49/103: lerobot/dlr_sara_pour
Processing 50/103: lerobot/droid_100
-------- Taking a break...
Processing 51/103: lerobot/fmb


The dataset you requested (lerobot/fmb) is in 2.0 format.
While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
stats instead of per-episode stats. Update your dataset stats to the new format using this command:
```
python lerobot/common/datasets/v21/convert_dataset_v20_to_v21.py --repo-id=lerobot/fmb
```

If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).



Processing 52/103: lerobot/iamlab_cmu_pickup_insert
Processing 53/103: lerobot/imperialcollege_sawyer_wrist_cam
Processing 54/103: lerobot/jaco_play
Processing 55/103: lerobot/kaist_nonprehensile
-------- Taking a break...
Processing 56/103: lerobot/koch_pick_place_1_lego
    Error: 
The dataset you requested (lerobot/koch_pick_place_1_lego) is in 1.6 format.

We introduced a new format since v2.0 which is not backward compatible with v1.x.
Please, use our conversion script. Modify the following command with your own task description:
```
python lerobot/common/datasets/v2/convert_dataset_v1_to_v2.py \
    --repo-id lerobot/koch_pick_place_1_lego \
    --single-task "TASK DESCRIPTION."  # <---- /!\ Replace TASK DESCRIPTION /!\
```

A few examples to replace TASK DESCRIPTION: "Pick up the blue cube and place it into the bin.", "Insert the
peg into the socket.", "Slide open the ziploc bag.", "Take the elevator to the 1st floor.", "Open the top
cabinet, store the pot inside it then close t

The dataset you requested (lerobot/libero_10_image) is in 2.0 format.
While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
stats instead of per-episode stats. Update your dataset stats to the new format using this command:
```
python lerobot/common/datasets/v21/convert_dataset_v20_to_v21.py --repo-id=lerobot/libero_10_image
```

If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).



-------- Taking a break...
Processing 61/103: lerobot/libero_goal_image


The dataset you requested (lerobot/libero_goal_image) is in 2.0 format.
While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
stats instead of per-episode stats. Update your dataset stats to the new format using this command:
```
python lerobot/common/datasets/v21/convert_dataset_v20_to_v21.py --repo-id=lerobot/libero_goal_image
```

If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).



Processing 62/103: lerobot/libero_object_image


The dataset you requested (lerobot/libero_object_image) is in 2.0 format.
While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
stats instead of per-episode stats. Update your dataset stats to the new format using this command:
```
python lerobot/common/datasets/v21/convert_dataset_v20_to_v21.py --repo-id=lerobot/libero_object_image
```

If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).



Processing 63/103: lerobot/libero_spatial_image


The dataset you requested (lerobot/libero_spatial_image) is in 2.0 format.
While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
stats instead of per-episode stats. Update your dataset stats to the new format using this command:
```
python lerobot/common/datasets/v21/convert_dataset_v20_to_v21.py --repo-id=lerobot/libero_spatial_image
```

If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).



Processing 64/103: lerobot/metaworld_mt50


The dataset you requested (lerobot/metaworld_mt50) is in 2.0 format.
While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
stats instead of per-episode stats. Update your dataset stats to the new format using this command:
```
python lerobot/common/datasets/v21/convert_dataset_v20_to_v21.py --repo-id=lerobot/metaworld_mt50
```

If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).



Processing 65/103: lerobot/metaworld_mt50_push_v2_image


The dataset you requested (lerobot/metaworld_mt50_push_v2_image) is in 2.0 format.
While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
stats instead of per-episode stats. Update your dataset stats to the new format using this command:
```
python lerobot/common/datasets/v21/convert_dataset_v20_to_v21.py --repo-id=lerobot/metaworld_mt50_push_v2_image
```

If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).



-------- Taking a break...
Processing 66/103: lerobot/nyu_door_opening_surprising_effectiveness
Processing 67/103: lerobot/nyu_franka_play_dataset
Processing 68/103: lerobot/nyu_rot_dataset
Processing 69/103: lerobot/pusht
Processing 70/103: lerobot/pusht_image
-------- Taking a break...
Processing 71/103: lerobot/pusht_keypoints


The dataset you requested (lerobot/pusht_keypoints) is in 2.0 format.
While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
stats instead of per-episode stats. Update your dataset stats to the new format using this command:
```
python lerobot/common/datasets/v21/convert_dataset_v20_to_v21.py --repo-id=lerobot/pusht_keypoints
```

If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).



Processing 72/103: lerobot/roboturk


The dataset you requested (lerobot/roboturk) is in 2.0 format.
While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
stats instead of per-episode stats. Update your dataset stats to the new format using this command:
```
python lerobot/common/datasets/v21/convert_dataset_v20_to_v21.py --repo-id=lerobot/roboturk
```

If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).



Processing 73/103: lerobot/stanford_hydra_dataset
Processing 74/103: lerobot/stanford_kuka_multimodal_dataset
Processing 75/103: lerobot/stanford_robocook
-------- Taking a break...
Processing 76/103: lerobot/taco_play
Processing 77/103: lerobot/test
    Error: 
The dataset you requested (lerobot/test) is in 1.5 format.

We introduced a new format since v2.0 which is not backward compatible with v1.x.
Please, use our conversion script. Modify the following command with your own task description:
```
python lerobot/common/datasets/v2/convert_dataset_v1_to_v2.py \
    --repo-id lerobot/test \
    --single-task "TASK DESCRIPTION."  # <---- /!\ Replace TASK DESCRIPTION /!\
```

A few examples to replace TASK DESCRIPTION: "Pick up the blue cube and place it into the bin.", "Insert the
peg into the socket.", "Slide open the ziploc bag.", "Take the elevator to the 1st floor.", "Open the top
cabinet, store the pot inside it then close the cabinet.", "Push the T-shaped block onto the T-shaped
t

The dataset you requested (lerobot/utokyo_xarm_pick_and_place) is in 2.0 format.
While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
stats instead of per-episode stats. Update your dataset stats to the new format using this command:
```
python lerobot/common/datasets/v21/convert_dataset_v20_to_v21.py --repo-id=lerobot/utokyo_xarm_pick_and_place
```

If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).



Processing 95/103: lerobot/viola
    Error: 
The dataset you requested (lerobot/viola) is in 1.6 format.

We introduced a new format since v2.0 which is not backward compatible with v1.x.
Please, use our conversion script. Modify the following command with your own task description:
```
python lerobot/common/datasets/v2/convert_dataset_v1_to_v2.py \
    --repo-id lerobot/viola \
    --single-task "TASK DESCRIPTION."  # <---- /!\ Replace TASK DESCRIPTION /!\
```

A few examples to replace TASK DESCRIPTION: "Pick up the blue cube and place it into the bin.", "Insert the
peg into the socket.", "Slide open the ziploc bag.", "Take the elevator to the 1st floor.", "Open the top
cabinet, store the pot inside it then close the cabinet.", "Push the T-shaped block onto the T-shaped
target.", "Grab the spray paint on the shelf and place it in the bin on top of the robot dog.", "Fold the
sweatshirt.", ...

If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/inv

In [7]:
print("Dataset Analysis Summary")
print("=" * 50)

# Row selectors: metadata loaded without error / at least one camera key
loaded_mask = df_metrics['num_episodes'].ne('Error')
camera_mask = df_metrics['has_images'].eq(True)

successful_df = df_metrics[loaded_mask]
total_datasets = len(df_metrics)
successful_loads = len(successful_df)
datasets_with_images = len(df_metrics[camera_mask])

print(f"Dataset Overview:")
print(f"  Total Processed: {total_datasets}")
print(f"  Successfully Loaded: {successful_loads}")
print(f"  With Camera Data: {datasets_with_images}")

# Totals only make sense over rows that loaded; failed rows hold 'Error' strings
if successful_loads > 0:
    total_episodes = successful_df['num_episodes'].sum()
    total_samples = successful_df['total_samples'].sum()
    print(f"  Total Episodes: {total_episodes:,}")
    print(f"  Total Samples: {total_samples:,}")

# Per-category counts: number of datasets and how many of them have cameras
print(f"\nCategory Summary:")
category_summary = (
    df_metrics.groupby('category')
    .agg({
        'dataset_name': 'count',
        'has_images': lambda flags: sum(1 for flag in flags if flag == True),
    })
    .rename(columns={'dataset_name': 'total', 'has_images': 'with_cameras'})
    .sort_values('total', ascending=False)
)
print(category_summary.head(10).to_string())

# A few example datasets that expose camera streams
if datasets_with_images > 0:
    print(f"\nSample datasets with cameras:")
    sample_with_cameras = df_metrics[camera_mask].head(5)
    for _, row in sample_with_cameras.iterrows():
        dataset_short = row['dataset_name'].replace('lerobot/', '')
        cameras = len(row['camera_keys'])
        episodes = 'Unknown' if row['num_episodes'] == 'Error' else row['num_episodes']
        print(f"  {dataset_short}: {episodes} episodes, {cameras} cameras")

Dataset Analysis Summary
Dataset Overview:
  Total Processed: 103
  Successfully Loaded: 92
  With Camera Data: 91
  Total Episodes: 65,318
  Total Samples: 7,823,344

Category Summary:
           total  with_cameras
category                      
aloha         29            25
berkeley       8             8
xarm           8             8
utokyo         5             5
koch           4             0
unitreeh1      4             4
libero         4             4
austin         3             3
stanford       3             3
cmu            3             3

Sample datasets with cameras:
  aloha_mobile_cabinet: 85 episodes, 3 cameras
  aloha_mobile_chair: 55 episodes, 3 cameras
  aloha_mobile_elevator: 20 episodes, 3 cameras
  aloha_mobile_shrimp: 18 episodes, 3 cameras
  aloha_mobile_wash_pan: 50 episodes, 3 cameras


In [8]:
def download_episode_videos(datasets_df, videos_folder="videos", max_datasets=None, max_workers=3):
    """Download the episode-0 video of every camera for datasets that have cameras.

    Args:
        datasets_df: DataFrame with at least the columns ``dataset_name``,
            ``has_images``, ``num_episodes`` and ``camera_keys``. Rows with
            ``has_images`` False or ``num_episodes == 'Error'`` are skipped.
        videos_folder: Root output directory; one sub-folder per dataset
            (repo name without the ``lerobot/`` prefix) is created beneath it.
        max_datasets: If truthy, only the first ``max_datasets`` eligible
            datasets are processed; None (or 0) means all of them.
        max_workers: Number of download threads.

    Returns:
        ``(downloaded_videos, failed_videos)``: two lists of dicts.
        Successful entries have ``status`` (``'downloaded'`` or ``'exists'``),
        ``dataset``, ``camera``, ``path`` and ``size`` (bytes). Failed entries
        have ``status='failed'``, ``dataset``, ``camera`` and ``error``.

    Each download is streamed into ``<target>.part`` and only moved to its
    final name once complete, so an interrupted transfer can never be mistaken
    for a finished file on the next run. The HTTP response is always closed,
    and a dataset folder is only created once a download has started.
    """

    print(f"Starting video download process...")

    # Filter to datasets with images only
    datasets_with_images = datasets_df[
        (datasets_df['has_images'] == True) &
        (datasets_df['num_episodes'] != 'Error')
    ].copy()

    if max_datasets:
        datasets_with_images = datasets_with_images.head(max_datasets)

    print(f"Target: {len(datasets_with_images)} datasets with cameras")

    # Create videos directory
    os.makedirs(videos_folder, exist_ok=True)

    downloaded_videos = []
    failed_videos = []

    def download_single_video(args):
        """Fetch one (dataset, camera) video; never raises, returns a status dict."""
        dataset_name, camera_key, dataset_folder = args
        dataset_short = dataset_name.replace('lerobot/', '')

        def outcome(status, **extra):
            # Keeps the key order of the result dicts stable: status first.
            return {'status': status, 'dataset': dataset_short, 'camera': camera_key, **extra}

        # Episode 0 of the first chunk, using a fixed repository layout.
        video_url = f"https://huggingface.co/datasets/{dataset_name}/resolve/main/videos/chunk-000/{camera_key}/episode_000000.mp4"
        video_path = os.path.join(dataset_folder, f"{camera_key.replace('.', '_')}_episode_000000.mp4")
        # Data lands here first; the final name only ever holds complete files.
        partial_path = video_path + ".part"

        try:
            if os.path.exists(video_path):
                return outcome('exists', path=video_path, size=os.path.getsize(video_path))

            # The context manager closes the streamed response (and releases
            # its connection) on every exit path, including non-200 replies.
            with requests.get(video_url, timeout=30, stream=True) as response:
                if response.status_code != 200:
                    return outcome('failed', error=f"HTTP {response.status_code}")

                # Create the folder only now, so 404s leave no empty directories.
                os.makedirs(dataset_folder, exist_ok=True)
                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

            # Atomic rename: the file appears under its final name fully written.
            os.replace(partial_path, video_path)
            return outcome('downloaded', path=video_path, size=os.path.getsize(video_path))

        except Exception as e:
            error = str(e)
            # Remove whatever was written so a retry starts from scratch.
            try:
                os.remove(partial_path)
            except FileNotFoundError:
                pass  # nothing had been written yet
            except OSError as cleanup_error:
                error = f"{error} (could not remove partial file: {cleanup_error})"
            return outcome('failed', error=error)

    # Prepare download tasks: one per (dataset, camera) pair
    download_tasks = []
    for _, row in datasets_with_images.iterrows():
        dataset_name = row['dataset_name']
        dataset_short = dataset_name.replace('lerobot/', '')
        dataset_folder = os.path.join(videos_folder, dataset_short)

        # Download from all cameras
        for camera_key in row['camera_keys']:
            download_tasks.append((dataset_name, camera_key, dataset_folder))

    # Execute downloads in parallel
    print(f"Downloading {len(download_tasks)} videos using {max_workers} workers...")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_task = {executor.submit(download_single_video, task): task for task in download_tasks}

        # Results are reported in completion order, not submission order.
        for i, future in enumerate(as_completed(future_to_task)):
            result = future.result()

            if result['status'] in ['downloaded', 'exists']:
                downloaded_videos.append(result)
                status_text = "Downloaded" if result['status'] == 'downloaded' else "Found"
                size_mb = result['size'] / (1024*1024)
                print(f"  {status_text}: {result['dataset']}/{result['camera']} ({size_mb:.1f}MB)")
            else:
                failed_videos.append(result)
                print(f"  Failed: {result['dataset']}/{result['camera']}: {result['error']}")

            # Progress update
            if (i + 1) % 10 == 0:
                print(f"    Progress: {i+1}/{len(download_tasks)} completed")

    # Summary
    print(f"\nDownload Summary:")
    print(f"  Successfully downloaded: {sum(1 for v in downloaded_videos if v['status'] == 'downloaded')}")
    print(f"  Already existed: {sum(1 for v in downloaded_videos if v['status'] == 'exists')}")
    print(f"  Failed: {len(failed_videos)}")

    total_size = sum(v.get('size', 0) for v in downloaded_videos)
    print(f"  Total size: {total_size / (1024*1024):.1f} MB")

    return downloaded_videos, failed_videos

print("Video download functions ready!")

Video download functions ready!


In [9]:
# Run the downloads only if the metadata table from the earlier cells exists
# and has rows; otherwise leave empty result lists behind.
if 'df_metrics' not in locals() or df_metrics.empty:
    print("No dataset metrics available. Run the previous cells first.")
    downloaded_videos, failed_videos = [], []

else:
    print("Starting video download process...")

    # max_datasets=None processes every dataset with cameras; set an integer
    # here to limit the run.
    download_options = dict(
        videos_folder="videos",
        max_datasets=None,
        max_workers=3,
    )
    downloaded_videos, failed_videos = download_episode_videos(df_metrics, **download_options)

    print(f"\nVideo download completed!")
    print(f"Check the 'videos/' folder for downloaded content")

Starting video download process...
Starting video download process...
Target: 91 datasets with cameras
Downloading 175 videos using 3 workers...
  Failed: aloha_mobile_cabinet/observation.images.cam_right_wrist: HTTP 404
  Failed: aloha_mobile_cabinet/observation.images.cam_left_wrist: HTTP 404
  Failed: aloha_mobile_cabinet/observation.images.cam_high: HTTP 404
  Failed: aloha_mobile_cabinet/observation.images.cam_right_wrist: HTTP 404
  Failed: aloha_mobile_cabinet/observation.images.cam_left_wrist: HTTP 404
  Failed: aloha_mobile_cabinet/observation.images.cam_high: HTTP 404
  Downloaded: aloha_mobile_chair/observation.images.cam_right_wrist (9.1MB)
  Downloaded: aloha_mobile_chair/observation.images.cam_right_wrist (9.1MB)
  Downloaded: aloha_mobile_chair/observation.images.cam_left_wrist (9.8MB)
  Downloaded: aloha_mobile_chair/observation.images.cam_left_wrist (9.8MB)
  Downloaded: aloha_mobile_elevator/observation.images.cam_high (9.2MB)
  Downloaded: aloha_mobile_elevator/obser

In [10]:
def _format_count(value):
    """Format ``value`` as a thousands-separated integer, or ``'Unknown'``.

    Anything ``int()`` cannot convert (the ``'Error'`` marker, ``None``, NaN,
    infinities) becomes ``'Unknown'``, so the result is always a plain string
    that can be dropped into the page without a numeric format spec.
    """
    try:
        return f"{int(value):,}"
    except (TypeError, ValueError, OverflowError):
        return 'Unknown'


def _render_video_section(dataset_videos, videos_folder="videos"):
    """Build the HTML for one dataset card's video area.

    Returns a ``video-grid`` div with one player per entry of
    ``dataset_videos``, or the ``no-videos`` placeholder when the list is empty.
    """
    # Local import: keeps the helper self-contained without touching the
    # notebook's import cell.
    from html import escape

    if not dataset_videos:
        return '<div class="no-videos">No local videos available for this dataset</div>'

    # Normalise to forward slashes / no trailing slash so the URL is well formed.
    folder = Path(videos_folder).as_posix()

    pieces = ['<div class="video-grid">']
    for video in dataset_videos:
        camera_display = escape(
            video['camera'].replace('observation.images.', '').replace('.', ' ')
        )
        # Only the file name of the stored path is used; the link is rebuilt
        # as <videos_folder>/<dataset>/<file>.
        relative_path = escape(
            f"{folder}/{video['dataset']}/{os.path.basename(video['path'])}",
            quote=True,
        )
        # `or 0` also covers an explicit None size.
        file_size = (video.get('size') or 0) / (1024*1024)

        pieces.append(f"""
                <div class="video-container">
                    <div class="camera-label">Camera: {camera_display}</div>
                    <video class="video-player" controls preload="metadata">
                        <source src="{relative_path}" type="video/mp4">
                        Your browser does not support the video tag.
                    </video>
                    <a href="{relative_path}" target="_blank" class="video-link">Download ({file_size:.1f} MB)</a>
                </div>
                """)
    pieces.append('</div>')
    return ''.join(pieces)


def create_dataset_html_with_videos(datasets_df, downloaded_videos, output_file="lerobot_datasets_videos.html", videos_folder="videos"):
    """Create an interactive HTML page with embedded videos.

    Args:
        datasets_df: DataFrame with one row per dataset. Reads the columns
            ``dataset_name``, ``num_episodes`` and ``category``, plus
            ``total_samples`` when present. Rows whose ``num_episodes`` equals
            the string ``'Error'`` are left out of the page.
        downloaded_videos: Iterable of dicts with keys ``dataset`` (dataset
            name without the ``lerobot/`` prefix), ``camera``, ``path`` and,
            optionally, ``size`` in bytes.
        output_file: Path of the HTML file to write (UTF-8).
        videos_folder: Folder prefix used for the video links in the page.
            Links are written as ``<videos_folder>/<dataset>/<file name>`` and
            are therefore resolved by the browser relative to the HTML file.

    Returns:
        ``output_file``, unchanged.

    Notes:
        * Sample/episode counts that cannot be converted to an integer are
          shown as ``Unknown`` instead of raising.
        * All dataset-derived text is HTML-escaped before being inserted.
    """
    # Local import: keeps the function self-contained without touching the
    # notebook's import cell.
    from html import escape

    print(f"Creating HTML page: {output_file}")

    # Group the videos by dataset once, instead of re-scanning the full list
    # for every dataset card.
    videos_by_dataset = {}
    for video in downloaded_videos:
        videos_by_dataset.setdefault(video['dataset'], []).append(video)

    # Filter successful datasets
    successful_df = datasets_df[datasets_df['num_episodes'] != 'Error'].copy()
    if len(successful_df) > 0:
        successful_df['num_episodes'] = pd.to_numeric(successful_df['num_episodes'])
        successful_df = successful_df.sort_values('num_episodes', ascending=False)

    # HTML template
    html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>LeRobot Datasets - Explorer ({len(successful_df)} datasets)</title>
    <style>
        body {{
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            max-width: 1400px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f8f9fa;
            line-height: 1.6;
        }}
        .header {{
            text-align: center;
            margin-bottom: 40px;
            padding: 30px;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            border-radius: 10px;
            box-shadow: 0 4px 6px rgba(0,0,0,0.1);
        }}
        .stats {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 20px;
            margin: 30px 0;
        }}
        .stat-card {{
            background: white;
            padding: 20px;
            border-radius: 10px;
            text-align: center;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }}
        .stat-number {{
            font-size: 2em;
            font-weight: bold;
            color: #667eea;
        }}
        .dataset-grid {{
            display: grid;
            gap: 25px;
            grid-template-columns: repeat(auto-fit, minmax(500px, 1fr));
        }}
        .dataset-card {{
            background: white;
            border-radius: 12px;
            padding: 25px;
            box-shadow: 0 3px 12px rgba(0,0,0,0.1);
            transition: transform 0.2s;
        }}
        .dataset-card:hover {{
            transform: translateY(-2px);
            box-shadow: 0 5px 20px rgba(0,0,0,0.15);
        }}
        .dataset-name {{
            font-size: 1.4em;
            font-weight: bold;
            margin-bottom: 15px;
            color: #2c3e50;
        }}
        .dataset-stats {{
            margin: 15px 0;
            color: #666;
        }}
        .category-tag {{
            display: inline-block;
            background: #e3f2fd;
            color: #1976d2;
            padding: 4px 12px;
            border-radius: 20px;
            font-size: 0.8em;
            font-weight: bold;
            margin-right: 10px;
        }}
        .video-section {{
            margin-top: 20px;
        }}
        .video-grid {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
            gap: 15px;
            margin-top: 15px;
        }}
        .video-container {{
            text-align: center;
        }}
        .camera-label {{
            font-weight: bold;
            margin-bottom: 8px;
            color: #495057;
            font-size: 0.9em;
        }}
        video {{
            width: 100%;
            max-width: 300px;
            height: auto;
            border-radius: 8px;
            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
        }}
        .no-videos {{
            background: #e9ecef;
            color: #6c757d;
            padding: 30px 20px;
            border-radius: 8px;
            text-align: center;
            font-style: italic;
        }}
        .video-link {{
            display: block;
            margin-top: 5px;
            font-size: 0.8em;
            color: #667eea;
            text-decoration: none;
        }}
        .video-link:hover {{
            text-decoration: underline;
        }}
    </style>
</head>
<body>
    <div class="header">
        <h1>LeRobot Datasets Explorer</h1>
        <p>Interactive exploration of {len(successful_df)} robot learning datasets</p>
        <p style="opacity: 0.9; font-size: 0.9em;">Generated on {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
    </div>
"""

    # Statistics section (only meaningful when at least one dataset loaded)
    if len(successful_df) > 0:
        total_episodes = successful_df['num_episodes'].sum()
        if 'total_samples' in successful_df.columns:
            # Coerce so that 'Error' markers are skipped instead of breaking the sum.
            total_samples = pd.to_numeric(successful_df['total_samples'], errors='coerce').sum()
        else:
            total_samples = 0
        datasets_with_videos = len(videos_by_dataset)

        html_content += f"""    <div class="stats">
        <div class="stat-card">
            <div class="stat-number">{len(successful_df)}</div>
            <div>Datasets</div>
        </div>
        <div class="stat-card">
            <div class="stat-number">{int(total_episodes):,}</div>
            <div>Episodes</div>
        </div>
        <div class="stat-card">
            <div class="stat-number">{int(total_samples):,}</div>
            <div>Samples</div>
        </div>
        <div class="stat-card">
            <div class="stat-number">{datasets_with_videos}</div>
            <div>With Videos</div>
        </div>
    </div>

"""

    # Always open the grid: the footer below closes it unconditionally, so
    # opening it only inside the stats branch left the markup unbalanced when
    # there were no successful datasets.
    html_content += """    <div class="dataset-grid">
"""

    # Generate dataset cards
    for _, row in successful_df.iterrows():
        dataset_short = str(row['dataset_name']).replace('lerobot/', '')

        # Pre-formatted strings: a value that cannot be converted (e.g. the
        # 'Error' marker or a missing column) renders as 'Unknown'.
        episodes = _format_count(row['num_episodes'])
        samples = _format_count(row.get('total_samples'))

        video_section_html = _render_video_section(
            videos_by_dataset.get(dataset_short, []),
            videos_folder,
        )

        html_content += f"""        <div class="dataset-card">
            <div class="dataset-name">{escape(dataset_short)}</div>
            <div class="dataset-stats">
                <span class="category-tag">{escape(str(row['category']))}</span>
                <strong>{episodes}</strong> episodes • <strong>{samples}</strong> samples
            </div>
            <div class="video-section">
                {video_section_html}
            </div>
        </div>
"""

    # Close HTML
    html_content += """    </div>
    
    <div style="text-align: center; margin-top: 40px; color: #7f8c8d;">
        <p>Generated from LeRobot Dataset Exploration Notebook</p>
        <p><a href="https://github.com/huggingface/lerobot" target="_blank">LeRobot GitHub</a> • 
           <a href="https://lerobot.huggingface.co" target="_blank">Documentation</a></p>
        <p><small>Videos stored locally for offline browsing</small></p>
    </div>
</body>
</html>"""

    # Write HTML file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(html_content)

    file_path = Path(output_file).resolve()
    print(f"HTML file created: {output_file}")
    print(f"Open in browser: file://{file_path}")

    return output_file

# Progress marker: last statement of this cell, printed once the cell has run to the end.
print("HTML generation function ready!")

HTML generation function ready!


In [11]:
# Generate the HTML page, then try to open it in the default browser.
# Both inputs come from earlier cells; bail out with a hint if either is missing.
if 'df_metrics' in locals() and 'downloaded_videos' in locals():
    print("Generating final HTML page...")

    html_file = create_dataset_html_with_videos(
        df_metrics,
        downloaded_videos,
        output_file="lerobot_datasets_videos.html"
    )

    # Path.as_uri() builds a proper file URI: it percent-encodes spaces and
    # other special characters and produces the right form on Windows, which
    # plain 'file://' + path concatenation does not.
    page_uri = Path(html_file).resolve().as_uri()

    # Try to open in browser
    import webbrowser
    try:
        # webbrowser.open() signals failure by returning False, so its result
        # must be checked; an exception is not the normal failure path.
        opened = webbrowser.open(page_uri)
    except Exception as exc:
        print(f"Browser launch failed: {exc}")
        opened = False

    if opened:
        print(f"Opened in browser automatically")
    else:
        print(f"Manual link: {page_uri}")

    # Final statistics
    print(f"\nFinal Project Statistics:")
    print(f"  Datasets processed: {len(df_metrics)}")
    print(f"  Videos downloaded: {len(downloaded_videos)}")
    print(f"  HTML file: {html_file}")

else:
    print("Missing data. Please run all previous cells first.")

Generating final HTML page...
Creating HTML page: lerobot_datasets_videos.html
HTML file created: lerobot_datasets_videos.html
Open in browser: file:///Users/michelmeyer/Library/CloudStorage/Dropbox/Dev/LeRobotLab/notebooks/lerobot_datasets_videos.html
Opened in browser automatically

Final Project Statistics:
  Datasets processed: 103
  Videos downloaded: 144
  HTML file: lerobot_datasets_videos.html


In [None]:
# Closing banner: one print call, one argument per output line.
# The dataset count falls back to the word 'all' when df_metrics was never created.
print(
    "PROJECT COMPLETE!",
    "=" * 50,
    "LeRobot Dataset Explorer successfully created!",
    f"Open lerobot_datasets_videos.html to explore {len(df_metrics) if 'df_metrics' in locals() else 'all'} datasets",
    f"Videos stored in the 'videos/' folder",
    "Happy robot learning!",
    "End of notebook execution.",
    sep="\n",
)