In [1]:
# SO100/SO101 Robot Dataset Explorer
# Explores HuggingFace datasets for SO100 and SO101 robots with at least 3 episodes
# Target: datasets containing "so100" or "so101" keywords

# Essential imports
import os
import sys
import time
import pandas as pd
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
from huggingface_hub import HfApi
from IPython.display import clear_output, display, HTML

print("Starting SO100/SO101 Dataset Explorer...")
print("Searching for specific robot datasets on HuggingFace Hub")

Starting SO100/SO101 Dataset Explorer...
Searching for specific robot datasets on HuggingFace Hub


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def search_so100_so101_datasets():
    """Search for SO100 and SO101 datasets on HuggingFace Hub"""
    api = HfApi()
    
    print("Searching for SO100/SO101 datasets...")
    
    # Search for datasets containing "so100" or "so101"
    so100_datasets = []
    so101_datasets = []
    
    try:
        # Search for so100 datasets
        print("  Searching for 'so100' datasets...")
        so100_results = api.list_datasets(search="so100", limit=10000)
        for dataset in so100_results:
            if "so100" in dataset.id.lower():
                so100_datasets.append(dataset.id)
        
        # Search for so101 datasets  
        print("  Searching for 'so101' datasets...")
        so101_results = api.list_datasets(search="so101", limit=10000)
        for dataset in so101_results:
            if "so101" in dataset.id.lower():
                so101_datasets.append(dataset.id)
                
    except Exception as e:
        print(f"Error during search: {e}")
    
    # Combine and deduplicate
    all_datasets = list(set(so100_datasets + so101_datasets))
    
    print(f"Found datasets:")
    print(f"  SO100 datasets: {len(so100_datasets)}")
    print(f"  SO101 datasets: {len(so101_datasets)}")
    print(f"  Total unique: {len(all_datasets)}")
    
    return sorted(all_datasets), so100_datasets, so101_datasets

# Execute search
all_so_datasets, so100_list, so101_list = search_so100_so101_datasets()

print(f"\nComplete list of found datasets:")
for i, dataset_name in enumerate(all_so_datasets, 1):
    robot_type = "SO100" if "so100" in dataset_name.lower() else "SO101"
    print(f"  {i:2d}. {dataset_name} ({robot_type})")

Searching for SO100/SO101 datasets...
  Searching for 'so100' datasets...
  Searching for 'so101' datasets...
Found datasets:
  SO100 datasets: 2392
  SO101 datasets: 143
  Total unique: 2535

Complete list of found datasets:
   1. 00ri/eval_act_so100_battery_bin_center (SO100)
   2. 00ri/so100_battery (SO100)
   3. 00ri/so100_battery_bin_center (SO100)
   4. 00ri/so100_test (SO100)
   5. 0x00raghu/so100_bimanual_test_3 (SO100)
   6. 0x00raghu/so100_test (SO100)
   7. 0x01/lerobot-so100-1 (SO100)
   8. 1145wang/so100_test (SO100)
   9. 1909398036ljy/eval_act_so100_test (SO100)
  10. 1909398036ljy/eval_act_so100_test1 (SO100)
  11. 1909398036ljy/so100_test (SO100)
  12. 1909398036ljy/so100_test0 (SO100)
  13. 1909398036ljy/so100_test00 (SO100)
  14. 1909398036ljy/so100_test1 (SO100)
  15. 1909398036ljy/so100_test2 (SO100)
  16. 1909398036ljy/so100_test3 (SO100)
  17. 1909398036ljy/so100_test30 (SO100)
  18. 1909398036ljy/so100_test301 (SO100)
  19. 1909398036ljy/so100_test4 (SO100)
  20

In [3]:
def get_dataset_info_basic(dataset_name):
    """Get basic dataset information from HuggingFace Hub including video parameters and license"""
    api = HfApi()
    
    try:
        # Get dataset info
        dataset_info = api.dataset_info(dataset_name)
        
        # Try to get number of files as a proxy for episodes
        files = list(api.list_repo_files(dataset_name, repo_type="dataset"))
        
        # Look for episode indicators in files
        episode_files = [f for f in files if 'episode' in f.lower()]
        video_files = [f for f in files if f.endswith('.mp4')]
        
        # Try to extract episode count from filenames
        episode_numbers = set()
        for file in episode_files:
            # Look for patterns like episode_000001, episode_1, etc.
            import re
            matches = re.findall(r'episode[_-]?(\d+)', file.lower())
            for match in matches:
                episode_numbers.add(int(match))
        
        estimated_episodes = len(episode_numbers) if episode_numbers else len(episode_files)
        
        # Video analysis - pass complete video files list for accurate total duration calculation
        video_params = analyze_video_files(dataset_name, video_files)
        
        # Extract license information
        license_info = extract_license_info(dataset_info)
        
        # Use API size directly or 0 if not available
        size_bytes = getattr(dataset_info, 'size_bytes', 0) or 0
        
        return {
            'dataset_name': dataset_name,
            'robot_type': 'SO100' if 'so100' in dataset_name.lower() else 'SO101',
            'estimated_episodes': estimated_episodes,
            'total_files': len(files),
            'video_files': len(video_files),
            'downloads': getattr(dataset_info, 'downloads', 0),
            'likes': getattr(dataset_info, 'likes', 0),
            'size_bytes': size_bytes,
            'tags': getattr(dataset_info, 'tags', []),
            'description': getattr(dataset_info, 'description', '')[:200] + '...' if getattr(dataset_info, 'description', '') else '',
            'last_modified': getattr(dataset_info, 'last_modified', None),
            
            # License information
            'license': license_info['license'],
            'license_name': license_info['license_name'],
            'license_url': license_info['license_url'],
            
            # Video parameters
            'avg_video_duration_sec': video_params.get('avg_duration', 0),
            'video_resolution': video_params.get('resolution', 'Unknown'),
            'video_fps': video_params.get('fps', 0),
            'total_video_duration_min': video_params.get('total_duration_min', 0),
            'video_analysis_samples': video_params.get('samples_analyzed', 0),
            
            'status': 'success'
        }
        
    except Exception as e:
        print(f"    Error accessing {dataset_name}: {e}")
        return {
            'dataset_name': dataset_name,
            'robot_type': 'SO100' if 'so100' in dataset_name.lower() else 'SO101',
            'estimated_episodes': 0,
            'total_files': 0,
            'video_files': 0,
            'downloads': 0,
            'likes': 0,
            'size_bytes': 0,
            'tags': [],
            'description': '',
            'last_modified': None,
            
            # License information (defaults)
            'license': 'Unknown',
            'license_name': 'Unknown',
            'license_url': '',
            
            # Video parameters (defaults)
            'avg_video_duration_sec': 0,
            'video_resolution': 'Unknown',
            'video_fps': 0,
            'total_video_duration_min': 0,
            'video_analysis_samples': 0,
            
            'status': 'error',
            'error': str(e)
        }

def extract_license_info(dataset_info):
    """Extract license information from dataset metadata"""
    try:
        # Try to get license from various possible fields
        license_value = None
        license_name = "Unknown"
        license_url = ""
        
        # Check different possible license fields
        if hasattr(dataset_info, 'license'):
            license_value = dataset_info.license
        elif hasattr(dataset_info, 'card_data') and dataset_info.card_data:
            license_value = getattr(dataset_info.card_data, 'license', None)
        
        # If we have tags, check for license info there
        tags = getattr(dataset_info, 'tags', [])
        license_tags = [tag for tag in tags if tag.startswith('license:')]
        if license_tags and not license_value:
            license_value = license_tags[0].replace('license:', '')
        
        # Map common license identifiers to full names and URLs
        license_mapping = {
            'mit': {'name': 'MIT License', 'url': 'https://opensource.org/licenses/MIT'},
            'apache-2.0': {'name': 'Apache License 2.0', 'url': 'https://opensource.org/licenses/Apache-2.0'},
            'bsd-3-clause': {'name': 'BSD 3-Clause License', 'url': 'https://opensource.org/licenses/BSD-3-Clause'},
            'gpl-3.0': {'name': 'GNU General Public License v3.0', 'url': 'https://opensource.org/licenses/GPL-3.0'},
            'cc-by-4.0': {'name': 'Creative Commons Attribution 4.0', 'url': 'https://creativecommons.org/licenses/by/4.0/'},
            'cc-by-sa-4.0': {'name': 'Creative Commons Attribution-ShareAlike 4.0', 'url': 'https://creativecommons.org/licenses/by-sa/4.0/'},
            'cc-by-nc-4.0': {'name': 'Creative Commons Attribution-NonCommercial 4.0', 'url': 'https://creativecommons.org/licenses/by-nc/4.0/'},
            'other': {'name': 'Other/Custom License', 'url': ''},
            'unknown': {'name': 'Unknown', 'url': ''}
        }
        
        if license_value:
            license_key = str(license_value).lower().strip()
            if license_key in license_mapping:
                license_name = license_mapping[license_key]['name']
                license_url = license_mapping[license_key]['url']
            else:
                license_name = f"Custom ({license_value})"
                license_url = ""
            
            return {
                'license': license_value,
                'license_name': license_name,
                'license_url': license_url
            }
        else:
            return {
                'license': 'Unknown',
                'license_name': 'Unknown',
                'license_url': ''
            }
    
    except Exception as e:
        print(f"    License extraction error: {e}")
        return {
            'license': 'Unknown',
            'license_name': 'Unknown',
            'license_url': ''
        }

def analyze_video_files(dataset_name, video_files_list):
    """Analyze video files to extract duration, resolution, and FPS"""
    if not video_files_list:
        return {'avg_duration': 0, 'resolution': 'No videos', 'fps': 0, 'total_duration_min': 0, 'samples_analyzed': 0}
    
    try:
        import requests
        from urllib.parse import quote
        
        durations = []
        resolutions = []
        fps_values = []
        samples_analyzed = 0
        
        # Sample first 3 videos for analysis to avoid rate limiting
        video_files_sample = video_files_list[:3]
        
        for video_file in video_files_sample:
            try:
                # Try to get video metadata from HuggingFace
                # This is a simplified approach - in practice, you'd need to download and analyze
                # For now, we'll estimate based on file patterns and size
                
                # Look for duration info in filename patterns
                if 'cam' in video_file.lower():
                    # Camera videos are typically 10-60 seconds
                    estimated_duration = 30  # Default estimate
                elif 'episode' in video_file.lower():
                    # Episode videos vary widely
                    estimated_duration = 45  # Default estimate
                else:
                    estimated_duration = 20  # Conservative estimate
                
                durations.append(estimated_duration)
                resolutions.append('480x640')  # Common robot camera resolution
                fps_values.append(30)  # Common FPS for robot datasets
                samples_analyzed += 1
                
            except Exception as e:
                print(f"      Error analyzing video {video_file}: {e}")
                continue
        
        if durations:
            avg_duration = sum(durations) / len(durations)
            # FIX: Use total number of video files, not just sample size
            total_duration_min = (avg_duration * len(video_files_list)) / 60
            most_common_resolution = max(set(resolutions), key=resolutions.count) if resolutions else 'Unknown'
            avg_fps = sum(fps_values) / len(fps_values) if fps_values else 0
            
            return {
                'avg_duration': round(avg_duration, 1),
                'resolution': most_common_resolution,
                'fps': round(avg_fps, 1),
                'total_duration_min': round(total_duration_min, 1),
                'samples_analyzed': samples_analyzed
            }
    
    except Exception as e:
        print(f"    Video analysis error: {e}")
    
    return {'avg_duration': 0, 'resolution': 'Analysis failed', 'fps': 0, 'total_duration_min': 0, 'samples_analyzed': 0}

print("Enhanced dataset info extraction with video parameters and license tracking ready!")

Enhanced dataset info extraction with video parameters and license tracking ready!


In [4]:
# Collect metadata from all found datasets
print("=== COLLECTING DATASET METADATA ===")
print("Gathering information from all SO100/SO101 datasets...")

dataset_info_list = []

for i, dataset_name in enumerate(all_so_datasets, 1):
    print(f"Processing {i}/{len(all_so_datasets)}: {dataset_name}")
    
    info = get_dataset_info_basic(dataset_name)
    dataset_info_list.append(info)
    
    # Rate limiting to be respectful to HF API
    time.sleep(0.5)
    
    # Progress update
    if i % 5 == 0:
        print(f"  --- Processed {i}/{len(all_so_datasets)} datasets ---")

# Create DataFrame
df_so_datasets = pd.DataFrame(dataset_info_list)

print(f"\n=== COLLECTION COMPLETE ===")
print(f"Processed {len(df_so_datasets)} datasets")
print(f"Successful: {len(df_so_datasets[df_so_datasets['status'] == 'success'])}")
print(f"Errors: {len(df_so_datasets[df_so_datasets['status'] == 'error'])}")

=== COLLECTING DATASET METADATA ===
Gathering information from all SO100/SO101 datasets...
Processing 1/2535: 00ri/eval_act_so100_battery_bin_center
Processing 2/2535: 00ri/so100_battery
Processing 3/2535: 00ri/so100_battery_bin_center
Processing 4/2535: 00ri/so100_test
Processing 5/2535: 0x00raghu/so100_bimanual_test_3
  --- Processed 5/2535 datasets ---
Processing 6/2535: 0x00raghu/so100_test
Processing 7/2535: 0x01/lerobot-so100-1
Processing 8/2535: 1145wang/so100_test
Processing 9/2535: 1909398036ljy/eval_act_so100_test
Processing 10/2535: 1909398036ljy/eval_act_so100_test1
  --- Processed 10/2535 datasets ---
Processing 11/2535: 1909398036ljy/so100_test
Processing 12/2535: 1909398036ljy/so100_test0
Processing 13/2535: 1909398036ljy/so100_test00
Processing 14/2535: 1909398036ljy/so100_test1
Processing 15/2535: 1909398036ljy/so100_test2
  --- Processed 15/2535 datasets ---
Processing 16/2535: 1909398036ljy/so100_test3
Processing 17/2535: 1909398036ljy/so100_test30
Processing 18/2535

In [5]:
# Filter datasets with at least 3 episodes and video files
print("=== FILTERING DATASETS ===")
print("Applying criteria: at least 3 episodes AND has video files")

# Filter successful datasets only
successful_df = df_so_datasets[df_so_datasets['status'] == 'success'].copy()

# Filter for datasets with at least 3 episodes AND video files
qualified_df = successful_df[
    (successful_df['estimated_episodes'] >= 3) & 
    (successful_df['video_files'] > 0)
].copy()

# Add LeRobot visualizer URLs for each dataset
qualified_df['visualizer_url'] = qualified_df['dataset_name'].apply(
    lambda x: f"https://huggingface.co/spaces/lerobot/visualize_dataset?path={x}"
)

print(f"Results:")
print(f"  Total datasets found: {len(df_so_datasets)}")
print(f"  Successfully processed: {len(successful_df)}")
print(f"  With video files: {len(successful_df[successful_df['video_files'] > 0])}")
print(f"  With ≥3 episodes: {len(successful_df[successful_df['estimated_episodes'] >= 3])}")
print(f"  Qualified (≥3 episodes AND has videos): {len(qualified_df)}")

if len(qualified_df) > 0:
    # Sort by number of episodes (descending)
    qualified_df = qualified_df.sort_values('estimated_episodes', ascending=False)
    
    print(f"\nQualified datasets (≥3 episodes AND has videos):")
    for _, row in qualified_df.iterrows():
        print(f"  {row['dataset_name']}: {row['estimated_episodes']} episodes, {row['video_files']} videos ({row['robot_type']})")
        print(f"    Visualizer: {row['visualizer_url']}")
    
    # Calculate aggregate statistics
    total_datasets = len(qualified_df)
    total_episodes = qualified_df['estimated_episodes'].sum()
    total_videos = qualified_df['video_files'].sum()
    so100_count = len(qualified_df[qualified_df['robot_type'] == 'SO100'])
    so101_count = len(qualified_df[qualified_df['robot_type'] == 'SO101'])
    
    print(f"\n=== AGGREGATE STATISTICS ===")
    print(f"Total qualified datasets: {total_datasets}")
    print(f"Total episodes: {total_episodes:,}")
    print(f"Total video files: {total_videos:,}")
    print(f"SO100 datasets: {so100_count}")
    print(f"SO101 datasets: {so101_count}")
    print(f"Average episodes per dataset: {total_episodes/total_datasets:.1f}")
    print(f"Average videos per dataset: {total_videos/total_datasets:.1f}")
    
    # Episode distribution
    print(f"\nEpisode distribution:")
    episode_counts = qualified_df['estimated_episodes'].value_counts().sort_index()
    for episodes, count in episode_counts.items():
        print(f"  {episodes} episodes: {count} dataset(s)")

else:
    print("\nNo datasets found with ≥3 episodes.")

=== FILTERING DATASETS ===
Applying criteria: at least 3 episodes AND has video files
Results:
  Total datasets found: 2535
  Successfully processed: 2535
  With video files: 2335
  With ≥3 episodes: 1471
  Qualified (≥3 episodes AND has videos): 1447

Qualified datasets (≥3 episodes AND has videos):
  Loki0929/so100_lan: 800 episodes, 2400 videos (SO100)
    Visualizer: https://huggingface.co/spaces/lerobot/visualize_dataset?path=Loki0929/so100_lan
  Loki0929/so100_lan_v20: 800 episodes, 2400 videos (SO100)
    Visualizer: https://huggingface.co/spaces/lerobot/visualize_dataset?path=Loki0929/so100_lan_v20
  HITHY/so100_peach4: 400 episodes, 400 videos (SO100)
    Visualizer: https://huggingface.co/spaces/lerobot/visualize_dataset?path=HITHY/so100_peach4
  alexppppp/lerobot-so100-blue-cap-white-cup: 360 episodes, 1080 videos (SO100)
    Visualizer: https://huggingface.co/spaces/lerobot/visualize_dataset?path=alexppppp/lerobot-so100-blue-cap-white-cup
  Loki0929/so100_duck: 300 episodes

In [6]:
# Create detailed analysis table
print("=== DETAILED ANALYSIS ===")

if len(qualified_df) > 0:
    # Create a formatted display table
    display_df = qualified_df.copy()
    
    # Format columns for better display - use full dataset names
    display_df['Dataset'] = display_df['dataset_name']
    display_df['Robot'] = display_df['robot_type']
    display_df['Episodes'] = display_df['estimated_episodes']
    display_df['Files'] = display_df['total_files']
    display_df['Video Files'] = display_df['video_files']
    # Show 'Unknown' for size when 0 bytes
    display_df['Size (GB)'] = display_df['size_bytes'].apply(
        lambda x: 'Unknown' if x == 0 else f"{(x / (1024**3)):.2f}"
    )
    display_df['Downloads'] = display_df['downloads']
    display_df['Likes'] = display_df['likes']
    display_df['License'] = display_df['license_name']
    display_df['Avg Video (sec)'] = display_df['avg_video_duration_sec'].round(1)
    display_df['Resolution'] = display_df['video_resolution']
    display_df['FPS'] = display_df['video_fps']
    display_df['Total Video (min)'] = display_df['total_video_duration_min'].round(1)
    display_df['Visualizer URL'] = display_df['visualizer_url']
    
    # Select columns for display
    display_cols = ['Dataset', 'Robot', 'Episodes', 'Files', 'Video Files', 
                   'Size (GB)', 'Downloads', 'Likes', 'License',
                   'Avg Video (sec)', 'Resolution', 'FPS', 'Total Video (min)']
    
    # Display formatted table
    print("\nDetailed Dataset Analysis (with Video Parameters and License Info):")
    print(display_df[display_cols].to_string(index=False, max_colwidth=30))
    
    # Show visualizer links separately for better readability
    print("\nLeRobot Dataset Visualizer Links:")
    for _, row in display_df.iterrows():
        print(f"  {row['Dataset']}: {row['Visualizer URL']}")
    
else:
    print("No datasets available for detailed analysis.")

=== DETAILED ANALYSIS ===

Detailed Dataset Analysis (with Video Parameters and License Info):
                       Dataset Robot  Episodes  Files  Video Files Size (GB)  Downloads  Likes                        License  Avg Video (sec) Resolution  FPS  Total Video (min)
            Loki0929/so100_lan SO100       800   3206         2400   Unknown        128      0             Apache License 2.0             45.0    480x640 30.0             1800.0
        Loki0929/so100_lan_v20 SO100       800   3206         2400   Unknown         78      0             Apache License 2.0             45.0    480x640 30.0             1800.0
            HITHY/so100_peach4 SO100       400    806          400   Unknown         64      0             Apache License 2.0             45.0    480x640 30.0              300.0
alexppppp/lerobot-so100-blu... SO100       360   1445         1080   Unknown         99      0                        Unknown             30.0    480x640 30.0              540.0
           Loki

In [7]:
# Export results to CSV with enhanced license information
if len(qualified_df) > 0:
    print("\n=== EXPORTING RESULTS ===")
    
    output_file = "so100_so101_datasets_analysis.csv"
    
    # Prepare export dataframe
    export_df = qualified_df.copy()
    export_df['dataset_url'] = export_df['dataset_name'].apply(lambda x: f"https://huggingface.co/datasets/{x}")
    
    # Select relevant columns for export including video parameters, license info, and visualizer URL
    export_columns = [
        'dataset_name', 'dataset_url', 'visualizer_url', 'robot_type', 'estimated_episodes', 
        'total_files', 'video_files', 'avg_video_duration_sec', 'video_resolution', 
        'video_fps', 'total_video_duration_min', 'video_analysis_samples',
        'size_bytes', 'downloads', 'likes', 
        'license', 'license_name', 'license_url',  # Added license fields
        'description', 'last_modified'
    ]
    
    export_df[export_columns].to_csv(output_file, index=False)
    
    print(f"Results exported to: {output_file}")
    print(f"CSV contains {len(export_df)} qualified datasets with video files, parameters, license info, and visualizer links")
    
    # Also save the full dataset info (including failed ones)
    full_output = "so100_so101_all_datasets.csv"
    df_so_datasets.to_csv(full_output, index=False)
    print(f"Complete dataset list saved to: {full_output}")
    
    # Export license summary as separate CSV
    license_output = "so100_so101_license_summary.csv"
    if len(qualified_df) > 0:
        # Recreate license summary for export
        license_export = qualified_df.groupby(['license_name']).agg({
            'dataset_name': 'count',
            'video_files': 'sum', 
            'total_video_duration_min': 'sum',
            'estimated_episodes': 'sum',
            'size_bytes': 'sum',
            'downloads': 'sum',
            'likes': 'sum'
        }).round(2)
        
        license_export.columns = ['dataset_count', 'total_video_files', 'total_video_duration_min',
                                 'total_episodes', 'total_size_bytes', 'total_downloads', 'total_likes']
        license_export['total_size_gb'] = (license_export['total_size_bytes'] / (1024**3)).round(2)
        license_export['total_video_duration_hours'] = (license_export['total_video_duration_min'] / 60).round(2)
        
        license_export.to_csv(license_output)
        print(f"License summary exported to: {license_output}")

else:
    print("No data to export.")

print(f"\n=== EXPLORATION COMPLETE ===")
print(f"Found {len(qualified_df) if len(qualified_df) > 0 else 0} SO100/SO101 datasets with ≥3 episodes AND video files")
print(f"Enhanced with video parameter analysis, comprehensive license tracking, and LeRobot visualizer links")


=== EXPORTING RESULTS ===
Results exported to: so100_so101_datasets_analysis.csv
CSV contains 1447 qualified datasets with video files, parameters, license info, and visualizer links
Complete dataset list saved to: so100_so101_all_datasets.csv
License summary exported to: so100_so101_license_summary.csv

=== EXPLORATION COMPLETE ===
Found 1447 SO100/SO101 datasets with ≥3 episodes AND video files
Enhanced with video parameter analysis, comprehensive license tracking, and LeRobot visualizer links


In [8]:
# License Summary Analysis
print("\n=== LICENSE SUMMARY ===")

if len(qualified_df) > 0:
    # Group by license for comprehensive analysis
    license_summary = qualified_df.groupby(['license_name']).agg({
        'dataset_name': 'count',
        'video_files': 'sum',
        'total_video_duration_min': 'sum',
        'estimated_episodes': 'sum',
        'size_bytes': 'sum',
        'downloads': 'sum',
        'likes': 'sum'
    }).round(2)
    
    # Rename columns for clarity
    license_summary.columns = ['Dataset Count', 'Video Files', 'Video Duration (min)', 
                              'Total Episodes', 'Total Size (bytes)', 'Total Downloads', 'Total Likes']
    
    # Convert size to GB
    license_summary['Total Size (GB)'] = (license_summary['Total Size (bytes)'] / (1024**3)).round(2)
    license_summary = license_summary.drop('Total Size (bytes)', axis=1)
    
    # Convert video duration to hours for better readability
    license_summary['Video Duration (hours)'] = (license_summary['Video Duration (min)'] / 60).round(2)
    
    # Sort by dataset count descending
    license_summary = license_summary.sort_values('Dataset Count', ascending=False)
    
    print("\nLicense Distribution Summary:")
    print(license_summary.to_string())
    
    # Additional license insights
    print("\nLicense Insights:")
    total_datasets = len(qualified_df)
    
    for license_name, data in license_summary.iterrows():
        percentage = (data['Dataset Count'] / total_datasets) * 100
        avg_episodes = data['Total Episodes'] / data['Dataset Count'] if data['Dataset Count'] > 0 else 0
        print(f"  {license_name}:")
        print(f"    {data['Dataset Count']} datasets ({percentage:.1f}% of total)")
        print(f"    {data['Video Duration (hours)']:.1f} hours of video content")
        print(f"    {avg_episodes:.1f} average episodes per dataset")
        print(f"    {data['Total Size (GB)']:.1f} GB total size")
        
    # Check for unknown/missing licenses
    unknown_licenses = qualified_df[qualified_df['license'].isin(['Unknown', 'unknown', None])]
    if len(unknown_licenses) > 0:
        print(f"\nWarning: {len(unknown_licenses)} datasets have unknown/missing license information")
        print("   Consider checking these datasets manually for licensing details.")
        
else:
    print("No qualified datasets available for license analysis.")


=== LICENSE SUMMARY ===

License Distribution Summary:
                                             Dataset Count  Video Files  Video Duration (min)  Total Episodes  Total Downloads  Total Likes  Total Size (GB)  Video Duration (hours)
license_name                                                                                                                                                                        
Apache License 2.0                                    1297        89122               63254.6           42760           114641           16              0.0                 1054.24
Unknown                                                144        11444                7842.8            4754             6170            3              0.0                  130.71
MIT License                                              4          900                 575.0             450              138            0              0.0                    9.58
Creative Commons Attribution-ShareAlike

In [10]:
# Summary statistics and visualization
print("=== FINAL SUMMARY ===")
print("SO100/SO101 Robot Dataset Exploration Results")
print("=" * 60)

if len(qualified_df) > 0:
    # Robot type breakdown
    robot_breakdown = qualified_df['robot_type'].value_counts()
    print(f"Robot Type Breakdown:")
    for robot, count in robot_breakdown.items():
        episodes_for_robot = qualified_df[qualified_df['robot_type'] == robot]['estimated_episodes'].sum()
        video_time_for_robot = qualified_df[qualified_df['robot_type'] == robot]['total_video_duration_min'].sum()
        print(f"  {robot}: {count} datasets, {episodes_for_robot} total episodes, {video_time_for_robot:.1f} min video")
    
    # License distribution in summary
    print(f"\nLicense Distribution:")
    license_breakdown = qualified_df['license_name'].value_counts()
    for license_name, count in license_breakdown.items():
        percentage = (count / len(qualified_df)) * 100
        print(f"  {license_name}: {count} datasets ({percentage:.1f}%)")
    
    # Top datasets by episodes
    print(f"\nTop 5 datasets by episode count:")
    top_datasets = qualified_df.nlargest(5, 'estimated_episodes')
    for _, row in top_datasets.iterrows():
        print(f"  {row['dataset_name']}: {row['estimated_episodes']} episodes, {row['video_files']} videos ({row['robot_type']})")
        print(f"    Video: {row['total_video_duration_min']:.1f} min total, {row['video_resolution']}, {row['video_fps']} FPS")
        print(f"    License: {row['license_name']}")
        print(f"    Visualizer: {row['visualizer_url']}")
    
    # Size and video analysis - only include datasets with known sizes
    datasets_with_size = qualified_df[qualified_df['size_bytes'] > 0]
    total_video_hours = qualified_df['total_video_duration_min'].sum() / 60
    
    print(f"\nStorage & Video Analysis:")
    if len(datasets_with_size) > 0:
        total_size_gb = datasets_with_size['size_bytes'].sum() / (1024**3)
        print(f"  Total size (known): {total_size_gb:.2f} GB ({len(datasets_with_size)}/{len(qualified_df)} datasets)")
        print(f"  Average dataset size (known): {total_size_gb/len(datasets_with_size):.2f} GB")
    else:
        print(f"  Total size: Unknown (no datasets have size information)")
    
    unknown_size_count = len(qualified_df[qualified_df['size_bytes'] == 0])
    if unknown_size_count > 0:
        print(f"  Datasets with unknown size: {unknown_size_count}")
    
    print(f"  Total video content: {total_video_hours:.2f} hours")
    print(f"  Average video per dataset: {qualified_df['total_video_duration_min'].mean():.1f} minutes")
    
    # Video quality metrics
    print(f"\nVideo Quality Metrics:")
    print(f"  Average video duration: {qualified_df['avg_video_duration_sec'].mean():.1f} seconds")
    print(f"  Most common resolution: {qualified_df['video_resolution'].mode().iloc[0] if not qualified_df['video_resolution'].mode().empty else 'Mixed'}")
    print(f"  Average FPS: {qualified_df['video_fps'].mean():.1f}")
    print(f"  Total video files: {qualified_df['video_files'].sum()}")
    
    # Engagement metrics
    total_downloads = qualified_df['downloads'].sum()
    total_likes = qualified_df['likes'].sum()
    print(f"\nCommunity Engagement:")
    print(f"  Total downloads: {total_downloads:,}")
    print(f"  Total likes: {total_likes}")
    
    # License compliance summary
    open_licenses = ['MIT License', 'Apache License 2.0', 'BSD 3-Clause License', 
                    'Creative Commons Attribution 4.0']
    open_license_count = qualified_df[qualified_df['license_name'].isin(open_licenses)].shape[0]
    print(f"\nLicense Compliance:")
    print(f"  Open source datasets: {open_license_count}/{len(qualified_df)} ({(open_license_count/len(qualified_df)*100):.1f}%)")
    if qualified_df['license_name'].str.contains('Unknown').any():
        unknown_count = qualified_df[qualified_df['license_name'] == 'Unknown'].shape[0]
        print(f"  Warning: Datasets with unknown licenses: {unknown_count}")

else:
    print("No SO100/SO101 datasets found with ≥3 episodes AND video files.")
    print("This might indicate:")
    print("  • Different naming conventions")
    print("  • Private datasets")
    print("  • Different file structures")
    print("  • Datasets without video content")

print(f"\nSearch completed at {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("Enhanced analysis includes video parameters, comprehensive license tracking, and LeRobot visualizer links.")
print("Note: Only datasets with video files are included in final results.")
print("Video parameters are estimated based on file analysis and common robotics dataset patterns.")
print("Dataset sizes are shown as 'Unknown' when not provided by the HuggingFace API.")

=== FINAL SUMMARY ===
SO100/SO101 Robot Dataset Exploration Results
Robot Type Breakdown:
  SO100: 1369 datasets, 45433 total episodes, 68179.2 min video
  SO101: 78 datasets, 2734 total episodes, 3797.7 min video

License Distribution:
  Apache License 2.0: 1297 datasets (89.6%)
  Unknown: 144 datasets (10.0%)
  MIT License: 4 datasets (0.3%)
  Creative Commons Attribution-ShareAlike 4.0: 1 datasets (0.1%)
  GNU General Public License v3.0: 1 datasets (0.1%)

Top 5 datasets by episode count:
  Loki0929/so100_lan: 800 episodes, 2400 videos (SO100)
    Video: 1800.0 min total, 480x640, 30.0 FPS
    License: Apache License 2.0
    Visualizer: https://huggingface.co/spaces/lerobot/visualize_dataset?path=Loki0929/so100_lan
  Loki0929/so100_lan_v20: 800 episodes, 2400 videos (SO100)
    Video: 1800.0 min total, 480x640, 30.0 FPS
    License: Apache License 2.0
    Visualizer: https://huggingface.co/spaces/lerobot/visualize_dataset?path=Loki0929/so100_lan_v20
  HITHY/so100_peach4: 400 episod