# Experiment Metadata Utilities Demo

This notebook demonstrates how to use the experiment metadata utilities for robust image path lookup and metadata queries.

In [2]:
# Add utils directory to path
import sys
from pathlib import Path
sys.path.append(str(Path('/net/trapnell/vol1/home/mdcolon/proj/morphseq/segmentation_sandbox/scripts/utils')))

# Import our utilities
from experiment_metadata_utils import (
    load_experiment_metadata,
    get_image_id_path,
    get_image_path_fast,
    verify_image_exists,
    find_video_for_image,
    get_metadata_summary,
    list_all_image_ids
)

In [4]:
# Location of the experiment metadata JSON used by every lookup below
metadata_path = Path("/net/trapnell/vol1/home/mdcolon/proj/morphseq/segmentation_sandbox/data/raw_data_organized/experiment_metadata.json")

# Check up front that the file is reachable before any lookup relies on it
metadata_file_found = metadata_path.exists()
print(f"Metadata file exists: {metadata_file_found}")

Metadata file exists: True


In [5]:
# Print every summary field; the experiment id list is truncated to a
# five-item preview so the output stays short
summary = get_metadata_summary(metadata_path)
print("Metadata Summary:")
for field_name, field_value in summary.items():
    if key_is_id_list := (field_name == "experiment_ids"):
        rendered = f"{field_value[:5]}... (showing first 5)"
    else:
        rendered = f"{field_value}"
    print(f"  {field_name}: {rendered}")

Metadata Summary:
  total_experiments: 16
  total_videos: 1391
  total_images: 84226
  experiment_ids: ['20240812', '20231110', '20231206', '20231218', '20240306']... (showing first 5)
  creation_time: 2025-06-27T14:45:58.619300
  script_version: 01_prepare_videos.py


## Method 1: Single Image Lookup (loads metadata each time)

Good for one-off queries or when working with just a few images.

In [6]:
# Look up one known image_id, passing the metadata *path* so the
# metadata is loaded again on each call (Method 1)
test_image_id = "20241023_A01_0000"

try:
    image_path = get_image_id_path(test_image_id, metadata_path)
    print(f"Image path: {image_path}")
    print(f"File exists: {image_path.exists()}")

    # The video record (when one is returned) carries the video id and
    # the list of image ids belonging to that video
    video_info = find_video_for_image(test_image_id, metadata_path)
    if video_info:
        print(f"Video ID: {video_info['video_id']}")
        print(f"Total images in video: {len(video_info['image_ids'])}")

except (ValueError, FileNotFoundError) as lookup_error:
    # ValueError is reported as an unknown image_id, FileNotFoundError as a
    # missing file on disk
    if isinstance(lookup_error, ValueError):
        print(f"Image not found: {lookup_error}")
    else:
        print(f"File missing: {lookup_error}")

Image path: /net/trapnell/vol1/home/mdcolon/proj/morphseq/segmentation_sandbox/data/raw_data_organized/20241023/images/20241023_A01/20241023_A01_0000.jpg
File exists: True
Video ID: 20241023_A01
Total images in video: 43


## Method 2: Batch Processing (loads metadata once)

More efficient when working with many images - loads metadata once and reuses it.

In [7]:
# Read the metadata a single time; later cells pass this object instead
# of the path so nothing has to be reloaded per image
metadata = load_experiment_metadata(metadata_path)
all_image_ids = metadata['image_ids']
print(f"Metadata loaded. Total images: {len(all_image_ids)}")

# Keep the first 10 image_ids as a small sample for the demos below
sample_image_ids = all_image_ids[:10]
sample_size = len(sample_image_ids)
print(f"\nTesting with {sample_size} sample images...")

Metadata loaded. Total images: 84226

Testing with 10 sample images...


In [8]:
# Resolve every sample image against the already-loaded metadata and
# collect one record per image, whether the lookup succeeded or not
results = []

for image_id in sample_image_ids:
    try:
        image_path = get_image_id_path(image_id, metadata)
        exists = image_path.exists()
        record = {
            'image_id': image_id,
            'path': str(image_path),
            'exists': exists,
        }
    except (ValueError, FileNotFoundError) as e:
        # Failed lookups are kept so they show up in the report and the error count
        record = {
            'image_id': image_id,
            'path': None,
            'exists': False,
            'error': str(e),
        }
    results.append(record)

# Report only the first five records to keep the output short
print(f"Processed {len(results)} images:")
for result in results[:5]:
    if 'error' not in result:
        mark = '✓' if result['exists'] else '✗'
        print(f"  {result['image_id']}: {mark} {Path(result['path']).name}")
    else:
        print(f"  {result['image_id']}: ERROR - {result['error']}")

# Totals across all processed records, not just the five shown
existing_count = len([r for r in results if r['exists']])
error_count = len([r for r in results if 'error' in r])
print(f"\nSummary: {existing_count}/{len(results)} files exist, {error_count} errors")

Processed 10 images:
  20240812_E12_0000: ✓ 20240812_E12_0000.jpg
  20240812_E12_0001: ✓ 20240812_E12_0001.jpg
  20240812_E12_0002: ✓ 20240812_E12_0002.jpg
  20240812_E12_0003: ✓ 20240812_E12_0003.jpg
  20240812_E12_0004: ✓ 20240812_E12_0004.jpg

Summary: 10/10 files exist, 0 errors


## Error Handling Demo

Test how the utility handles invalid image_ids.

In [None]:
# image_ids that are expected to fail lookup, each for a different reason
invalid_image_ids = [
    "20241023_A01_9999",  # well-formed id, but no such image
    "20999999_Z99_0000",  # experiment that does not exist
    "invalid_format",     # not a well-formed id at all
]

print("Testing error handling:")
for invalid_id in invalid_image_ids:
    try:
        path = get_image_id_path(invalid_id, metadata)
        print(f"  {invalid_id}: ✓ Found at {path}")
    except Exception as e:
        # Label the failure by exception type; anything unexpected is
        # reported as "Other error" rather than aborting the demo
        if isinstance(e, ValueError):
            label = "ValueError"
        elif isinstance(e, FileNotFoundError):
            label = "FileNotFoundError"
        else:
            label = "Other error"
        print(f"  {invalid_id}: ✗ {label} - {e}")

## Performance Comparison

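Compare the performance of three approaches: robust lookup that reloads the metadata on every call, robust lookup that reuses metadata loaded once, and fast path construction with `get_image_path_fast()`.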

In [9]:
import time

# Get a sample of image_ids for timing test
test_ids = sample_image_ids[:5]


def _time_lookups(lookup, image_ids):
    """Time calling ``lookup(image_id)`` once for each id in ``image_ids``.

    Args:
        lookup: Callable taking a single image_id.
        image_ids: Iterable of image_ids to resolve.

    Returns:
        Tuple ``(elapsed_seconds, failure_count)``. Failures are counted
        rather than silently discarded, because a lookup that raises
        immediately would otherwise look like a very fast method.
    """
    failures = 0
    # perf_counter is monotonic and high-resolution; time.time() can jump
    # and may be too coarse for sub-millisecond intervals
    start = time.perf_counter()
    for image_id in image_ids:
        try:
            lookup(image_id)
        except Exception:
            # Keep the benchmark best-effort, but do not swallow
            # KeyboardInterrupt/SystemExit as a bare `except:` would
            failures += 1
    return time.perf_counter() - start, failures


def _speedup(baseline_seconds, candidate_seconds):
    """Return baseline/candidate, or inf when the candidate time measured as zero."""
    if candidate_seconds > 0:
        return baseline_seconds / candidate_seconds
    return float("inf")


# Time Method 1: Loading metadata each time (metadata *path* is passed)
method1_time, method1_failures = _time_lookups(
    lambda image_id: get_image_id_path(image_id, metadata_path), test_ids
)

# Time Method 2: Using loaded metadata
method2_time, method2_failures = _time_lookups(
    lambda image_id: get_image_id_path(image_id, metadata), test_ids
)

# Time Method 3: Fast path construction
method3_time, method3_failures = _time_lookups(get_image_path_fast, test_ids)

print(f"Performance comparison for {len(test_ids)} images:")
print(f"  Method 1 (load metadata each time): {method1_time:.4f}s")
print(f"  Method 2 (use loaded metadata):     {method2_time:.4f}s")
print(f"  Method 3 (fast path construction):  {method3_time:.4f}s")

# Surface failed lookups so a misleading timing is not taken at face value
for method_label, failure_count in (
    ("Method 1", method1_failures),
    ("Method 2", method2_failures),
    ("Method 3", method3_failures),
):
    if failure_count:
        print(f"  WARNING: {method_label} raised for {failure_count}/{len(test_ids)} images; its timing is not representative")

print(f"\nSpeedup with loaded metadata: {_speedup(method1_time, method2_time):.1f}x")
print(f"Fast construction speedup:    {_speedup(method1_time, method3_time):.1f}x")

Performance comparison for 5 images:
  Method 1 (load metadata each time): 0.1056s
  Method 2 (use loaded metadata):     0.0003s
  Method 3 (fast path construction):  0.0001s

Speedup with loaded metadata: 317.0x
Fast construction speedup:    902.0x


## Recommendation

- **For single queries**: Use `get_image_id_path(image_id, metadata_path)` - simple and robust
- **For batch processing**: Load metadata once with `load_experiment_metadata()`, then use `get_image_id_path(image_id, metadata)` for each image - much faster
- **For maximum speed**: Use `get_image_path_fast()` if you're confident about the directory structure and don't need metadata validation
- **Always handle errors**: Wrap calls in try/except blocks to handle missing images gracefully