# Generate Metadata (Captions, Labels, Comments)

This notebook generates captions, labels, and social media-style comments for all generated images.

**Workshop**: AI/ML Pipeline - Synthetic Data Generation  
**Date**: January 23, 2026  
**Platform**: CyVerse Jupyter Lab PyTorch GPU

## What This Notebook Does

For each generated image:
1. Generates descriptive captions
2. Generates semantic and categorical labels  
3. Generates social media-style comments
4. Saves each piece of metadata keyed by the ID of the image it describes
5. Exports CSV summaries for analysis

## Setup and Imports

In [None]:
import sys
from pathlib import Path
import time
from datetime import datetime
from IPython.display import display
from tqdm.notebook import tqdm
from PIL import Image

# Put the parent of the current working directory at the front of sys.path
# (only once) so the project-local `src` package can be imported below.
parent_dir = Path.cwd().parent
parent_entry = str(parent_dir)
if parent_entry not in sys.path:
    sys.path.insert(0, parent_entry)

from src import config, gemini_client, output_handler

print("✓ All modules imported successfully")

## 1. Load Configuration and Find Images

In [None]:
# Read the pipeline configuration
cfg = config.load_config()

# Build the output handler from the configured output location and formats
output_dir = cfg.get_output_path()
handler = output_handler.OutputHandler(
    output_dir=output_dir,
    image_format=cfg.output['image_format'],
    metadata_format=cfg.output['metadata_format'],
    export_csv=cfg.output['export_csv_summaries'],
)

# Collect every file in the images directory that has the configured
# extension, in sorted order
image_files = sorted(handler.images_dir.glob(f"*.{cfg.output['image_format']}"))

print(f"Found {len(image_files)} images to process")
print(f"Images directory: {handler.images_dir}")

# Nothing to do without images: tell the user which notebook to run, then stop
if not image_files:
    print("\n⚠ No images found! Please run notebook 03_generate_images.ipynb first.")
    raise FileNotFoundError("No images to process")

## 2. Initialize Text Generator

Set up the Gemini API client, together with its rate limiter, for text generation.

In [None]:
# Read the rate-limit settings once; the per-minute value is reused in the
# summary printed at the end of this cell
rate_settings = cfg.rate_limiting
requests_per_minute = rate_settings['requests_per_minute']

# The limiter is handed to the text generator below
rate_limiter = gemini_client.RateLimiter(
    requests_per_minute=requests_per_minute,
    requests_per_day=rate_settings['requests_per_day'],
)

# Text generator built from the configured API key and the limiter above
text_generator = gemini_client.GeminiTextGenerator(
    api_key=cfg.api_key,
    rate_limiter=rate_limiter,
)

print("✓ Text generator initialized")
print(f"  Rate limit: {requests_per_minute} requests/minute")

## 3. Get Metadata Configuration

In [None]:
# Metadata-generation settings; later cells read their options from this mapping
metadata_config = cfg.metadata

divider = "=" * 80
label_category_list = ', '.join(metadata_config['label_categories'])

print("Metadata Generation Settings:")
print(divider)
print(f"Comments per image: {metadata_config['num_comments_per_image']}")
print(f"Include hashtags: {metadata_config['include_hashtags']}")
print(f"Include emojis: {metadata_config['include_emojis']}")
print(f"Label categories: {label_category_list}")
print(divider)

## 4. Generate Captions

Generate descriptive captions for all images.

In [None]:
print("\nGenerating captions...\n")

caption_errors = 0
start_time = time.time()

# Generate and save one caption per image. A failure on any single image is
# reported and counted, and the loop moves on to the next image.
for image_file in tqdm(image_files, desc="Captions"):
    try:
        # The filename stem is used as the image ID for all handler calls
        image_id = image_file.stem

        # Use the first 200 characters of the stored prompt (when metadata
        # exists) as context for the caption request
        original_metadata = handler.load_metadata(image_id)
        context = None
        if original_metadata:
            context = f"Social movement scene: {original_metadata.get('prompt', '')[:200]}"

        # Image.open() holds the file open; the context manager closes it as
        # soon as the caption is generated so handles do not accumulate over
        # a long run
        with Image.open(image_file) as img:
            caption = text_generator.generate_caption(img, context=context)

        # Save the caption together with the context string that was sent
        handler.save_caption(
            image_id=image_id,
            caption=caption,
            context={'original_prompt': context}
        )

    except Exception as e:
        # Deliberately broad: one bad image must not abort the whole batch
        caption_errors += 1
        print(f"\nError generating caption for {image_file.name}: {e}")

elapsed = time.time() - start_time
print(f"\n✓ Captions generated: {len(image_files) - caption_errors}/{len(image_files)}")
print(f"  Time: {int(elapsed//60)}m {int(elapsed%60)}s")
print(f"  Errors: {caption_errors}")

### Preview Sample Captions

In [None]:
import random
import json

# Pick at most three saved caption files at random for a quick spot check
caption_files = list(handler.captions_dir.glob("*_caption.json"))
sample_size = min(3, len(caption_files))
sample_captions = random.sample(caption_files, sample_size)

print("Sample Captions:")
print("=" * 80)

for caption_file in sample_captions:
    # Each caption file is a JSON document with 'image_id' and 'caption' keys
    with open(caption_file) as f:
        caption_data = json.load(f)

    print(f"\nImage: {caption_data['image_id']}")
    print(f"Caption: {caption_data['caption']}")
    print("-" * 80)

## 5. Generate Labels

Generate semantic and categorical labels for all images.

In [None]:
print("\nGenerating labels...\n")

label_errors = 0
start_time = time.time()

label_categories = metadata_config['label_categories']

# Generate and save labels for every image. A failure on any single image is
# reported and counted, and the loop moves on to the next image.
for image_file in tqdm(image_files, desc="Labels"):
    try:
        # The filename stem is used as the image ID for all handler calls
        image_id = image_file.stem

        # Use the first 200 characters of the stored prompt (when metadata
        # exists) as context for the labelling request
        original_metadata = handler.load_metadata(image_id)
        context = None
        if original_metadata:
            context = f"Social movement context: {original_metadata.get('prompt', '')[:200]}"

        # Image.open() holds the file open; the context manager closes it as
        # soon as the labels are generated so handles do not accumulate over
        # a long run
        with Image.open(image_file) as img:
            labels = text_generator.generate_labels(
                img,
                categories=label_categories,
                context=context
            )

        # Save the labels together with the categories that were requested
        handler.save_labels(
            image_id=image_id,
            labels=labels,
            context={'categories': label_categories}
        )

    except Exception as e:
        # Deliberately broad: one bad image must not abort the whole batch
        label_errors += 1
        print(f"\nError generating labels for {image_file.name}: {e}")

elapsed = time.time() - start_time
print(f"\n✓ Labels generated: {len(image_files) - label_errors}/{len(image_files)}")
print(f"  Time: {int(elapsed//60)}m {int(elapsed%60)}s")
print(f"  Errors: {label_errors}")

### Preview Sample Labels

In [None]:
# Pick at most three saved label files at random for a quick spot check
label_files = list(handler.labels_dir.glob("*_labels.json"))
sample_size = min(3, len(label_files))
sample_labels = random.sample(label_files, sample_size)

print("Sample Labels:")
print("=" * 80)

for label_file in sample_labels:
    # Each label file is a JSON document with 'image_id' and a 'labels' mapping
    with open(label_file) as f:
        label_data = json.load(f)

    print(f"\nImage: {label_data['image_id']}")
    print("Labels:")
    image_labels = label_data['labels']
    for category, label in image_labels.items():
        print(f"  {category}: {label}")
    print("-" * 80)

## 6. Generate Comments

Generate social media-style comments for all images.

In [None]:
print("\nGenerating comments...\n")

comment_errors = 0
start_time = time.time()

num_comments = metadata_config['num_comments_per_image']
include_hashtags = metadata_config['include_hashtags']
include_emojis = metadata_config['include_emojis']

# Generate and save comments for every image. A failure on any single image
# is reported and counted, and the loop moves on to the next image.
for image_file in tqdm(image_files, desc="Comments"):
    try:
        # The filename stem is used as the image ID for all handler calls
        image_id = image_file.stem

        # Use the theme stored under source_data -> atropia -> theme as
        # context, falling back to 'civic engagement' when any level is absent
        original_metadata = handler.load_metadata(image_id)
        context = None
        if original_metadata:
            theme = (
                original_metadata.get('source_data', {})
                .get('atropia', {})
                .get('theme', 'civic engagement')
            )
            context = f"Social movement image. Theme: {theme}"

        # Image.open() holds the file open; the context manager closes it as
        # soon as the comments are generated so handles do not accumulate
        # over a long run
        with Image.open(image_file) as img:
            comments = text_generator.generate_comments(
                img,
                num_comments=num_comments,
                include_hashtags=include_hashtags,
                include_emojis=include_emojis,
                context=context
            )

        # Save the comments together with the number that was requested
        handler.save_comments(
            image_id=image_id,
            comments=comments,
            context={'num_requested': num_comments}
        )

    except Exception as e:
        # Deliberately broad: one bad image must not abort the whole batch
        comment_errors += 1
        print(f"\nError generating comments for {image_file.name}: {e}")

elapsed = time.time() - start_time
print(f"\n✓ Comments generated: {len(image_files) - comment_errors}/{len(image_files)}")
print(f"  Time: {int(elapsed//60)}m {int(elapsed%60)}s")
print(f"  Errors: {comment_errors}")

### Preview Sample Comments

In [None]:
# Pick at most three saved comment files at random for a quick spot check
comment_files = list(handler.comments_dir.glob("*_comments.json"))
sample_size = min(3, len(comment_files))
sample_comments = random.sample(comment_files, sample_size)

print("Sample Comments:")
print("=" * 80)

for comment_file in sample_comments:
    # Each comment file is a JSON document with 'image_id' and a 'comments' list
    with open(comment_file) as f:
        comment_data = json.load(f)

    print(f"\nImage: {comment_data['image_id']}")
    print("Comments:")
    # Number the comments starting from 1 for display
    for position, comment in enumerate(comment_data['comments'], start=1):
        print(f"  {position}. {comment}")
    print("-" * 80)

## 7. Export CSV Summaries

Export all metadata to CSV files for easy analysis.

In [None]:
def _report_export(kind, csv_path):
    """Print a confirmation line if the export returned a truthy path; return the path."""
    if csv_path:
        print(f"✓ {kind} exported to: {csv_path}")
    return csv_path


print("Exporting CSV summaries...\n")

# Each export runs and is reported before the next one starts
captions_csv = _report_export("Captions", handler.export_captions_csv())
labels_csv = _report_export("Labels", handler.export_labels_csv())
comments_csv = _report_export("Comments", handler.export_comments_csv())

## 8. Check Completeness

Verify all images have complete metadata.

In [None]:
# Ask the output handler for its per-metadata-type counts and coverage figures
completeness = handler.check_completeness()

divider = "=" * 80

print("\nDataset Completeness Check:")
print(divider)
print(f"Total images: {completeness['total_images']}")
print("\nMetadata Coverage:")

# (display name, count key, coverage key) for each metadata type, in print order
coverage_rows = (
    ("Captions", 'images_with_captions', 'caption_coverage'),
    ("Labels", 'images_with_labels', 'label_coverage'),
    ("Comments", 'images_with_comments', 'comment_coverage'),
)
for title, count_key, coverage_key in coverage_rows:
    print(f"  {title}: {completeness[count_key]} ({completeness[coverage_key]:.1f}%)")

status_message = (
    "\n✓ Dataset is fully complete! All images have all metadata."
    if completeness['fully_complete']
    else "\n⚠ Some images are missing metadata. Check errors above."
)
print(status_message)

print(divider)

## Summary

Metadata generation complete! Your dataset now includes:
- Descriptive captions for each image
- Semantic and categorical labels
- Social media-style comments
- CSV exports for easy analysis

**Next Step**: Run notebook `05_quality_assurance.ipynb` to validate the complete dataset.