# 📄 Tropy API Multimodal Analysis & Summarization Notebook

This notebook offers an integrated, step-by-step workflow for analyzing Tropy collections with **multimodal large language models (MMLMs)**.

## 🤖 Multi-Model Support
- **Google Gemini** (gemini-1.5-flash) 
- **OpenAI GPT-4** (gpt-4o-mini) 
- **Anthropic Claude** (claude-3.5-sonnet)
- Easy API key configuration with secure `.env` storage

## 🗂️ Main Features
- Connects to Tropy’s local API to extract item and photo metadata
- Loads transcriptions (when available) and image files
- Generates **photo-level summaries**, prioritizing transcriptions while leveraging image context
- Creates **item-level summaries** by synthesizing photo-level content
- Optionally produces **semantic embeddings** for advanced search, clustering, or recommendation tasks
- Writes results back to Tropy as notes, and also saves outputs to local JSON files
- Supports batch checkpointing and resume to handle large collections

## ⚡ Inspiration
This workflow builds on and adapts ideas from Taylor Arnold and Lauren Tilton’s [Explainable Search and Discovery of Visual Cultural Heritage Collections with Multimodal Large Language Models](https://2024.computational-humanities-research.org/papers/paper28/). It was developed for the DH2025 workshop *Transcribing the Vatican Archives: Contextualization, Limits, and Opportunities*, led by Anita Lucchesi and Sean Takats.

## 1. Setup and Configuration

In [None]:
# Cell 1: Import Required Libraries

import os
import json
import time
import requests
import numpy as np
from pathlib import Path
from PIL import Image
from io import BytesIO
from tqdm.notebook import tqdm
import pandas as pd
from pdf2image import convert_from_path
from dotenv import load_dotenv
import base64
from typing import Optional, List, Dict, Any
import logging
from datetime import datetime

# Configure logging
# Root logging is set to INFO, each record formatted as "<time> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger referenced by the classes and helpers in the cells below.
logger = logging.getLogger(__name__)

print("✅ All dependencies imported successfully")

In [None]:
# Cell 2: Configure API Keys and Select Model

print("🔧 Setting up AI Model Providers...\n")

# Load environment variables
# (API keys are looked up with os.getenv() further down in this cell)
load_dotenv()

# Check and configure API keys
# Filled below with (provider_key, display_label) tuples, one per provider
# whose key is present and whose client library imports successfully.
providers_available = []

# Helper function to add/check API key
def check_api_key(provider_name, env_var, display_name):
    """Return True if an API key for a provider is available, prompting if needed.

    If ``env_var`` is already set to a non-empty value nothing is asked.
    Otherwise the user may paste a key, which is appended to ``.env`` and
    exported into the current process environment.

    Args:
        provider_name: Human-readable provider name used in the prompts.
        env_var: Name of the environment variable holding the key.
        display_name: Model family label; accepted for call-site
            compatibility, not used here.

    Returns:
        True when a key is available after the call, False otherwise.
    """
    # Local import keeps this cell self-contained.
    import getpass

    if os.getenv(env_var):
        return True

    print(f"📝 {provider_name} API key not found")
    add_key = input(f"   Add {provider_name} API key? (y/n) [n]: ").strip().lower()
    if add_key != 'y':
        return False

    # getpass hides the secret; input() would echo it into the notebook's
    # saved cell output.
    api_key = getpass.getpass(f"   Paste your {provider_name} API key: ").strip()
    if not api_key:
        return False

    with open('.env', 'a', encoding='utf-8') as f:
        f.write(f"\n{env_var}={api_key}\n")

    # Export directly: re-running load_dotenv() would not replace a variable
    # that already exists but is empty, leaving the key unusable.
    os.environ[env_var] = api_key
    print(f"   ✅ Added {provider_name} key to .env")
    return True

# Check each provider
# A provider is listed as available only when BOTH its API key is present
# (or was just entered) and its client library can be imported.
print("📊 Checking available providers:\n")

# Google Gemini
if check_api_key("Google", "GOOGLE_API_KEY", "Gemini"):
    try:
        # Imported here so a missing library only disables this provider;
        # the module-level name `genai` is reused by the quick test below.
        import google.generativeai as genai
        genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
        providers_available.append(("gemini", "Google Gemini (gemini-1.5-flash)"))
    except ImportError:
        print("   ⚠️  Google AI library not installed. Run: pip install google-generativeai")

# OpenAI
if check_api_key("OpenAI", "OPENAI_API_KEY", "GPT-4"):
    try:
        # Import is only an availability probe here; the client is built later.
        import openai
        providers_available.append(("openai", "OpenAI (gpt-4o-mini)"))
    except ImportError:
        print("   ⚠️  OpenAI library not installed. Run: pip install openai")

# Anthropic
if check_api_key("Anthropic", "ANTHROPIC_API_KEY", "Claude"):
    try:
        # Import is only an availability probe here; the client is built later.
        import anthropic
        providers_available.append(("claude", "Anthropic Claude (claude-3.5-sonnet)"))
    except ImportError:
        print("   ⚠️  Anthropic library not installed. Run: pip install anthropic")

# Select model
# Stores the chosen provider key in the global `selected_model_provider`,
# which later cells read through globals().get(...).
print(f"\n📋 Available models: {len(providers_available)}")

if not providers_available:
    print("❌ No models available. Please configure at least one API key.")
    raise ValueError("No API keys configured")

if len(providers_available) == 1:
    selected = providers_available[0]
    print(f"✅ Using: {selected[1]}")
else:
    print("\n🎯 Select your model:")
    for i, (key, name) in enumerate(providers_available):
        print(f"   {i+1}. {name}")

    choice = input(f"\nYour choice (1-{len(providers_available)}) [1]: ").strip() or "1"
    try:
        idx = int(choice) - 1
        # Reject 0 and negatives explicitly: Python would otherwise accept them
        # as indexes from the end and silently pick an unintended provider.
        if not 0 <= idx < len(providers_available):
            raise IndexError(choice)
        selected = providers_available[idx]
        print(f"\n✅ Selected: {selected[1]}")
    except (ValueError, IndexError):
        # Non-numeric or out-of-range answer: fall back to the first provider.
        selected = providers_available[0]
        print(f"\n✅ Defaulting to: {selected[1]}")

globals()['selected_model_provider'] = selected[0]

# Quick test
# Only Gemini is exercised here; the other providers get their first real
# call once the adapters are built. A failure is reported but never fatal.
print("\n🧪 Testing selected model...")
try:
    if globals()['selected_model_provider'] != 'gemini':
        print("✅ Model configured (test available after adapter setup)")
    else:
        # `genai` was imported during provider detection when Gemini is available
        model = genai.GenerativeModel('gemini-1.5-flash')
        response = model.generate_content("Say 'Hello Tropy' in exactly 2 words")
        print(f"✅ Model test successful: {response.text.strip()}")
except Exception as e:
    print(f"⚠️  Test failed: {e}")
    print("   Model will be tested when first used")

print("\n✨ Setup complete! Ready to process your collection.")

In [None]:
# Cell 3: Configuration Settings

class TropyAIConfig:
    """Settings container shared by every stage of the Tropy AI workflow.

    All values are plain instance attributes so they can be tweaked on the
    ``config`` object after construction.
    """

    def __init__(self):
        # --- Tropy local API ---
        self.TROPY_API_BASE: str = 'http://localhost:2019'

        # --- Models: filled in once the provider has been chosen ---
        self.GENERATIVE_MODEL = self.EMBEDDING_MODEL = None

        # --- Pacing (seconds, except the window size) ---
        self.API_IMAGE_DELAY: float = 1.5   # minimum gap between image downloads
        self.API_NOTE_DELAY: float = 3.0    # pause after writing a note
        self.IMAGE_TIMEOUT: int = 30        # HTTP timeout for image downloads
        self.RATE_LIMIT_DELAY: float = 2.0  # minimum gap between model API calls
        self.ROLLING_WINDOW_SIZE: int = 3

        # --- Files written by the workflow ---
        self.CHECKPOINT_FILE: str = "processing_checkpoint.json"
        self.OUTPUT_DIR: str = "./output"
        self.EMBEDDINGS_FILE: str = "tropy_embeddings.json"

        # --- Batching ---
        self.BATCH_SIZE: int = 50
        self.CHECKPOINT_INTERVAL: int = 25

config = TropyAIConfig()

# Configure models based on user selection from previous cell
selected_provider = globals().get('selected_model_provider', 'gemini')

# Provider key -> (generative model, embedding model).
# Claude has no native embeddings, hence None; unknown providers fall back
# to the Gemini pair.
config.GENERATIVE_MODEL, config.EMBEDDING_MODEL = {
    'gemini': ('gemini-1.5-flash', 'models/embedding-001'),
    'openai': ('gpt-4o-mini', 'text-embedding-3-small'),
    'claude': ('claude-3-5-sonnet-20241022', None),
}.get(selected_provider, ('gemini-1.5-flash', 'models/embedding-001'))

print(f"✅ Configuration initialized for {config.GENERATIVE_MODEL}")
print(f"   Embedding model: {config.EMBEDDING_MODEL or 'Not available'}")

## 2. Core Processing Components

In [None]:
# Cell 4: Image Processing

class ImageProcessor:
    """Loads photo images from the Tropy API, with a local-file fallback."""

    def __init__(self, config):
        """Keep the shared config and open one HTTP session for all downloads."""
        self.config = config
        self.session = requests.Session()
        self.last_api_call_time = 0

    def _wait_for_api_delay(self):
        """Sleep so that image requests are at least API_IMAGE_DELAY seconds apart."""
        remaining = self.config.API_IMAGE_DELAY - (time.time() - self.last_api_call_time)
        if remaining > 0:
            time.sleep(remaining)
        self.last_api_call_time = time.time()

    @staticmethod
    def _normalize_mode(image):
        """Return *image* in RGB or L mode.

        RGBA images are flattened onto a white background using their alpha
        channel as the mask; any other mode is converted to RGB. Images
        already in RGB or L are returned unchanged.
        """
        if image.mode == 'RGBA':
            background = Image.new('RGB', image.size, (255, 255, 255))
            background.paste(image, mask=image.split()[3])
            return background
        if image.mode not in ('RGB', 'L'):
            return image.convert('RGB')
        return image

    def load_image_from_api(self, photo_id: int) -> Optional["Image.Image"]:
        """Download a photo from the Tropy API.

        Args:
            photo_id: Tropy photo identifier.

        Returns:
            The decoded image in RGB or L mode, or None if the request or
            decoding failed (the failure is logged as a warning).
        """
        self._wait_for_api_delay()
        image_url = f"{self.config.TROPY_API_BASE}/project/photos/{photo_id}/file.jpg"

        try:
            response = self.session.get(image_url, timeout=self.config.IMAGE_TIMEOUT)
            response.raise_for_status()
            return self._normalize_mode(Image.open(BytesIO(response.content)))
        except Exception as e:
            logger.warning(f"Failed to load image from API: {e}")
            return None

    def load_image(self, photo_data: Dict[str, Any]) -> Optional["Image.Image"]:
        """Load the image for a photo record, preferring the API.

        Args:
            photo_data: Photo metadata; the "id" and "path" keys are read.

        Returns:
            The image in RGB or L mode, or None when neither the API nor the
            local path yields a readable image.
        """
        photo_id = photo_data.get("id")
        # Without an id the API URL would be meaningless, so skip the request.
        image = self.load_image_from_api(photo_id) if photo_id is not None else None
        if image is not None:
            return image

        # Try local path as fallback
        local_path = photo_data.get("path", "")
        if not (local_path and os.path.exists(local_path)):
            return None

        try:
            # Same normalization as the API path, so callers always receive
            # RGB/L regardless of where the image came from.
            return self._normalize_mode(Image.open(local_path))
        except Exception as e:
            logger.warning(f"Failed to load from local path: {e}")
            return None

print("✅ Image processor defined")

In [None]:
# Cell 5: Model Adapter Classes

import time
from io import BytesIO
import base64

def get_scholarly_analysis_prompt(transcription="", context=None, window_size=3):
    """Build the scholarly analysis prompt used across all model adapters.

    Args:
        transcription: Transcribed text of the page; a placeholder sentence is
            inserted when it is empty.
        context: Optional sequence of summaries of previous pages, oldest first.
        window_size: How many of the most recent context entries to include.
            Defaults to 3; zero or a negative value disables the context.

    Returns:
        The complete prompt text.
    """
    # Drop empty/None entries first: a failed earlier summary must neither
    # crash the join nor use up a slot in the rolling window.
    entries = [str(entry) for entry in (context or []) if entry]

    context_str = ""
    if entries and window_size > 0:
        recent_context = entries[-window_size:]
        context_str = "\n\nContext from previous pages:\n" + "\n".join(recent_context)

    return f"""
You are a highly specialized historian and archival researcher analyzing an archival photo that may contain one or more document pages.

- Primary source of textual content: Use the transcription text provided whenever available, prioritizing it for all factual and content descriptions.
- Visual analysis: Use the image primarily to describe physical features (seals, stamps, layout, handwritten notes, overlays, partial folds, glued items, postcards) and overall structure.
- Multiple parts: Be aware that a single image may include multiple overlapping documents or layers. Focus your description on the topmost active document, often marked by a visible folio number or placed upfront. Briefly mention secondary or underlying pages only when clearly visible and relevant.
- Authenticity and physical features: Describe archival markers, seals, stamps, paper types, watermarks, or other signs of authenticity.
- Transparency: If handwriting, language, or image quality limits your ability to extract text, explicitly state this rather than guessing or fabricating details.
- Caution: Avoid inferring content not clearly visible or transcribed.
- Languages: Whenever possible, indicate which language is being used across the document.

**Context from previous pages (if any):**
{context_str}

**Your summary should:**
- Be concise and scholarly (around 3–6 sentences).
- Integrate textual and visual analysis smoothly in one narrative paragraph.
- Clearly mention the document type, purpose, people, places, dates, and institutions when available.
- Explicitly state if the transcription was used (e.g., "Based on available transcription") or if only partial text could be interpreted.

Transcription (preferred source): {transcription or 'No transcription available.'}

**Note: This summary is machine-generated and should be verified by a researcher.**
"""

def apply_rate_limit(last_api_call, rate_limit_delay):
    """Block until ``rate_limit_delay`` seconds have passed since ``last_api_call``.

    Returns the current timestamp, which the caller stores as its new
    "last call" marker.
    """
    remaining = rate_limit_delay - (time.time() - last_api_call)
    if remaining > 0:
        time.sleep(remaining)
    return time.time()

def image_to_base64(image):
    """Serialize a PIL image as PNG and return the bytes base64-encoded as text."""
    png_buffer = BytesIO()
    image.save(png_buffer, format="PNG")
    encoded = base64.b64encode(png_buffer.getvalue())
    return encoded.decode('utf-8')

class ModelAdapter:
    """Common interface every provider-specific adapter implements."""

    def generate_summary(self, image, transcription="", context=None):
        """Produce a text summary for one photo; subclasses must override."""
        raise NotImplementedError

    def generate_embedding(self, image, transcription=""):
        """Produce an embedding vector for one photo; subclasses must override."""
        raise NotImplementedError

class GeminiAdapter(ModelAdapter):
    """Adapter that talks to Google Gemini through google.generativeai."""

    def __init__(self, config):
        """Build the generative model named in ``config.GENERATIVE_MODEL``.

        Raises:
            ValueError: if GOOGLE_API_KEY is not set in the environment.
        """
        self.config = config
        if not os.getenv("GOOGLE_API_KEY"):
            raise ValueError("Google API key not configured")
        import google.generativeai as genai
        self.generative_model = genai.GenerativeModel(config.GENERATIVE_MODEL)
        self.last_api_call = 0

    def generate_summary(self, image, transcription="", context=None):
        """Return the model's scholarly summary of a page, or None on failure."""
        self.last_api_call = apply_rate_limit(self.last_api_call, self.config.RATE_LIMIT_DELAY)
        prompt = get_scholarly_analysis_prompt(transcription, context)

        try:
            reply = self.generative_model.generate_content([prompt, image])
            if not reply:
                return None
            return reply.text
        except Exception as e:
            logger.error(f"Error generating summary: {e}")
            return None

    def generate_embedding(self, image, transcription=""):
        """Return an embedding for semantic search, or None on failure.

        The image is first described in words by the generative model; that
        description plus the transcription is what gets embedded.
        """
        self.last_api_call = apply_rate_limit(self.last_api_call, self.config.RATE_LIMIT_DELAY)

        visual_prompt = (
            "Describe this historical document image in detail. "
            "Focus on layout, visual elements, text density, and document type."
        )

        try:
            import google.generativeai as genai

            description = self.generative_model.generate_content([visual_prompt, image])
            if not description:
                return None

            combined_text = f"Visual: {description.text}\nTranscription: {transcription or 'N/A'}"
            result = genai.embed_content(
                model=self.config.EMBEDDING_MODEL,
                content=combined_text
            )
            if not result:
                return None
            return result.get('embedding')
        except Exception as e:
            logger.error(f"Error generating embedding: {e}")
            return None

class OpenAIAdapter(ModelAdapter):
    """Adapter for OpenAI chat (vision) and embedding models."""

    def __init__(self, config):
        """Create the OpenAI client from OPENAI_API_KEY and keep the config."""
        from openai import OpenAI

        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        self.config = config
        self.last_api_call = 0

    def _vision_completion(self, prompt, image, max_tokens, detail):
        """Send one text+image chat request and return the reply text.

        Shared by the summary and embedding paths so the request payload is
        defined in a single place. Exceptions propagate to the caller.
        """
        base64_image = image_to_base64(image)
        response = self.client.chat.completions.create(
            model=self.config.GENERATIVE_MODEL,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{base64_image}",
                                "detail": detail
                            }
                        }
                    ]
                }
            ],
            max_tokens=max_tokens
        )
        return response.choices[0].message.content

    def generate_summary(self, image, transcription="", context=None):
        """Return the model's scholarly summary of a page, or None on failure."""
        self.last_api_call = apply_rate_limit(self.last_api_call, self.config.RATE_LIMIT_DELAY)

        prompt = get_scholarly_analysis_prompt(transcription, context)

        try:
            return self._vision_completion(prompt, image, max_tokens=500, detail="high")
        except Exception as e:
            logger.error(f"Error generating summary with OpenAI: {e}")
            return None

    def generate_embedding(self, image, transcription=""):
        """Return an embedding for semantic search, or None on failure.

        The image is first described in words (low detail is enough for this),
        then the description plus transcription is embedded.
        """
        self.last_api_call = apply_rate_limit(self.last_api_call, self.config.RATE_LIMIT_DELAY)

        visual_prompt = (
            "Describe this historical document image in detail. "
            "Focus on layout, visual elements, text density, and document type."
        )

        try:
            visual_description = self._vision_completion(
                visual_prompt, image, max_tokens=300, detail="low"
            )

            # A reply without text comes back as None; embed a neutral
            # placeholder rather than the literal word "None".
            combined_text = (
                f"Visual: {visual_description or 'N/A'}\n"
                f"Transcription: {transcription or 'N/A'}"
            )

            embedding_response = self.client.embeddings.create(
                model=self.config.EMBEDDING_MODEL,
                input=combined_text[:8000]  # Limit length
            )

            return embedding_response.data[0].embedding
        except Exception as e:
            logger.error(f"Error generating embedding with OpenAI: {e}")
            return None

class ClaudeAdapter(ModelAdapter):
    """Adapter for Anthropic Claude; embeddings are delegated to OpenAI."""

    def __init__(self, config):
        """Create the Anthropic client from ANTHROPIC_API_KEY and keep the config."""
        import anthropic

        self.client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
        self.config = config
        self.last_api_call = 0
        # OpenAI client for embeddings, created on first use and then reused.
        self._openai_client = None

    def _vision_message(self, prompt, image, max_tokens):
        """Send one text+image message to Claude and return the reply text.

        Shared by the summary and embedding paths so the request payload is
        defined in a single place. Exceptions propagate to the caller.
        """
        message = self.client.messages.create(
            model=self.config.GENERATIVE_MODEL,
            max_tokens=max_tokens,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": image_to_base64(image)
                            }
                        }
                    ]
                }
            ]
        )
        return message.content[0].text

    def generate_summary(self, image, transcription="", context=None):
        """Return the model's scholarly summary of a page, or None on failure."""
        self.last_api_call = apply_rate_limit(self.last_api_call, self.config.RATE_LIMIT_DELAY)

        prompt = get_scholarly_analysis_prompt(transcription, context)

        try:
            return self._vision_message(prompt, image, max_tokens=500)
        except Exception as e:
            logger.error(f"Error generating summary with Claude: {e}")
            return None

    def generate_embedding(self, image, transcription=""):
        """Return an embedding for semantic search, or None when unavailable.

        Claude has no embedding endpoint, so Claude describes the image and
        OpenAI embeds the description plus transcription. Returns None when
        OPENAI_API_KEY is not set or any step fails.
        """
        # Check before rate limiting: there is no point sleeping for a call
        # that is going to be skipped.
        if not os.getenv("OPENAI_API_KEY"):
            logger.info("No OpenAI key for embeddings - skipping")
            return None

        visual_prompt = (
            "Describe this historical document image in detail. "
            "Focus on layout, visual elements, text density, and document type."
        )

        try:
            if self._openai_client is None:
                from openai import OpenAI
                self._openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

            self.last_api_call = apply_rate_limit(self.last_api_call, self.config.RATE_LIMIT_DELAY)

            # First get visual description using Claude
            visual_description = self._vision_message(visual_prompt, image, max_tokens=300)

            # Combine with transcription and generate embedding using OpenAI
            combined_text = f"Visual: {visual_description}\nTranscription: {transcription or 'N/A'}"

            # Use hardcoded OpenAI embedding model since Claude doesn't do embeddings
            embedding_response = self._openai_client.embeddings.create(
                model="text-embedding-3-small",
                input=combined_text[:8000]  # Limit length
            )

            return embedding_response.data[0].embedding

        except Exception as e:
            logger.error(f"Error generating embedding: {e}")
            return None

# Factory function to get the right adapter
def get_model_adapter(config):
    """Instantiate the adapter matching the globally selected provider.

    The provider key is read from the global ``selected_model_provider``
    (defaulting to 'gemini'); an unrecognized key also yields the Gemini
    adapter.
    """
    adapters = {
        'gemini': GeminiAdapter,
        'openai': OpenAIAdapter,
        'claude': ClaudeAdapter,
    }
    provider = globals().get('selected_model_provider', 'gemini')
    adapter_cls = adapters.get(provider, GeminiAdapter)
    return adapter_cls(config)

print("✅ Model adapters configured")

In [None]:
# Cell 6: Tropy API Integration

class TropyAPIClient:
    """Handles all Tropy API interactions."""

    def __init__(self, config):
        """Keep the config, open one HTTP session and pick a request timeout."""
        self.config = config
        self.session = requests.Session()
        self.api_base = config.TROPY_API_BASE
        # The config has no dedicated metadata timeout, so the image timeout is
        # reused. Without any timeout a stalled Tropy would hang the notebook.
        self.timeout = getattr(config, 'IMAGE_TIMEOUT', 30)

    def _get_json(self, path: str) -> Any:
        """GET ``path`` under the API base and return the decoded JSON body.

        Raises whatever ``requests`` raises for connection, timeout, HTTP
        status or JSON decoding problems; callers decide how to report it.
        """
        response = self.session.get(f'{self.api_base}{path}', timeout=self.timeout)
        response.raise_for_status()
        return response.json()

    def fetch_all_items(self) -> List[Dict[str, Any]]:
        """Fetch all items from Tropy; returns an empty list on failure."""
        try:
            items = self._get_json('/project/items/')
            logger.info(f"Fetched {len(items)} items")
            return items
        # ValueError covers a malformed JSON body on requests versions where
        # the decode error is not a RequestException.
        except (requests.exceptions.RequestException, ValueError) as e:
            logger.error(f"Failed to fetch items: {e}")
            return []

    def fetch_photo_details(self, photo_id: int) -> Optional[Dict[str, Any]]:
        """Fetch photo metadata; returns None on failure."""
        try:
            return self._get_json(f'/project/photos/{photo_id}')
        except Exception as e:
            logger.error(f"Failed to fetch photo {photo_id}: {e}")
            return None

    def fetch_transcription_details(self, transcription_id: int) -> Optional[Dict[str, Any]]:
        """Fetch a transcription by ID; returns None on failure."""
        try:
            return self._get_json(f'/project/transcriptions/{transcription_id}')
        except Exception as e:
            logger.error(f"Failed to fetch transcription {transcription_id}: {e}")
            return None

    def save_note_to_photo(self, photo_id: int, note_text: str) -> bool:
        """Attach ``note_text`` to a photo as a note.

        Returns:
            True if the note was accepted, False otherwise (the error is logged).
        """
        try:
            post_url = f"{self.api_base}/project/notes"
            note_data = [('photo', photo_id), ('html', note_text)]
            response = self.session.post(post_url, data=note_data, timeout=self.timeout)
            response.raise_for_status()

            # Apply delay after successful save
            time.sleep(self.config.API_NOTE_DELAY)
            return True
        except Exception as e:
            logger.error(f"Failed to save note for photo {photo_id}: {e}")
            return False

def extract_transcription(photo_data: Dict[str, Any], api_client: TropyAPIClient) -> str:
    """Return the stripped text of a photo's first transcription.

    Only the first ID in ``photo_data['transcriptions']`` is looked up. An
    empty string is returned when there is no transcription, the lookup
    yields nothing, or the lookup raises (the error is logged).
    """
    refs = photo_data.get('transcriptions', [])
    if not (isinstance(refs, list) and refs):
        return ""

    first_id = refs[0]
    text = ""
    try:
        payload = api_client.fetch_transcription_details(first_id)
        if payload:
            text = payload.get('text', '')
            if text is None:
                text = ""
    except Exception as e:
        logger.error(f"Failed to fetch transcription {first_id}: {e}")

    return text.strip()

print("✅ Tropy API client defined")

In [None]:
# Cell 7: Initialize All Processors

# Build the three module-level helper objects from the shared config:
# Tropy API access, image loading, and the provider-specific model adapter.
api_client = TropyAPIClient(config)
image_processor = ImageProcessor(config)
model_processor = get_model_adapter(config) 

# Report which provider key is selected (falls back to 'gemini' if unset).
print(f"✅ All processors initialized with {globals().get('selected_model_provider', 'gemini')} model")

print("✅ All processors initialized and ready")

## 3. Item Selection

In [None]:
# Cell 8: Select Items to Process

# Fetch all items
all_items_data = api_client.fetch_all_items()

# Start from an empty selection so a failed fetch or unmatched input never
# leaves results from an earlier run of this cell behind.
items_to_process = []
selected_item_ids = []
choice = None
ids = []

if not all_items_data:
    print("❌ No items found. Please check Tropy connection.")
else:
    print(f"📚 Found {len(all_items_data)} total items in Tropy\n")

    print("🎯 Select processing mode:")
    print("A. All items - Process entire project")
    print("B. Single item - Process specific item(s)")
    print("C. List - Process items in specific list(s)")
    print("\nExamples: A, B 123, C 1,3")

    mode_input = input("\nYour choice: ").strip().upper()

    if not mode_input:
        mode_input = "C 1"  # Default

    # Parse input: first word is the mode, the rest a comma-separated ID list
    parts = mode_input.split()
    choice = parts[0]
    ids = [token.strip() for token in ' '.join(parts[1:]).split(',') if token.strip()]

    if choice == 'A':
        items_to_process = all_items_data
        selected_item_ids = [item['id'] for item in items_to_process]
        print(f"\n✅ Selected ALL items ({len(items_to_process)} items)")

    elif choice == 'B':
        if not ids:
            print("⚠️  Mode B needs at least one item ID, e.g. 'B 123'")
        # Index once by string ID (first occurrence wins) instead of
        # rescanning the whole collection for every requested ID.
        items_by_id = {}
        for item in all_items_data:
            items_by_id.setdefault(str(item.get('id')), item)

        for item_id in ids:
            item = items_by_id.get(str(item_id))
            if item is None:
                print(f"⚠️  Item {item_id} not found")
            elif item['id'] not in selected_item_ids:  # skip repeated IDs
                items_to_process.append(item)
                selected_item_ids.append(item['id'])
                print(f"✓ Found item {item_id}: {item.get('title', 'Untitled')}")

    elif choice == 'C':
        seen_item_ids = set()
        # A bare "C" falls back to list 1
        for list_id in ids or ['1']:
            try:
                list_id_int = int(list_id)
            except ValueError:
                # Non-numeric input must not abort the whole cell
                print(f"⚠️  Skipping invalid list ID: {list_id!r}")
                continue

            list_items = [item for item in all_items_data
                          if list_id_int in (item.get('lists') or [])]
            if not list_items:
                print(f"⚠️  List {list_id}: no items found")
                continue

            print(f"✓ List {list_id}: {len(list_items)} items")
            for item in list_items:
                # An item may belong to several requested lists; keep it once
                if item['id'] not in seen_item_ids:
                    seen_item_ids.add(item['id'])
                    items_to_process.append(item)
                    selected_item_ids.append(item['id'])

    else:
        print(f"⚠️  Unknown mode {choice!r}. Use A, B <ids> or C <list ids>.")

    # Summary
    if items_to_process:
        total_photos = sum(len(item.get('photos', [])) for item in items_to_process)
        print(f"\n📊 Summary:")
        print(f"   Items: {len(items_to_process)}")
        print(f"   Photos: {total_photos}")
        print(f"\n💾 Selection stored for consistent processing throughout notebook")
    else:
        print("\n⚠️  No items matched your selection.")

# Store selection info globally for use in other cells
globals()['current_selection'] = {
    'mode': choice,
    'ids': ids,
    'item_ids': selected_item_ids,
    'items': items_to_process
}

In [None]:
# Cell 9: Analyze Collection & Estimate Time (item selection-aware checkpoint)

def estimate_processing_time(num_photos: int, seconds_per_photo: float = 15.0) -> Dict[str, Any]:
    """Estimate wall-clock time and API cost for processing ``num_photos`` photos.

    The per-photo cost depends on the globally selected provider (defaulting
    to the Gemini rate when unset or unknown).

    Returns:
        A dict with 'total_seconds', 'formatted_time', 'api_cost_estimate'
        and 'seconds_per_photo'.
    """
    total = num_photos * seconds_per_photo
    hrs = int(total // 3600)
    mins = int((total % 3600) // 60)
    secs = int(total % 60)

    # The hours part is only shown when it is non-zero
    if hrs > 0:
        label = f"{hrs}h {mins}m {secs}s"
    else:
        label = f"{mins}m {secs}s"

    # Approximate cost per photo (USD) for each provider
    rates = {
        'gemini': 0.001,      # Gemini Flash is very cheap
        'openai': 0.003,      # GPT-4o-mini is inexpensive
        'claude': 0.015       # Claude 3.5 Sonnet
    }
    provider = globals().get('selected_model_provider', 'gemini')
    per_photo = rates.get(provider, 0.001)

    return {
        'total_seconds': total,
        'formatted_time': label,
        'api_cost_estimate': num_photos * per_photo,
        'seconds_per_photo': seconds_per_photo
    }

# Report how many of the selected photos still need processing, using the
# checkpoint file (if any) to discount photos handled by earlier runs.
if 'items_to_process' in locals() and items_to_process:
    print("\n📊 Analyzing Collection...")

    # Get all photo IDs from selected items
    selected_photo_ids = set()
    for item in items_to_process:
        selected_photo_ids.update(item.get('photos', []))

    total_photos = len(selected_photo_ids)

    # Check for existing checkpoint and count only relevant processed photos
    already_processed_in_selection = 0

    if os.path.exists(config.CHECKPOINT_FILE):
        try:
            with open(config.CHECKPOINT_FILE, 'r') as f:
                checkpoint_data = json.load(f)
            checkpoint_photo_ids = set(checkpoint_data.get('processed_photo_ids', []))

            # Count only photos that are both in checkpoint AND in current selection
            already_processed_in_selection = len(selected_photo_ids & checkpoint_photo_ids)

            if checkpoint_photo_ids:
                print(f"\n📌 Checkpoint found: {len(checkpoint_photo_ids)} total photos in checkpoint")
                if already_processed_in_selection > 0:
                    print(f"   ✅ {already_processed_in_selection} of your selected photos already processed")
        except (OSError, ValueError, AttributeError, TypeError) as e:
            # Unreadable, malformed or unexpectedly shaped checkpoint: the
            # estimate simply assumes nothing is done yet, but say so instead
            # of hiding the problem.
            logger.warning(f"Could not read checkpoint for estimate: {e}")

    photos_to_process = total_photos - already_processed_in_selection

    print(f"\n⏱️  Processing Estimates:")
    print(f"   Photos in selection: {total_photos}")
    if already_processed_in_selection > 0:
        print(f"   Already processed: {already_processed_in_selection}")
    print(f"   Photos to process: {photos_to_process}")

    if photos_to_process > 0:
        time_estimate = estimate_processing_time(photos_to_process)
        print(f"   Estimated time: {time_estimate['formatted_time']}")
        print(f"   (~{time_estimate['seconds_per_photo']:.0f} seconds per photo)")
        print(f"   Estimated cost: ~${time_estimate['api_cost_estimate']:.2f}")

        print("\n💡 Recommendations:")
        if photos_to_process <= 20:
            print("   ⚡ Small batch - should complete in a few minutes")
        elif photos_to_process <= 100:
            print("   ☕ Medium batch - good time for a coffee break")
        elif photos_to_process <= 500:
            print("   🍽️  Large batch - consider running during lunch")
        else:
            print("   🌙 Very large batch - consider running overnight")
    else:
        print("\n✅ All selected photos have already been processed!")
        print("   Run Cell 13 to add item summaries, or")
        print("   Select different items to process new photos")

    print("\n⚠️  Note: Actual time may vary based on:")
    print("   - Image complexity and size")
    print("   - Length of transcriptions")
    print("   - Network speed and API response times")
    print("   - System performance")

    if photos_to_process > 0:
        print("\n👉 Ready to process? Run the next cell to start!")

## 4. Main Processing Pipeline

In [None]:
# Cell 10: Processing Functions and Classes

class BatchProcessor:
    """Keep batch-processing progress on disk so an interrupted run can resume.

    ``checkpoint_data`` is a dict with the keys ``processed_photo_ids`` (a set),
    ``processed_photos`` (int), ``last_update``, ``embeddings_buffer`` (list)
    and ``item_summaries_buffer`` (list). File locations and batch size are
    read from ``config`` (``CHECKPOINT_FILE``, ``EMBEDDINGS_FILE``,
    ``BATCH_SIZE``).
    """
    
    def __init__(self, config):
        self.config = config
        self.checkpoint_data = self._load_checkpoint()
    
    @staticmethod
    def _empty_checkpoint() -> Dict[str, Any]:
        """Return a fresh checkpoint with no progress recorded."""
        return {
            'processed_photo_ids': set(),
            'processed_photos': 0,
            'last_update': None,
            'embeddings_buffer': [],
            'item_summaries_buffer': []
        }
    
    @staticmethod
    def _write_json_atomic(path, payload) -> None:
        """Write ``payload`` as JSON to ``path`` without ever leaving it half-written.

        The data goes to a sibling ``.tmp`` file first and is then moved over
        the target with ``os.replace``, so an interrupt or serialization error
        during the write leaves the previous file intact.
        """
        tmp_path = f"{path}.tmp"
        with open(tmp_path, 'w') as f:
            json.dump(payload, f, indent=2)
        os.replace(tmp_path, path)
    
    def _load_checkpoint(self) -> Dict[str, Any]:
        """Load the checkpoint file if it exists, else return an empty checkpoint.

        Keys missing from the stored file (or stored as null) fall back to
        their empty defaults so later code can rely on every key being present.
        Any read/parse problem is logged and an empty checkpoint is returned.
        """
        try:
            if os.path.exists(self.config.CHECKPOINT_FILE):
                with open(self.config.CHECKPOINT_FILE, 'r') as f:
                    stored = json.load(f)
                
                processed_ids = set(stored.get('processed_photo_ids') or [])
                processed_count = stored.get('processed_photos')
                data = dict(stored)
                data.update({
                    # JSON has no set type; ids are stored as a list
                    'processed_photo_ids': processed_ids,
                    'processed_photos': len(processed_ids) if processed_count is None else processed_count,
                    'last_update': stored.get('last_update'),
                    'embeddings_buffer': stored.get('embeddings_buffer') or [],
                    'item_summaries_buffer': stored.get('item_summaries_buffer') or []
                })
                logger.info(f"Loaded checkpoint: {len(data['processed_photo_ids'])} photos processed")
                return data
        except Exception as e:
            logger.warning(f"Could not load checkpoint: {e}")
        
        return self._empty_checkpoint()
    
    def _save_checkpoint(self):
        """Save progress to the checkpoint file; failures are logged, not raised."""
        try:
            checkpoint = {
                'processed_photo_ids': list(self.checkpoint_data['processed_photo_ids']),
                'processed_photos': self.checkpoint_data['processed_photos'],
                'last_update': datetime.now().isoformat(),
                'embeddings_buffer': self.checkpoint_data['embeddings_buffer'],
                'item_summaries_buffer': self.checkpoint_data['item_summaries_buffer']
            }
            self._write_json_atomic(self.config.CHECKPOINT_FILE, checkpoint)
        except Exception as e:
            logger.error(f"Failed to save checkpoint: {e}")
    
    def _save_embeddings_batch(self, force: bool = False):
        """Append the embeddings buffer to the embeddings file and clear it.

        Nothing happens while the buffer is empty, or while it holds fewer
        than ``config.BATCH_SIZE`` entries unless ``force`` is true. On failure
        the error is logged and the buffer is kept for a later attempt.
        """
        buffer = self.checkpoint_data['embeddings_buffer']
        if not buffer:
            return
        if not force and len(buffer) < self.config.BATCH_SIZE:
            return
        
        try:
            combined_data = {'photo_embeddings': [], 'item_summaries': []}
            if os.path.exists(self.config.EMBEDDINGS_FILE):
                with open(self.config.EMBEDDINGS_FILE, 'r') as f:
                    existing_data = json.load(f)
                if isinstance(existing_data, list):
                    # A bare list is treated as the photo embeddings themselves
                    combined_data['photo_embeddings'] = existing_data
                else:
                    combined_data = existing_data
                    # An existing dict may lack the list we are about to extend
                    combined_data.setdefault('photo_embeddings', [])
            
            combined_data['photo_embeddings'].extend(buffer)
            self._write_json_atomic(self.config.EMBEDDINGS_FILE, combined_data)
            
            logger.info(f"Saved {len(buffer)} embeddings")
            self.checkpoint_data['embeddings_buffer'] = []
        except Exception as e:
            logger.error(f"Failed to save embeddings: {e}")

def generate_item_summary(item_title: str, photo_summaries: List[Dict[str, Any]], 
                         context_str: str = "") -> Optional[str]:
    """Ask the selected model to synthesize one summary for a whole item.

    Each entry of ``photo_summaries`` contributes a "Page N: ..." line built
    from its ``page`` and ``summary`` keys. The provider is read from the
    global ``selected_model_provider`` (default ``'gemini'``) and the model
    name from ``config.GENERATIVE_MODEL``.

    Returns the generated text, or ``None`` when there are no photo summaries,
    the provider is not one of gemini/openai/claude, or the request fails
    (failures are logged).
    """
    if not photo_summaries:
        return None
    
    # One "Page N: summary" line per photo, in the order given
    combined_summaries = "\n".join(
        f"Page {entry.get('page', 0)}: {entry.get('summary', '')}"
        for entry in photo_summaries
    )
    
    # The historian prompt for item summaries
    prompt = f"""
    You are a highly specialized historian and archival researcher. You have access to the following photo summaries. Your task is to synthesize a comprehensive, scholarly-level overall summary of this archival item.
    
    A clear suggested title for the item at the beginning (write "Suggested title: ..." explicitly in the first line).
    
    For the Suggested title, prioritize specificity over generality whenever possible:
    - The title must include, if identifiable:
      1. The specific person or family name.
      2. The type of request or situation (e.g., visa, baptism, financial aid, employment reinstatement).
      3. Brief historical context (e.g., date range, relevant laws, locations).
    - If no individual or family is clearly identifiable, provide a concise descriptive title summarizing the main content.
    - Avoid generic collection-level titles that repeat provenance information already known.
    
    Then, in your summary, describe:
    - What the document is and its general purpose.
    - The institutions or departments involved.
    - Historical context and relevant dates.
    - Physical or archival structure (pages, folders, seals, stamps).
    - Main content and themes (cases, people, humanitarian aspects).
    - Any key individuals or cases if identifiable.
    - Why this item is historically significant.
    - Evidence of authenticity (stamps, handwriting, watermarks, archival features).
    
    Instructions:
    
    Integrate information smoothly rather than listing separate headings.
    Use the photo summaries as your main evidence.
    Be transparent if some information is unclear or missing.
    Do not fabricate or guess beyond provided photo summaries.
    
    Context from previous pages (if any):
    {context_str}
    
    **Note: This summary is machine-generated and should be reviewed by a researcher.**
    
    Current item title: {item_title}
    
    Photo summaries:
    {combined_summaries}
    """
    
    try:
        backend = globals().get('selected_model_provider', 'gemini')
        
        if backend == 'gemini':
            import google.generativeai as genai
            reply = genai.GenerativeModel(config.GENERATIVE_MODEL).generate_content(prompt)
            if not reply:
                return None
            return reply.text
        
        if backend == 'openai':
            from openai import OpenAI
            openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
            completion = openai_client.chat.completions.create(
                model=config.GENERATIVE_MODEL,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=1000
            )
            return completion.choices[0].message.content
        
        if backend == 'claude':
            import anthropic
            claude_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
            reply = claude_client.messages.create(
                model=config.GENERATIVE_MODEL,
                max_tokens=1000,
                messages=[{"role": "user", "content": prompt}]
            )
            return reply.content[0].text
        
        # Any other provider value produces no summary
        return None
    except Exception as e:
        logger.error(f"Failed to generate item summary: {e}")
        return None

def save_current_run_summaries(summaries: List[Dict[str, Any]]) -> str:
    """Write this run's summaries to a new timestamped JSON file.

    The file is created under ``config.OUTPUT_DIR`` (created if missing) as
    ``item_summaries_run_<YYYYmmdd_HHMMSS>.json``.

    Returns the path of the written file. If anything fails the error is
    logged and ``None`` is returned instead.
    """
    try:
        os.makedirs(config.OUTPUT_DIR, exist_ok=True)
        
        # The timestamp keeps each run in its own file
        run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filepath = os.path.join(config.OUTPUT_DIR, f"item_summaries_run_{run_stamp}.json")
        
        with open(filepath, 'w') as out:
            json.dump(summaries, out, indent=2)
        
        logger.info(f"Saved {len(summaries)} summaries to {filepath}")
    except Exception as e:
        logger.error(f"Failed to save current run summaries: {e}")
        return None
    
    return filepath

def save_item_summaries(summaries: List[Dict[str, Any]], append_to_main: bool = True) -> None:
    """Save item summaries to the main JSON file and to a per-run file.

    Args:
        summaries: Item summary records to persist.
        append_to_main: When true, append the records to
            ``item_summaries.json`` under ``config.OUTPUT_DIR``.

    The per-run file is always attempted, even if appending to the main file
    fails (for example because the main file is corrupt). Its path, or
    ``None`` if that save failed, is stored in the global
    ``current_run_summaries_file``. Errors are logged, never raised.
    """
    if append_to_main:
        try:
            output_file = os.path.join(config.OUTPUT_DIR, "item_summaries.json")
            os.makedirs(config.OUTPUT_DIR, exist_ok=True)
            
            existing_summaries = []
            if os.path.exists(output_file):
                with open(output_file, 'r') as f:
                    existing_summaries = json.load(f)
            
            # Refuse to overwrite a main file that is not the expected list
            if not isinstance(existing_summaries, list):
                raise ValueError(f"{output_file} does not contain a JSON list")
            
            existing_summaries.extend(summaries)
            
            with open(output_file, 'w') as f:
                json.dump(existing_summaries, f, indent=2)
            
            logger.info(f"Saved {len(summaries)} item summaries to {output_file}")
        except Exception as e:
            logger.error(f"Failed to save item summaries: {e}")
    
    # Always save current run separately; this helper logs its own failures
    current_run_file = save_current_run_summaries(summaries)
    
    # Store current run file path globally
    globals()['current_run_summaries_file'] = current_run_file

# Runs when the cell executes: confirms the Cell 10 definitions above were loaded
print("✅ Processing functions ready")

In [None]:
# Cell 11: Main Processing Loop

def _extract_suggested_title(item_summary):
    """Return the text following the first "Suggested title:" line prefix, or ""."""
    prefix = "suggested title:"
    for line in item_summary.split("\n"):
        candidate = line.strip()
        # Match the prefix case-insensitively and strip it by length, so any
        # capitalization the model chose is removed
        if candidate.lower().startswith(prefix):
            return candidate[len(prefix):].strip()
    return ""


def process_collection(processor, items, generate_summaries=True, 
                      generate_embeddings=True, generate_item_summaries=True):
    """Process every not-yet-checkpointed photo of ``items`` with progress tracking.

    For each new photo this fetches its details, loads the image, extracts the
    transcription, optionally writes a summary note back to the photo and
    optionally buffers an embedding. After an item's photos, an item-level
    summary is optionally generated from the photo summaries created in this
    run. It relies on the notebook globals ``api_client``, ``image_processor``,
    ``model_processor``, ``extract_transcription`` and ``config``.

    Args:
        processor: A ``BatchProcessor`` holding checkpoint state.
        items: Item dicts with ``id``, ``title`` and ``photos`` (photo ids).
        generate_summaries: Create photo-level summary notes.
        generate_embeddings: Create and buffer embeddings.
        generate_item_summaries: Create item-level summaries.

    Returns:
        A dict of counters: ``processed_photos``, ``summaries_created``,
        ``embeddings_created``, ``item_summaries_created`` and ``errors``.

    Raises:
        KeyboardInterrupt: re-raised after progress has been saved.
    """
    
    results = {
        'processed_photos': 0,
        'summaries_created': 0,
        'embeddings_created': 0,
        'item_summaries_created': 0,
        'errors': 0
    }
    
    start_time = time.time()
    checkpoint = processor.checkpoint_data
    
    # Count total photos and how many of them the checkpoint already covers
    total_photos = sum(len(item.get('photos', [])) for item in items)
    skipped_photos = sum(
        1
        for item in items
        for photo_id in item.get('photos', [])
        if photo_id in checkpoint['processed_photo_ids']
    )
    photos_to_process = total_photos - skipped_photos
    
    print(f"\n🚀 Starting processing for {len(items)} items...")
    print(f"   Total photos in selection: {total_photos}")
    if skipped_photos > 0:
        print(f"   Already processed (skipping): {skipped_photos}")
    print(f"   Photos to process: {photos_to_process}")
    print("🛡️  Checkpointing enabled - safe to interrupt\n")
    
    # The bar's total only covers new photos
    with tqdm(total=photos_to_process, desc="Processing new photos", unit="photo") as pbar:
        for item in items:
            item_id = item.get('id')
            item_title = item.get('title', 'Untitled')
            photo_ids = item.get('photos', [])
            item_photo_summaries = []
            
            for photo_id in photo_ids:
                # Already-processed photos are not part of the bar's total, so
                # skip them before entering the try/finally that ticks the bar
                if photo_id in checkpoint['processed_photo_ids']:
                    continue
                
                try:
                    # Fetch photo data
                    photo_data = api_client.fetch_photo_details(photo_id)
                    if not photo_data:
                        results['errors'] += 1
                        continue  # the finally clause ticks the bar exactly once
                    
                    # Load image
                    image = image_processor.load_image(photo_data)
                    if not image:
                        results['errors'] += 1
                        continue
                    
                    # Extract transcription
                    transcription = extract_transcription(photo_data, api_client)
                    
                    # Generate summary
                    summary_text = ""
                    if generate_summaries:
                        summary = model_processor.generate_summary(image, transcription)
                        if summary:
                            # Format and save note
                            timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
                            header = (
                                f"<div style='background-color: #f9f9f9; padding: 10px; margin-bottom: 10px;'>"
                                f"<strong>🤖 Machine-generated photo summary</strong><br/>"
                                f"<small>Generated on {timestamp} by {config.GENERATIVE_MODEL}</small>"
                            )
                            if transcription:
                                header += "<br/><small>✅ Used transcription data</small>"
                            header += "</div>"
                            
                            formatted_summary = f"{header}<p>{summary}</p>"
                            
                            if api_client.save_note_to_photo(photo_id, formatted_summary):
                                results['summaries_created'] += 1
                                summary_text = summary
                                item_photo_summaries.append({
                                    'photo_id': photo_id,
                                    'summary': summary,
                                    'filename': photo_data.get('filename', ''),
                                    'page': photo_data.get('page', 0),
                                    'transcription': transcription[:500] if transcription else ""
                                })
                    
                    # Generate embedding
                    if generate_embeddings:
                        embedding = model_processor.generate_embedding(image, transcription)
                        if embedding:
                            checkpoint['embeddings_buffer'].append({
                                'item_id': item_id,
                                'photo_id': photo_id,
                                'item_title': item_title,
                                'embedding': embedding,
                                'summary': summary_text,
                                'timestamp': datetime.now().isoformat()
                            })
                            results['embeddings_created'] += 1
                            processor._save_embeddings_batch()
                    
                    # Update checkpoint
                    checkpoint['processed_photo_ids'].add(photo_id)
                    checkpoint['processed_photos'] += 1
                    results['processed_photos'] += 1
                    
                    if checkpoint['processed_photos'] % processor.config.CHECKPOINT_INTERVAL == 0:
                        processor._save_checkpoint()
                    
                    del image  # Free memory
                    
                except KeyboardInterrupt:
                    logger.info("Interrupted - saving progress...")
                    # Flush embeddings first so the checkpoint written next
                    # does not persist a buffer that is already on disk
                    processor._save_embeddings_batch(force=True)
                    processor._save_checkpoint()
                    raise
                except Exception as e:
                    logger.error(f"Error processing photo {photo_id}: {e}")
                    results['errors'] += 1
                finally:
                    pbar.update(1)
            
            # Generate item summary if needed
            if generate_item_summaries and item_photo_summaries:
                try:
                    item_summary = generate_item_summary(item_title, item_photo_summaries)
                    
                    if item_summary:
                        # Add to buffer for saving
                        checkpoint['item_summaries_buffer'].append({
                            'item_id': item_id,
                            'item_title': item_title,
                            'photo_count': len(photo_ids),
                            'item_summary': item_summary,
                            'suggested_title': _extract_suggested_title(item_summary),
                            'timestamp': datetime.now().isoformat(),
                            'model': config.GENERATIVE_MODEL
                        })
                        results['item_summaries_created'] += 1
                except Exception as e:
                    logger.error(f"Failed to generate item summary for {item_title}: {e}")
    
    # Final save: flush buffers first, write the checkpoint last so it reflects
    # what is actually still pending
    processor._save_embeddings_batch(force=True)
    
    if checkpoint['item_summaries_buffer']:
        # save_item_summaries reports its per-run file through this global; a
        # new truthy value means the buffered summaries reached disk
        previous_run_file = globals().get('current_run_summaries_file')
        save_item_summaries(checkpoint['item_summaries_buffer'])
        current_run_file = globals().get('current_run_summaries_file')
        if current_run_file and current_run_file != previous_run_file:
            # Clear the buffer so the next run does not append these again
            checkpoint['item_summaries_buffer'] = []
        else:
            logger.warning("Item summaries were not saved; keeping them in the checkpoint for the next run")
    
    processor._save_checkpoint()
    
    # Results summary
    elapsed = time.time() - start_time
    
    if results['processed_photos'] > 0:
        # Calculate actual processing rate
        seconds_per_photo = elapsed / results['processed_photos']
        print(f"\n✅ Processing completed!")
        print(f"   Photos processed: {results['processed_photos']}")
        print(f"   Photos skipped: {skipped_photos}")
        print(f"   Summaries created: {results['summaries_created']}")
        print(f"   Item summaries: {results['item_summaries_created']}")
        print(f"   Embeddings created: {results['embeddings_created']}")
        print(f"   Errors: {results['errors']}")
        print(f"   Time: {elapsed/60:.1f} minutes")
        print(f"   Speed: {seconds_per_photo:.1f} seconds/photo")
    elif results['errors'] > 0:
        # Nothing succeeded, but not because everything was already done
        print(f"\n⚠️  No photos were processed successfully")
        print(f"   Photos skipped: {skipped_photos}")
        print(f"   Errors: {results['errors']}")
    else:
        print(f"\n✅ No new photos to process!")
        print(f"   All {total_photos} photos were already processed")
    
    return results

# Runs when the cell executes: confirms process_collection is now defined
print("✅ Main processing function ready")

In [None]:
# Cell 12: Execute Processing

# Run the pipeline only when an earlier cell left a non-empty `items_to_process`
if not ('items_to_process' in locals() and items_to_process):
    print("❌ No items selected. Please run item selection cells first.")
else:
    # A new BatchProcessor loads any existing checkpoint, so reruns resume
    batch_processor = BatchProcessor(config)
    
    # Photo summaries, embeddings and item summaries are all enabled
    results = process_collection(
        batch_processor,
        items_to_process,
        generate_summaries=True,
        generate_embeddings=True,
        generate_item_summaries=True,
    )
    
    print(
        "\n💡 Next steps:",
        "   1. Run Cell 13 to add item summaries to Tropy",
        "   2. Check your Tropy project for the new notes!",
        sep="\n",
    )

## 5. Post-Processing

In [None]:
# Cell 13: Add Item Summaries to Tropy (Selection-Aware)

_ITEM_SUMMARY_MARKER = 'Machine-generated overall item summary'


def _find_item_summary_notes(photo_data):
    """Return the ids of the photo's notes whose HTML contains the item-summary marker."""
    matching_ids = []
    for note_id in photo_data.get('notes', []):
        note_response = api_client.session.get(f"{api_client.api_base}/project/notes/{note_id}")
        if note_response.status_code != 200:
            continue
        note_html = note_response.json().get('html', '')
        # get_note_content in this notebook also accepts an {'@value': ...}
        # wrapper around the html field, so unwrap it here as well
        if isinstance(note_html, dict):
            note_html = note_html.get('@value', '')
        if _ITEM_SUMMARY_MARKER in str(note_html):
            matching_ids.append(note_id)
    return matching_ids


def _resolve_existing_summary(item_title):
    """Return 'skip' or 'replace' for an item that already has a summary note.

    A stored global ``batch_replace_decision`` is applied without prompting;
    otherwise the user is asked, and choices 3/4 store a batch decision.
    """
    batch_decision = globals().get('batch_replace_decision')
    if batch_decision == 'skip_all':
        return 'skip'
    if batch_decision == 'replace_all':
        return 'replace'
    
    print(f"\n⚠️  Item '{item_title}' already has a summary")
    print("   What would you like to do?")
    print("   1. Skip this item")
    print("   2. Replace with new summary")
    print("   3. Skip all existing (default)")
    print("   4. Replace all existing")
    
    decision = input("   Choice (1-4) [3]: ").strip() or "3"
    
    if decision == "4":
        globals()['batch_replace_decision'] = 'replace_all'
        return 'replace'
    if decision == "3":
        globals()['batch_replace_decision'] = 'skip_all'
        return 'skip'
    # "2" replaces this item only; "1" or anything else skips this item only
    return 'replace' if decision == "2" else 'skip'


def add_item_summaries_to_tropy(use_current_run_only=True):
    """Read generated item summaries and attach them as notes in Tropy.

    Each summary is written as a note on the FIRST photo of its item. If that
    photo already carries an item-summary note, the user decides (per item or
    for the whole batch) whether to skip or replace it.

    Args:
        use_current_run_only: Use the per-run summaries file recorded in the
            global ``current_run_summaries_file`` and filter by the global
            ``current_selection`` when available; otherwise (or when no
            per-run file is recorded) use the main ``item_summaries.json``.

    Prints a report of added / replaced / skipped / failed items.
    """
    
    print("📚 Adding item summaries to Tropy...")
    
    # Determine which summaries to use. A missing or None per-run path (a
    # failed save) falls back to the main file instead of crashing
    current_run_file = globals().get('current_run_summaries_file')
    if use_current_run_only and current_run_file:
        summaries_file = current_run_file
        print(f"✅ Using current run summaries only")
    else:
        summaries_file = os.path.join(config.OUTPUT_DIR, "item_summaries.json")
        print(f"⚠️  Using all summaries from main file")
    
    # Check if summaries file exists
    if not os.path.exists(summaries_file):
        print(f"❌ No summaries file found: {summaries_file}")
        return
    
    # Load the summaries
    with open(summaries_file, 'r') as f:
        all_summaries = json.load(f)
    
    # Filter summaries based on current selection if available
    if 'current_selection' in globals() and use_current_run_only:
        selected_item_ids = globals()['current_selection']['item_ids']
        filtered_summaries = [s for s in all_summaries if s['item_id'] in selected_item_ids]
        print(f"✅ Filtered to {len(filtered_summaries)} summaries matching current selection")
        print(f"   Selection mode: {globals()['current_selection']['mode']}")
        if globals()['current_selection']['ids']:
            print(f"   Selected IDs: {', '.join(globals()['current_selection']['ids'])}")
        summaries_to_process = filtered_summaries
    else:
        summaries_to_process = all_summaries
    
    print(f"\n📊 Processing {len(summaries_to_process)} item summaries")
    
    added_count = 0
    skipped_count = 0
    error_count = 0
    replaced_count = 0
    
    # Process with progress bar; the finally clause is the only place the bar
    # is advanced, so every item ticks it exactly once
    with tqdm(total=len(summaries_to_process), desc="Adding item summaries", unit="item") as pbar:
        for summary_data in summaries_to_process:
            item_id = summary_data['item_id']
            item_title = summary_data.get('item_title', 'Untitled')
            item_summary = summary_data.get('item_summary', '')
            photo_count = summary_data.get('photo_count', 0)
            suggested_title = summary_data.get('suggested_title', '')
            
            # Update progress bar description
            pbar.set_description(f"Processing: {item_title[:30]}...")
            
            try:
                if not item_summary:
                    skipped_count += 1
                    continue
                
                # Get the item to find its photos
                item_data = api_client.session.get(f"{api_client.api_base}/project/items/{item_id}").json()
                photos = item_data.get('photos', [])
                
                if not photos:
                    skipped_count += 1
                    continue
                
                # Use the FIRST photo to attach the item summary
                first_photo_id = photos[0]
                
                # Without the photo's details we cannot tell whether a summary
                # already exists, so do not risk writing a duplicate
                photo_data = api_client.fetch_photo_details(first_photo_id)
                if not photo_data:
                    logger.error(f"Could not fetch photo {first_photo_id} for item {item_id}")
                    error_count += 1
                    continue
                
                existing_note_ids = _find_item_summary_notes(photo_data)
                if existing_note_ids:
                    if _resolve_existing_summary(item_title) == 'skip':
                        skipped_count += 1
                        continue
                    
                    # Replace: delete every existing item-summary note first
                    all_deleted = True
                    for existing_note_id in existing_note_ids:
                        delete_response = api_client.session.delete(
                            f"{api_client.api_base}/project/notes/{existing_note_id}"
                        )
                        if delete_response.status_code in [200, 204]:
                            replaced_count += 1
                        else:
                            all_deleted = False
                    
                    if not all_deleted:
                        # Adding a new note now would leave two summaries
                        logger.error(f"Could not delete existing summary note for item {item_id}")
                        error_count += 1
                        continue
                
                # Create the formatted item summary note
                timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
                
                # Gray border for item summaries
                item_summary_header = (
                    "<div style='background-color: #f5f5f5; padding: 12px; margin-bottom: 10px; "
                    "border-left: 4px solid #999999;'>"
                    "<strong>📚 Machine-generated overall item summary</strong><br/>"
                    f"<small>Generated on {timestamp} by {config.GENERATIVE_MODEL}</small><br/>"
                    f"<small>Based on {photo_count} photo summaries</small>"
                )
                if suggested_title:
                    item_summary_header += f"<br/><small>📝 Suggested title: {suggested_title}</small>"
                item_summary_header += "</div>"
                
                formatted_summary = f"{item_summary_header}<p>{item_summary}</p>"
                
                # Save the note to Tropy (to first photo)
                if api_client.save_note_to_photo(first_photo_id, formatted_summary):
                    added_count += 1
                else:
                    error_count += 1
                    
            except Exception as e:
                logger.error(f"Error processing item {item_id}: {e}")
                error_count += 1
            finally:
                pbar.update(1)
    
    # Clear batch decision for next run
    globals().pop('batch_replace_decision', None)
    
    # Summary
    print(f"\n{'='*50}")
    print(f"📊 ITEM SUMMARY ADDITION COMPLETE")
    print(f"{'='*50}")
    print(f"✅ Added: {added_count} new summaries")
    if replaced_count > 0:
        print(f"🔄 Replaced: {replaced_count} existing summaries")
    print(f"⚠️  Skipped: {skipped_count}")
    print(f"❌ Errors: {error_count}")
    print(f"📋 Total processed: {len(summaries_to_process)}")
    
    if added_count > 0 or replaced_count > 0:
        print(f"\n🎉 Item summaries are attached to the FIRST photo of each item!")

# Interactive menu for adding summaries
def add_summaries_menu():
    """Show a text menu for pushing item summaries to Tropy and act on the choice.

    Choice 1 (default) adds the current run's summaries, 2 adds everything in
    the main file after a y/n confirmation, 3 reports the current run's file
    and how many summaries it holds, and anything else cancels.
    """
    print("\n📚 Add Item Summaries to Tropy")
    print("=" * 50)
    
    # Describe the active selection, when an earlier cell recorded one
    if 'current_selection' in globals():
        selection = globals()['current_selection']
        selection_line = f"Current selection: {selection['mode']}"
        if selection['ids']:
            selection_line += f" - IDs: {', '.join(selection['ids'])}"
        print(selection_line)
        print(f"Selected items: {len(selection['item_ids'])}")
    
    menu_lines = (
        "\nOptions:",
        "1. Add summaries for CURRENT RUN only (recommended)",
        "2. Add summaries for ALL items in main file",
        "3. View current run summary",
        "4. Cancel",
    )
    for menu_line in menu_lines:
        print(menu_line)
    
    choice = input("\nYour choice (1-4) [1]: ").strip() or "1"
    
    if choice == "1":
        add_item_summaries_to_tropy(use_current_run_only=True)
        return
    
    if choice == "2":
        confirm = input("\n⚠️  This will process ALL summaries. Continue? (y/n) [n]: ").strip().lower()
        if confirm == 'y':
            add_item_summaries_to_tropy(use_current_run_only=False)
        return
    
    if choice != "3":
        print("\n❌ Cancelled")
        return
    
    # Choice 3: report on the file written by the current run
    if 'current_run_summaries_file' not in globals():
        print("\n❌ No current run summaries found")
        return
    
    run_file = globals()['current_run_summaries_file']
    print(f"\nCurrent run summaries saved to:")
    print(f"  {run_file}")
    
    # Show summary count
    with open(run_file, 'r') as f:
        summaries = json.load(f)
    print(f"  Contains: {len(summaries)} item summaries")

# Run the menu as soon as this cell executes; it waits for keyboard input via input()
add_summaries_menu()

## 6. Utility Functions

In [None]:
# Cell 14: Clean Slate - Remove AI Notes

def get_note_content(note_id):
    """Fetch a note from the Tropy API and return its content as a string.

    The ``html`` field is preferred, then ``text``; either may be a plain
    value or a dict carrying the content under ``'@value'``. If neither field
    is present the whole response is stringified. Returns ``None`` when the
    request does not answer 200 or anything raises (logged at debug level).
    """
    try:
        reply = api_client.session.get(f"{api_client.api_base}/project/notes/{note_id}")
        if reply.status_code != 200:
            return None
        
        payload = reply.json()
        
        if isinstance(payload, dict):
            # First matching field wins: html before text
            for field in ('html', 'text'):
                if field not in payload:
                    continue
                value = payload[field]
                wrapped = isinstance(value, dict) and '@value' in value
                return value['@value'] if wrapped else str(value)
        
        # No recognised field (or not a dict): fall back to the raw response
        return str(payload)
    except Exception as e:
        logger.debug(f"Error getting note content: {e}")
        return None

def is_ai_generated(note_text):
    """Return True if the note text looks machine-generated.

    The check is a case-insensitive substring search for a fixed set of
    markers, plus a "Generated on <year>-" timestamp header with any
    four-digit year (previously only 2024 and 2025 were recognised).
    Empty or missing text is never considered AI-generated.
    """
    if not note_text:
        return False
    
    import re  # local import: the notebook's import cell does not load re
    
    # All markers are lowercase because the note is lowercased before matching.
    # NOTE(review): 'claude', 'chatgpt' and 'gpt-' match anywhere in the text,
    # so a human note that merely mentions one of them (e.g. a person named
    # Claude) is also flagged - confirm this is acceptable before deleting.
    ai_markers = (
        'machine-generated photo summary',
        'machine-generated overall item summary',
        'by gemini-1.5-flash',
        'ai-generated',
        'auto-generated',
        'automatically generated',
        'claude',
        'chatgpt',
        'gpt-',
    )
    
    note_lower = note_text.lower()
    if any(marker in note_lower for marker in ai_markers):
        return True
    
    # Timestamp header such as "Generated on 2025-07-01 10:00", for any year
    return re.search(r'generated on [0-9]{4}-', note_lower) is not None

def delete_note_multiple_methods(note_id):
    """Try a series of candidate API calls to delete a note.

    The attempts are made in order and the function stops at the first one
    that answers with HTTP 200 or 204. The last attempt does not delete the
    note but blanks its body, as a fallback when no delete call is accepted.

    A 404 is deliberately NOT treated as success: most of the routes tried
    here are guesses, so a 404 may only mean "no such route" rather than
    "the note is gone", and counting it would report deletions that never
    happened.

    Args:
        note_id: Identifier of the note, interpolated into the request URLs.

    Returns:
        ``True`` if any attempt returned 200 or 204, ``False`` otherwise.
    """
    note_url = f'{api_client.api_base}/project/notes/{note_id}'

    # (HTTP method, URL, keyword arguments for the request), tried in order.
    attempts = [
        ('DELETE', note_url, {}),
        ('DELETE', f'{api_client.api_base}/notes/{note_id}', {}),
        ('POST', f'{note_url}/delete', {'data': None}),
        ('POST', note_url, {'data': {'action': 'delete'}}),
        ('PUT', note_url, {'json': {'html': '', 'deleted': True}}),
        # Last resort: empty the note instead of deleting it.
        ('PUT', note_url, {'data': [('html', '')]}),
    ]

    session = api_client.session
    senders = {
        'DELETE': session.delete,
        'POST': session.post,
        'PUT': session.put,
    }

    for method, url, request_kwargs in attempts:
        try:
            res = senders[method](url, **request_kwargs)
        except Exception as e:
            # Best effort: a failing attempt must not stop the remaining ones.
            # `Exception` (not a bare except) lets Ctrl-C / kernel interrupt
            # still abort a long deletion run.
            logger.debug(f"{method} {url} failed for note {note_id}: {e}")
            continue

        if res.status_code in (200, 204):
            return True

    return False

def _parse_clean_request(raw_choice):
    """Split a menu answer such as 'B 12, 15' into ('B', ['12', '15'])."""
    letter, *rest = raw_choice.split()
    # Accept IDs separated by commas, whitespace, or both.
    id_tokens = ' '.join(rest).replace(',', ' ').split()
    return letter, id_tokens


def _select_items_to_clean(all_items, choice, ids):
    """Resolve the menu choice to a list of items.

    Returns None when the request cannot be honoured (the reason has already
    been printed); otherwise the possibly empty list of selected items.
    """
    if choice == 'A':
        print(f"\n⚠️  This will scan ALL {len(all_items)} items for AI notes.")
        return all_items

    if choice == 'B':
        if not ids:
            print("❌ No item IDs provided.")
            return None

        # Index once instead of rescanning all items for every requested ID;
        # setdefault keeps the first item seen for an ID.
        items_by_id = {}
        for item in all_items:
            items_by_id.setdefault(str(item.get('id')), item)

        selected = []
        for item_id in dict.fromkeys(ids):  # drop repeated IDs, keep order
            item = items_by_id.get(item_id)
            if item is None:
                print(f"⚠️  Item {item_id} not found - skipped")
                continue
            selected.append(item)
            print(f"✓ Found item {item_id}: {item.get('title', 'Untitled')}")
        return selected

    if choice == 'C':
        if not ids:
            print("❌ No list IDs provided.")
            return None

        selected = []
        selected_ids = set()
        for list_id in dict.fromkeys(ids):
            try:
                list_id_int = int(list_id)
            except ValueError:
                print(f"⚠️  '{list_id}' is not a valid list ID - skipped")
                continue

            list_items = [item for item in all_items
                          if list_id_int in (item.get('lists') or [])]
            if not list_items:
                print(f"⚠️  List {list_id}: no items found")
                continue

            print(f"✓ List {list_id}: {len(list_items)} items")
            for item in list_items:
                # An item can belong to several requested lists; take it once.
                if item.get('id') not in selected_ids:
                    selected_ids.add(item.get('id'))
                    selected.append(item)
        return selected

    print("❌ Invalid choice.")
    return None


def _scan_items_for_ai_notes(items):
    """Return (records of AI-generated notes, number of photos scanned)."""
    found = []
    photos_scanned = 0

    for item in tqdm(items, desc="Scanning items"):
        item_id = item.get('id')

        for photo_id in (item.get('photos') or []):
            photos_scanned += 1

            try:
                photo_data = api_client.fetch_photo_details(photo_id)
                if not photo_data:
                    continue

                for note_id in photo_data.get('notes', []):
                    note_text = get_note_content(note_id)

                    if note_text and is_ai_generated(note_text):
                        found.append({
                            'note_id': note_id,
                            'photo_id': photo_id,
                            'item_id': item_id,
                            'item_title': item.get('title', 'Untitled'),
                            'is_item_summary': 'overall item summary' in note_text.lower()
                        })

            except Exception as e:
                # One unreadable photo must not abort the whole scan.
                logger.debug(f"Error checking photo {photo_id}: {e}")

    return found, photos_scanned


def remove_ai_notes():
    """Interactively find and delete AI-generated notes in Tropy.

    Steps: verify the API is reachable, fetch all items, ask the user which
    items to clean (all / specific items / specific lists), scan the notes of
    every photo of those items, show what was found, and delete only after an
    explicit confirmation. Progress and results are printed; nothing is
    returned.
    """
    print("🧹 CLEAN SLATE - AI Notes Remover")
    print("=" * 50)

    # Check connection
    try:
        test_response = api_client.session.get(f"{api_client.api_base}/project/items/")
        if test_response.status_code != 200:
            print("❌ Cannot connect to Tropy. Please check if Tropy is running with REST API enabled.")
            return
    except Exception as e:
        print(f"❌ Connection error: {e}")
        return

    print("✅ Connected to Tropy successfully!\n")

    # Get all items
    all_items = api_client.fetch_all_items()
    if not all_items:
        print("❌ No items found in Tropy.")
        return

    print(f"📚 Found {len(all_items)} items in Tropy\n")

    # Selection menu
    print("Select what to clean:")
    print("A. All items - Remove AI notes from entire project")
    print("B. Single item - Remove from specific item(s)")
    print("C. List - Remove from specific list(s)")

    choice_input = input("\nYour choice: ").strip().upper()

    if not choice_input:
        print("❌ No input provided. Cancelled.")
        return

    choice, ids = _parse_clean_request(choice_input)

    items_to_clean = _select_items_to_clean(all_items, choice, ids)
    if items_to_clean is None:
        return

    if not items_to_clean:
        print("\n❌ No items selected for cleaning.")
        return

    # Scan for AI notes
    print(f"\n🔍 Scanning {len(items_to_clean)} items for AI-generated notes...")

    ai_notes_found, total_photos_scanned = _scan_items_for_ai_notes(items_to_clean)

    print("\n📊 Scan complete:")
    print(f"   Photos scanned: {total_photos_scanned}")
    print(f"   AI notes found: {len(ai_notes_found)}")

    if not ai_notes_found:
        print("\n✨ No AI-generated notes found! Nothing to clean.")
        return

    # Show summary by type
    photo_summaries = [n for n in ai_notes_found if not n['is_item_summary']]
    item_summaries = [n for n in ai_notes_found if n['is_item_summary']]

    print("\n📝 Found:")
    print(f"   Photo summaries: {len(photo_summaries)}")
    print(f"   Item summaries: {len(item_summaries)}")

    # Show a few examples
    print("\n📋 Examples of notes found:")
    for position, note in enumerate(ai_notes_found[:3], start=1):
        note_type = "Item summary" if note['is_item_summary'] else "Photo summary"
        print(f"   {position}. {note_type} in '{note['item_title']}'")
    if len(ai_notes_found) > 3:
        print(f"   ... and {len(ai_notes_found) - 3} more")

    # Confirmation
    print(f"\n⚠️  WARNING: This will permanently delete {len(ai_notes_found)} notes!")
    confirm = input("Proceed with deletion? (y/n): ").strip().lower()

    if confirm not in ['yes', 'y']:
        print("\n❌ Deletion cancelled. Your notes are safe.")
        return

    # Delete notes
    print(f"\n🗑️  Deleting {len(ai_notes_found)} AI notes...")

    deleted_count = 0
    failed_count = 0

    progress = tqdm(ai_notes_found, desc="Deleting notes")
    for processed, note_info in enumerate(progress, start=1):
        note_id = note_info['note_id']

        if delete_note_multiple_methods(note_id):
            deleted_count += 1
        else:
            failed_count += 1
            logger.debug(f"Failed to delete note {note_id}")

        # Pause briefly after every 50 notes to avoid overwhelming the API.
        # Keyed on notes processed, not notes deleted, so a run of failures
        # does not trigger a pause on every iteration.
        if processed % 50 == 0:
            time.sleep(0.5)

    # Final summary
    print(f"\n{'='*50}")
    print("📊 CLEAN SLATE COMPLETE")
    print(f"{'='*50}")
    print(f"✅ Successfully deleted: {deleted_count} notes")
    if failed_count > 0:
        print(f"❌ Failed to delete: {failed_count} notes")
    print(f"⏱️  Total processed: {len(ai_notes_found)}")

    if deleted_count > 0:
        print("\n🧹 Your Tropy is now cleaner!")
        print("💡 You can now run the processing pipeline again if needed.")

# Start the interactive clean-up as soon as this cell is executed;
# deletion only happens after the confirmation prompt inside the function.
remove_ai_notes()

In [None]:
# Cell 15: Clean Files and Checkpoints 

import glob
import os


def _remove_generated_file(label, path):
    """Delete one generated file and print whether it was there.

    Removal is attempted directly and a missing file is handled afterwards,
    so there is no gap between checking for the file and deleting it.

    Args:
        label: Human-readable name used in the printed message.
        path: Location of the file to delete.
    """
    try:
        os.remove(path)
    except FileNotFoundError:
        print(f"⚠️  No {label} found: {path}")
    else:
        print(f"✅ Deleted {label}: {path}")


# Delete checkpoint
_remove_generated_file("checkpoint file", config.CHECKPOINT_FILE)

# Delete embeddings file
_remove_generated_file("embeddings file", config.EMBEDDINGS_FILE)

# Delete item summaries file
summaries_file = os.path.join(config.OUTPUT_DIR, "item_summaries.json")
_remove_generated_file("item summaries file", summaries_file)

# Remove the output directory only if it is a directory and is now empty
if os.path.isdir(config.OUTPUT_DIR) and not os.listdir(config.OUTPUT_DIR):
    os.rmdir(config.OUTPUT_DIR)
    print(f"✅ Removed empty output directory: {config.OUTPUT_DIR}")

print("\n🧹 File cleanup complete!")

# Timestamped run-history files are intentionally left in place; just report
# them. glob returns an empty list when the directory no longer exists.
run_files = glob.glob(os.path.join(config.OUTPUT_DIR, "item_summaries_run_*.json"))
if run_files:
    print(f"\n📝 Note: {len(run_files)} timestamped run history files are preserved in {config.OUTPUT_DIR}")
    print("   These files contain summaries from individual processing runs. We intentionally keep them as history/backups for each session. It can be useful to compare results across different runs.")
    print("   Delete them manually if you need a completely clean slate.")