# Tropy Collection Analysis & Tagging Notebook

# Tropy AI Analysis: Semantic Queries, Clustering, and Tagging

### **Goal**
This notebook is the interactive analysis and organization component of the AI-powered research workflow. It uses the vector embeddings and AI-generated summaries created by the `tropy-mmllm-analysis-multimodel.ipynb` notebook to enable powerful new ways of interacting with your collection.

### **Features**
- **Semantic Search:** Natural-language queries match documents by meaning, going beyond exact keyword matching.
- **Automated Tagging:** Find relevant items based on your query and programmatically apply tags in Tropy.
- **Theme Discovery (Clustering):** Automatically identify and group semantically similar documents to uncover hidden themes in your collection.
- **Metadata Enhancement:** Use AI-suggested titles from your summaries to automatically update the metadata of your Tropy items.

### **How to Use This Notebook**
1.  **Prerequisite:** Ensure you have already run the `tropy-mmllm-analysis-multimodel.ipynb` notebook to create the `tropy_embeddings.json` and `output/item_summaries.json` files.
2.  **Execution Order:** Run the code cells sequentially from top to bottom.
    *   **Cell 1** loads all necessary libraries and your embeddings data.
    *   **Cell 2** defines all the helper functions for searching, tagging, and analysis.
    *   **Cell 3** is the main interactive menu where you will run your analyses.

In [None]:
# Cell 1: Import Required Libraries 
import json, requests, os, time
import numpy as np
from tqdm.notebook import tqdm
from IPython.display import display, HTML
from dotenv import load_dotenv
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import logging
from datetime import datetime
from typing import Optional, List, Dict, Any

# Configure logging
# Records at INFO level and above are emitted, each prefixed with a timestamp and level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger; the embedding adapters and the Tropy API client report errors through it.
logger = logging.getLogger(__name__)

# Status banner shown once the import cell has run.
print("✅ All dependencies imported successfully")
print("📝 Note: Using numpy-based similarity search (perfect for collections under 10k photos)")
print("   For larger collections, consider ChromaDB or FAISS")

In [None]:
# Cell 2: Configure API Keys and Select Model (New - matching analysis notebook)
print("🔧 Setting up AI Model Providers...\n")

# Load environment variables
# (python-dotenv; check_api_key below appends new keys to the '.env' file)
load_dotenv()

# Check and configure API keys
# Filled below with (provider_key, display_label) tuples, one per usable provider.
providers_available = []

# Helper function to add/check API key
def check_api_key(provider_name, env_var, display_name):
    """Return True when an API key for a provider is available.

    If the environment variable is already set (and non-empty) nothing else
    happens. Otherwise the user is offered the chance to paste a key, which
    is appended to the local ``.env`` file and exported to the current
    process.

    Args:
        provider_name: Human-readable provider name used in the prompts.
        env_var: Name of the environment variable that holds the key.
        display_name: Model family label; accepted for call-site
            compatibility but not used.

    Returns:
        True if a key is available after the call, False otherwise.
    """
    if os.getenv(env_var):
        return True

    print(f"📝 {provider_name} API key not found")
    add_key = input(f"   Add {provider_name} API key? (y/n) [n]: ").strip().lower()
    if add_key != 'y':
        return False

    api_key = input(f"   Paste your {provider_name} API key: ").strip()
    if not api_key:
        return False

    with open('.env', 'a', encoding='utf-8') as f:
        f.write(f"\n{env_var}={api_key}\n")

    # Export the key directly: load_dotenv() does not override a variable
    # that already exists (for example one set to an empty string), so
    # relying on it alone could report success while the key stays unusable.
    os.environ[env_var] = api_key
    load_dotenv()
    print(f"   ✅ Added {provider_name} key to .env")
    return True

# Check each provider
# For every provider: make sure a key exists (prompting if needed), then try to
# import its client library. Only providers that pass both steps are appended
# to providers_available as (provider_key, display_label).
print("📊 Checking available providers:\n")

# Google Gemini
if check_api_key("Google", "GOOGLE_API_KEY", "Gemini"):
    try:
        import google.generativeai as genai
        # Registers the key with the library at module level.
        genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
        providers_available.append(("gemini", "Google Gemini (gemini-1.5-flash)"))
    except ImportError:
        print("   ⚠️  Google AI library not installed. Run: pip install google-generativeai")

# OpenAI
if check_api_key("OpenAI", "OPENAI_API_KEY", "GPT-4"):
    try:
        # Import is only an availability probe here; clients are created in the adapter classes.
        import openai
        providers_available.append(("openai", "OpenAI (gpt-4o-mini)"))
    except ImportError:
        print("   ⚠️  OpenAI library not installed. Run: pip install openai")

# Anthropic
if check_api_key("Anthropic", "ANTHROPIC_API_KEY", "Claude"):
    try:
        # Availability probe only.
        import anthropic
        providers_available.append(("claude", "Anthropic Claude (claude-3-haiku)"))
    except ImportError:
        print("   ⚠️  Anthropic library not installed. Run: pip install anthropic")

# Select model
# Sets two module-level names used by the following cells:
#   selected       -> (provider_key, display_label)
#   selected_model -> provider_key ('gemini', 'openai' or 'claude')
print(f"\n📋 Available models: {len(providers_available)}")

if not providers_available:
    print("❌ No models available. Please configure at least one API key.")
    raise ValueError("No API keys configured")
elif len(providers_available) == 1:
    selected = providers_available[0]
    selected_model = selected[0]
    print(f"✅ Using: {selected[1]}")
else:
    print("\n🎯 Select your model:")
    for i, (key, name) in enumerate(providers_available):
        print(f"   {i+1}. {name}")

    choice = input(f"\nYour choice (1-{len(providers_available)}) [1]: ").strip() or "1"
    try:
        idx = int(choice) - 1
        # Reject 0 and negative numbers explicitly: Python's negative indexing
        # would otherwise silently select a provider from the end of the list.
        if not 0 <= idx < len(providers_available):
            raise IndexError(choice)
    except (ValueError, IndexError):
        selected = providers_available[0]
        selected_model = selected[0]
        print(f"\n✅ Defaulting to: {selected[1]}")
    else:
        selected = providers_available[idx]
        selected_model = selected[0]
        print(f"\n✅ Selected: {selected[1]}")

print("\n✨ Model selection complete!")

In [None]:
# Cell 3: Configuration Settings (New - matching analysis notebook)
class Config:
    """Central configuration for the queries workflow.

    Attributes set on every instance:
        TROPY_PROJECT_API: Base URL of the Tropy project REST endpoint.
        TROPY_API_URL: Base URL of the Tropy REST server.
        EMBEDDING_MODEL: Embedding model name for the provider, or None
            when the provider has no native embeddings (Claude).
        EMBEDDINGS_FILE: Path of the embeddings JSON file.
        ITEM_SUMMARIES_FILE: Path of the item summaries JSON file.
    """

    # Embedding model per provider. None means "no native embedding model".
    _EMBEDDING_MODELS = {
        'gemini': 'models/embedding-001',
        'openai': 'text-embedding-3-small',
        'claude': None,  # Claude doesn't have native embeddings
    }
    # The adapter factory falls back to Gemini for unknown providers, so the
    # configuration does the same instead of leaving EMBEDDING_MODEL unset.
    _FALLBACK_PROVIDER = 'gemini'

    def __init__(self, model_provider):
        """Build the configuration for *model_provider* ('gemini', 'openai', 'claude')."""
        # API Settings
        self.TROPY_PROJECT_API = 'http://127.0.0.1:2019/project'
        self.TROPY_API_URL = 'http://127.0.0.1:2019'

        # Dynamic model configuration based on selection
        self.EMBEDDING_MODEL = self._EMBEDDING_MODELS.get(
            model_provider,
            self._EMBEDDING_MODELS[self._FALLBACK_PROVIDER],
        )

        # File paths
        self.EMBEDDINGS_FILE = f"tropy_embeddings_{model_provider}.json"
        self.ITEM_SUMMARIES_FILE = f"output/item_summaries_{model_provider}.json"

        # For backward compatibility with gemini: prefer the older,
        # non-model-specific file names when those files exist on disk.
        if model_provider == 'gemini':
            if os.path.exists("tropy_embeddings.json"):
                self.EMBEDDINGS_FILE = "tropy_embeddings.json"
            if os.path.exists("output/item_summaries.json"):
                self.ITEM_SUMMARIES_FILE = "output/item_summaries.json"

# Shared configuration instance for the provider chosen in the model-selection cell.
config = Config(selected_model)
print(f"✅ Configuration initialized for {selected_model}")

In [None]:
# Cell 4: Model Adapters for Embeddings (New)
class EmbeddingAdapter:
    """Base adapter interface for embeddings.

    Provider-specific subclasses override generate_query_embedding. The
    subclasses defined in this cell return the embedding on success and
    None (after logging the error) on failure.
    """
    def generate_query_embedding(self, query: str):
        """Return an embedding for *query*; must be overridden by subclasses."""
        raise NotImplementedError

class GeminiEmbeddingAdapter(EmbeddingAdapter):
    """Query-embedding adapter backed by the google.generativeai library."""

    def __init__(self, config):
        """Keep a reference to *config* and to the lazily imported genai module."""
        # Imported here so the notebook still loads when the library is absent
        # and another provider was selected.
        import google.generativeai as genai
        self.genai = genai
        self.config = config

    def generate_query_embedding(self, query: str):
        """Embed *query* for retrieval; return the vector, or None on any error."""
        try:
            request = dict(
                model=self.config.EMBEDDING_MODEL,
                content=query,
                task_type="RETRIEVAL_QUERY",
            )
            vector = self.genai.embed_content(**request)['embedding']
        except Exception as e:
            logger.error(f"Error generating Gemini embedding: {e}")
            vector = None
        return vector

class OpenAIEmbeddingAdapter(EmbeddingAdapter):
    """Query-embedding adapter backed by the OpenAI embeddings endpoint."""

    def __init__(self, config):
        """Create an OpenAI client from the OPENAI_API_KEY environment variable."""
        from openai import OpenAI
        self.config = config
        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    def generate_query_embedding(self, query: str):
        """Embed *query*; return the vector, or None on any error."""
        try:
            reply = self.client.embeddings.create(
                model=self.config.EMBEDDING_MODEL,
                input=query,
            )
            vector = reply.data[0].embedding
        except Exception as e:
            logger.error(f"Error generating OpenAI embedding: {e}")
            vector = None
        return vector

class ClaudeEmbeddingAdapter(EmbeddingAdapter):
    """Adapter for the Claude selection, which borrows OpenAI embeddings.

    Claude has no embedding endpoint of its own, so query vectors come from
    OpenAI's text-embedding-3-small when an OpenAI key is configured.
    """

    def __init__(self, config):
        """Create the OpenAI client if a key exists; otherwise log a warning."""
        self.config = config
        openai_key = os.getenv("OPENAI_API_KEY")
        if not openai_key:
            self.use_openai = False
            logger.warning("Claude doesn't provide embeddings and OpenAI key not available")
            return

        from openai import OpenAI
        self.client = OpenAI(api_key=openai_key)
        self.use_openai = True

    def generate_query_embedding(self, query: str):
        """Embed *query* through OpenAI; return None when unavailable or on error."""
        if not self.use_openai:
            logger.error("No embedding model available for Claude")
            return None

        try:
            reply = self.client.embeddings.create(
                model="text-embedding-3-small",
                input=query,
            )
            return reply.data[0].embedding
        except Exception as e:
            logger.error(f"Error generating embedding: {e}")
            return None

# Factory function
def get_embedding_adapter(config, model_provider):
    """Instantiate the embedding adapter matching *model_provider*.

    Any provider other than 'gemini', 'openai' or 'claude' gets the Gemini
    adapter.
    """
    known_adapters = (
        ('gemini', GeminiEmbeddingAdapter),
        ('openai', OpenAIEmbeddingAdapter),
        ('claude', ClaudeEmbeddingAdapter),
    )
    for provider_key, adapter_cls in known_adapters:
        if model_provider == provider_key:
            return adapter_cls(config)
    return GeminiEmbeddingAdapter(config)

# Initialize adapter
# Module-level adapter instance; the search workflow calls its
# generate_query_embedding method to embed the user's query.
embedding_adapter = get_embedding_adapter(config, selected_model)
print(f"✅ Embedding adapter configured for {selected_model}")

In [None]:
# Cell 5: Load Data and Build Search Index (Updated - Smart File Discovery)

# Function to find embeddings file
def find_embeddings_file():
    """Locate an embeddings JSON file, regardless of which model produced it.

    Well-known file names are tried first, in priority order: files for the
    selected model, then generic names, then files for every known model.
    If none exist, the current directory and then ``output`` are scanned for
    any ``*embeddings*.json`` file.

    Returns:
        The relative path of the first match, or None when nothing is found.
    """
    possible_files = [
        # Model-specific files
        f"tropy_embeddings_{selected_model}.json",
        f"output/tropy_embeddings_{selected_model}.json",

        # Generic files
        "tropy_embeddings.json",
        "output/tropy_embeddings.json",

        # All possible model files
        "tropy_embeddings_gemini.json",
        "tropy_embeddings_openai.json",
        "tropy_embeddings_claude.json",
        "output/tropy_embeddings_gemini.json",
        "output/tropy_embeddings_openai.json",
        "output/tropy_embeddings_claude.json",
    ]

    for file_path in possible_files:
        if os.path.exists(file_path):
            return file_path

    # Fallback scan. os.listdir() returns entries in arbitrary order, so the
    # names are sorted to make the chosen file the same on every run.
    for directory in ('.', 'output'):
        if not os.path.isdir(directory):
            continue
        for file_name in sorted(os.listdir(directory)):
            if 'embeddings' in file_name and file_name.endswith('.json'):
                if directory == '.':
                    return file_name
                return os.path.join(directory, file_name)

    return None

# Function to find summaries file
def find_summaries_file():
    """Locate an item-summaries JSON file.

    Well-known file names are tried first, in priority order: files for the
    selected model, then generic names, then files for every known model.
    If none exist, the current directory and then ``output`` are scanned for
    any ``*summaries*.json`` file.

    Returns:
        The relative path of the first match, or None when nothing is found.
    """
    possible_files = [
        # Model-specific files
        f"output/item_summaries_{selected_model}.json",
        f"item_summaries_{selected_model}.json",

        # Generic files
        "output/item_summaries.json",
        "item_summaries.json",

        # All possible model files
        "output/item_summaries_gemini.json",
        "output/item_summaries_openai.json",
        "output/item_summaries_claude.json",
        "item_summaries_gemini.json",
        "item_summaries_openai.json",
        "item_summaries_claude.json",
    ]

    for file_path in possible_files:
        if os.path.exists(file_path):
            return file_path

    # Fallback scan. os.listdir() returns entries in arbitrary order, so the
    # names are sorted to make the chosen file the same on every run.
    for directory in ('.', 'output'):
        if not os.path.isdir(directory):
            continue
        for file_name in sorted(os.listdir(directory)):
            if 'summaries' in file_name and file_name.endswith('.json'):
                if directory == '.':
                    return file_name
                return os.path.join(directory, file_name)

    return None

# Find actual files
# Either value is None when no matching file exists; the loading code below
# branches on that.
embedding_file = find_embeddings_file()
summaries_file = find_summaries_file()

print("🔍 File Discovery Results:")
print(f"   Looking for embeddings with model: {selected_model}")
print(f"   Found embeddings: {embedding_file or 'NOT FOUND'}")
print(f"   Found summaries: {summaries_file or 'NOT FOUND'}")

# Load Embeddings
# Produces two module-level names used by the search and clustering cells:
#   embeddings_records -> list of record dicts (each has an 'embedding' vector)
#   embedding_matrix   -> float32 array with one row per record, or None when
#                         nothing usable was loaded
embeddings_records = []
embedding_matrix = None

if embedding_file:
    try:
        # JSON files are UTF-8; do not depend on the platform's locale encoding.
        with open(embedding_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Handle different file formats
        if isinstance(data, dict) and 'photo_embeddings' in data:
            embeddings_records = data['photo_embeddings']
        elif isinstance(data, list):
            embeddings_records = data
        else:
            raise ValueError("Invalid embeddings file format")

        print(f"\n✅ Successfully loaded {len(embeddings_records)} records from {embedding_file}")

        if embeddings_records:
            # Check embedding dimensions match selected model
            embedding_dim = len(embeddings_records[0]['embedding'])
            expected_dims = {
                'gemini': 768,  # Gemini embedding dimension
                'openai': 1536,  # OpenAI text-embedding-3-small dimension
                'claude': 1536   # Would use OpenAI fallback
            }

            if selected_model in expected_dims and embedding_dim != expected_dims[selected_model]:
                print(f"\n⚠️  Warning: Embedding dimensions ({embedding_dim}) don't match expected for {selected_model} ({expected_dims[selected_model]})")
                print(f"   This suggests embeddings were created with a different model.")
                print(f"   Search will still work, but consider regenerating embeddings with {selected_model} for optimal results.")

            # Build numpy matrix for similarity search
            embedding_matrix = np.array([record['embedding'] for record in embeddings_records]).astype('float32')
            if embedding_matrix.ndim != 2:
                raise ValueError("Embeddings must all be vectors of the same length")
            print(f"✅ Created embedding matrix with shape: {embedding_matrix.shape}")
            print(f"📊 Using numpy-based similarity search (efficient for {len(embeddings_records)} documents)")
        else:
            # Keep embedding_matrix as None so later `is None` guards work;
            # an empty list would otherwise become a useless shape-(0,) array.
            print("⚠️  The embeddings file contains no records.")

    except Exception as e:
        print(f"\n❌ ERROR loading embeddings file '{embedding_file}': {e}")
        embeddings_records = []
        embedding_matrix = None
else:
    print(f"\n❌ No embeddings file found!")
    print("\n📝 To fix this:")
    print("   1. Run the analysis notebook first to generate embeddings")
    print("   2. Make sure the embeddings file is in the current directory or 'output' folder")
    print("   3. The file should be named like 'tropy_embeddings.json' or 'tropy_embeddings_[model].json'")

# Load Item Summaries
# item_summaries stays an empty list when the file is missing or unreadable.
item_summaries = []

if summaries_file:
    try:
        # JSON files are UTF-8; do not depend on the platform's locale encoding
        # (summaries of archival material routinely contain non-ASCII text).
        with open(summaries_file, 'r', encoding='utf-8') as f:
            item_summaries = json.load(f)
        print(f"\n✅ Successfully loaded {len(item_summaries)} AI-generated item summaries.")
    except Exception as e:
        print(f"\n⚠️ Could not load summaries file: {e}")
        item_summaries = []
else:
    print(f"\n⚠️ No summaries file found. Metadata enhancement will be disabled.")

# Summary
# Final readiness report for this cell.
if embeddings_records:
    print(f"\n🎉 Ready for analysis with {len(embeddings_records)} embeddings!")
    # Heuristic based only on the file NAME containing 'gemini'; the file
    # contents are not inspected here.
    if 'gemini' in embedding_file.lower() and selected_model != 'gemini':
        print(f"\n💡 Note: You're using {selected_model} for queries but embeddings were created with Gemini.")
        print("   This works fine, but for consistency you might want to:")
        print(f"   - Either: Regenerate embeddings using {selected_model} in the analysis notebook")
        print("   - Or: Restart and select Gemini when running this notebook")
else:
    print("\n❌ Cannot proceed without embeddings. Please run the analysis notebook first!")

In [None]:
# Cell 6: Similarity Search Functions

def numpy_similarity_search(query_vector, embedding_matrix, top_k=10):
    """Rank the rows of *embedding_matrix* by cosine similarity to *query_vector*.

    Efficient for small to medium collections. For collections over 10k
    documents, consider ChromaDB or FAISS.

    Args:
        query_vector: 1-D array holding the query embedding.
        embedding_matrix: 2-D array with one document embedding per row.
        top_k: Maximum number of results to return.

    Returns:
        A ``(distances, indices)`` pair of arrays, best match first, where
        ``distance = 1 - cosine_similarity`` and ``indices`` are row numbers
        into *embedding_matrix*.
    """
    query_vector = np.asarray(query_vector)
    embedding_matrix = np.asarray(embedding_matrix)

    # A zero-length vector has no direction. Dividing by its norm would yield
    # NaN, and NaN sorts to the *top* of the reversed argsort below, so such
    # vectors are given similarity 0 instead.
    query_length = np.linalg.norm(query_vector)
    query_norm = query_vector / (query_length if query_length > 0 else 1)

    # Normalize all embeddings (for cosine similarity)
    norms = np.linalg.norm(embedding_matrix, axis=1)
    safe_norms = norms.copy()
    safe_norms[safe_norms == 0] = 1
    normalized_embeddings = embedding_matrix / safe_norms[:, np.newaxis]

    # Calculate similarities
    similarities = np.dot(normalized_embeddings, query_norm)

    # Get top k indices (descending similarity)
    top_indices = np.argsort(similarities)[::-1][:top_k]

    # Convert similarities to distances (for compatibility)
    distances = 1 - similarities[top_indices]

    return distances, top_indices

print("✅ Similarity search functions ready")

In [None]:
# Cell 7: Enhanced Search with Better Previews and Selective Tagging

def _prompt_float(prompt, default):
    """Ask for a number; return *default* on blank or non-numeric input."""
    raw = input(prompt).strip()
    if not raw:
        return default
    try:
        return float(raw)
    except ValueError:
        print(f"⚠️  '{raw}' is not a number; using {default}")
        return default


def _prompt_positive_int(prompt, default):
    """Ask for a whole number >= 1; return *default* on anything else."""
    raw = input(prompt).strip()
    if not raw:
        return default
    try:
        value = int(raw)
    except ValueError:
        value = 0
    if value < 1:
        print(f"⚠️  '{raw}' is not a positive whole number; using {default}")
        return default
    return value


def _parse_item_numbers(text):
    """Turn '1,3,5' into zero-based indices [0, 2, 4]; raise ValueError if malformed."""
    return [int(part.strip()) - 1 for part in text.split(',')]


def _best_summary(photos):
    """Return the summary text of the most similar photo in *photos*."""
    best_photo = max(photos, key=lambda x: x['similarity'])
    # `or` also covers records whose 'summary' key exists but holds None.
    return best_photo.get('summary') or 'No summary'


def search_and_tag_workflow():
    """Interactive search with longer previews and selective tagging.

    Steps, all driven by console prompts:
      1. Embed the user's query and rank every stored photo embedding.
      2. Keep results above a similarity threshold (adjustable), or the top
         10 when nothing passes it.
      3. Group matching photos by item and print a preview per item.
      4. Optionally print full summaries.
      5. Optionally tag all, selected, or high-relevance items via
         add_tag_to_items.

    Reads the module-level embedding_adapter, embedding_matrix and
    embeddings_records. Returns None.
    """
    # Without embeddings there is nothing to search; bail out before
    # prompting instead of crashing inside the similarity search.
    if embedding_matrix is None or not embeddings_records:
        print("\n❌ Cannot proceed without embeddings. Please run the analysis notebook first!")
        return

    query = input("\nEnter your search query: ").strip()
    if not query:
        return

    print(f"\n🔎 Searching for: '{query}'...")

    # Lower default threshold
    DEFAULT_THRESHOLD = 0.3

    # Generate query embedding
    query_embedding = embedding_adapter.generate_query_embedding(query)
    if query_embedding is None:
        print("❌ Failed to generate query embedding")
        return

    query_vector = np.array(query_embedding).astype('float32')

    # Search ALL documents; results come back sorted best-first.
    distances, indices = numpy_similarity_search(query_vector, embedding_matrix, top_k=len(embeddings_records))
    similarities = 1 - distances

    # Show top similarities for debugging
    print(f"\n📊 Top 5 similarity scores:")
    for rank, score in enumerate(similarities[:5], start=1):
        print(f"   {rank}. {score:.3f} ({score*100:.1f}%)")

    threshold = DEFAULT_THRESHOLD
    relevant_mask = similarities >= threshold
    num_results = int(np.sum(relevant_mask))

    # If too few results, show what we have
    if num_results == 0:
        print(f"\n⚠️  No results above {threshold:.0%} threshold.")
        print("📉 Showing top 10 results regardless of threshold:\n")

        # Just take top 10
        relevant_indices = indices[:10]
        relevant_similarities = similarities[:10]
        threshold = 0
    else:
        print(f"\n✅ Found {num_results} results above {threshold:.0%} threshold")

        # Extract initial results based on threshold
        relevant_indices = indices[relevant_mask]
        relevant_similarities = similarities[relevant_mask]

        # Ask if user wants to adjust threshold
        print("\n🎚️  Threshold Options:")
        print(f"   1. Continue with current results ({num_results} items)")
        print("   2. Show more results (lower threshold)")
        print("   3. Show fewer results (higher threshold)")
        print("   4. Show top N results regardless of threshold")

        adjust = input("\nYour choice (1-4) [1]: ").strip() or "1"

        if adjust in ("2", "3"):
            if adjust == "2":
                threshold = _prompt_float(f"Enter lower threshold (0.0-{threshold:.1f}) [0.2]: ", 0.2)
            else:
                threshold = _prompt_float(f"Enter higher threshold ({threshold:.1f}-1.0) [0.5]: ", 0.5)
            relevant_mask = similarities >= threshold
            relevant_indices = indices[relevant_mask]
            relevant_similarities = similarities[relevant_mask]
        elif adjust == "4":
            n = _prompt_positive_int("How many top results to show? [20]: ", 20)
            relevant_indices = indices[:n]
            relevant_similarities = similarities[:n]
            threshold = 0
        # Any other answer keeps the threshold-based selection made above.

    # Get results: copy each record so the shared records are not mutated.
    results = []
    for idx, sim in zip(relevant_indices, relevant_similarities):
        result = embeddings_records[idx].copy()
        result['similarity'] = float(sim)
        results.append(result)

    if not results:
        print("\n❌ No results to display")
        return

    # Group by items (insertion order follows the best-first result order)
    item_to_photos = {}
    for result in results:
        item_id = result.get('item_id')
        group = item_to_photos.setdefault(item_id, {
            'title': result.get('item_title', 'Untitled'),
            'photos': [],
            'max_similarity': 0
        })
        group['photos'].append(result)
        group['max_similarity'] = max(group['max_similarity'], result['similarity'])

    # Display results with longer previews
    print("\n" + "="*70)
    print("📊 SEARCH RESULTS")
    print("="*70)
    print(f"🔍 Query: '{query}'")
    print(f"📊 Threshold: {threshold:.0%} similarity")
    print(f"📷 Photos found: {len(results)}")
    print(f"📚 Unique items: {len(item_to_photos)}")
    print("="*70)

    # Store items in a list for selection by number
    items_list = list(item_to_photos.items())

    # Show items with longer previews
    print("\n📋 Items found:")
    for i, (item_id, data) in enumerate(items_list):
        # Color code by similarity
        sim = data['max_similarity']
        if sim >= 0.5:
            icon = "🟢"
        elif sim >= 0.3:
            icon = "🟡"
        else:
            icon = "🔴"

        print(f"\n{icon} {i+1}. {data['title']} (ID: {item_id})")
        print(f"      Best match: {sim:.1%} | Photos: {len(data['photos'])}")

        # Show longer preview (up to 400 chars)
        summary = _best_summary(data['photos'])
        preview_length = min(len(summary), 400)
        print(f"      Preview: \"{summary[:preview_length]}{'...' if len(summary) > preview_length else ''}\"")

    print("\n" + "-"*70)

    # View options
    print("\n📄 View Options:")
    print("   1. Continue to tagging")
    print("   2. Show full summaries for specific items")
    print("   3. Show all full summaries")
    print("   4. Cancel")

    view_choice = input("\nYour choice (1-4) [1]: ").strip() or "1"

    if view_choice == "2":
        item_nums = input("Enter item numbers to view (e.g., 1,3,5): ").strip()
        try:
            nums = _parse_item_numbers(item_nums)
        except ValueError:
            print("Invalid input")
        else:
            print("\n" + "="*70)
            for num in nums:
                if 0 <= num < len(items_list):
                    item_id, data = items_list[num]
                    print(f"\n📄 FULL DETAILS - Item {num+1}: {data['title']}")
                    print("="*70)
                    for j, photo in enumerate(data['photos']):
                        print(f"\nPhoto {j+1} (Similarity: {photo['similarity']:.1%}):")
                        print(f"Full summary: {photo.get('summary', 'No summary')}")
                        print("-"*50)
            print("\n" + "="*70)

    elif view_choice == "3":
        print("\n" + "="*70)
        print("📄 ALL FULL SUMMARIES")
        print("="*70)
        for i, (item_id, data) in enumerate(items_list[:10]):  # Limit to first 10
            print(f"\n{i+1}. {data['title']}")
            print(f"Full summary: {_best_summary(data['photos'])}")
            print("-"*70)
        if len(items_list) > 10:
            print(f"\n... showing first 10 of {len(items_list)} items")

    elif view_choice == "4":
        return

    # Tagging with selection options
    if len(items_list) == 0:
        print("No items to tag.")
        return

    print(f"\n🏷️  TAGGING OPTIONS")
    print(f"   Total items available: {len(items_list)}")
    print("\n   1. Tag ALL items")
    print("   2. Tag specific items by number")
    print("   3. Tag only high relevance items (≥50%)")
    print("   4. Skip tagging")

    tag_choice = input("\nYour choice (1-4) [1]: ").strip() or "1"

    if tag_choice == "4":
        return

    # Determine items to tag
    items_to_tag = []

    if tag_choice == "1":
        items_to_tag = [item_id for item_id, _ in items_list]
        print(f"\n✅ Will tag ALL {len(items_to_tag)} items")

    elif tag_choice == "2":
        # SELECT SPECIFIC ITEMS BY NUMBER
        item_nums = input("Enter item numbers to tag (e.g., 1,2,3,4): ").strip()
        try:
            selected_nums = _parse_item_numbers(item_nums)
        except ValueError as e:
            print(f"❌ Invalid input: {e}")
            return

        selected_titles = []
        for num in selected_nums:
            # Out-of-range numbers are ignored rather than treated as errors.
            if 0 <= num < len(items_list):
                item_id, data = items_list[num]
                items_to_tag.append(item_id)
                selected_titles.append(f"{num+1}. {data['title']}")

        print(f"\n✅ Will tag {len(items_to_tag)} selected items:")
        for title in selected_titles[:5]:
            print(f"   {title}")
        if len(selected_titles) > 5:
            print(f"   ... and {len(selected_titles) - 5} more")

    elif tag_choice == "3":
        items_to_tag = [item_id for item_id, data in items_list if data['max_similarity'] >= 0.5]
        print(f"\n✅ Will tag {len(items_to_tag)} high-relevance items")

    if not items_to_tag:
        print("❌ No items selected for tagging")
        return

    # Apply tag
    tag_name = input("\nEnter tag name: ").strip()
    if tag_name:
        add_tag_to_items(items_to_tag, tag_name)
    else:
        print("❌ No tag name provided")

print("✅ Enhanced search with selective tagging ready")

In [None]:
# Cell 8: API Client (New)
class TropyAPIClient:
    """Handles Tropy API interactions.

    Every method is best-effort: failures are logged through the module
    logger and reported through the return value instead of raising.
    """

    # HTTP status codes treated as success for write operations.
    _SUCCESS_CODES = (200, 201, 204)

    def __init__(self, config, timeout=10):
        """Create a client.

        Args:
            config: Object exposing TROPY_PROJECT_API (base URL of the
                project endpoint).
            timeout: Seconds to wait for each HTTP request. Without a
                timeout, requests waits indefinitely on an unresponsive
                server.
        """
        self.config = config
        self.timeout = timeout
        self.session = requests.Session()

    def fetch_all_tags(self):
        """Return the decoded JSON tag list from Tropy, or [] on any failure."""
        try:
            response = self.session.get(
                f"{self.config.TROPY_PROJECT_API}/tags",
                timeout=self.timeout,
            )
            response.raise_for_status()
            return response.json()
        except Exception as e:
            logger.error(f"Failed to fetch tags: {e}")
            return []

    def create_tag(self, tag_name, color="#4A90E2"):
        """Create a tag and return the 'id' field of the response, or None on failure."""
        try:
            payload = {'@type': 'Tag', 'title': tag_name, 'color': color}
            response = self.session.post(
                f"{self.config.TROPY_PROJECT_API}/tags",
                json=payload,
                timeout=self.timeout,
            )
            response.raise_for_status()
            return response.json().get('id')
        except Exception as e:
            logger.error(f"Failed to create tag: {e}")
            return None

    def add_tag_to_item(self, item_id, tag_id):
        """Attach *tag_id* to *item_id*; return True when Tropy answers 200/201/204."""
        try:
            response = self.session.post(
                f"{self.config.TROPY_PROJECT_API}/items/{item_id}/tags",
                json={"tag": tag_id},
                timeout=self.timeout,
            )
            return response.status_code in self._SUCCESS_CODES
        except Exception as e:
            logger.error(f"Failed to add tag to item {item_id}: {e}")
            return False

    def update_item_title(self, item_id, new_title):
        """Set the Dublin Core title of *item_id*; return True on 200/201/204."""
        try:
            payload = {"http://purl.org/dc/elements/1.1/title": new_title}
            response = self.session.post(
                f"{self.config.TROPY_PROJECT_API}/items/{item_id}/data",
                json=payload,
                timeout=self.timeout,
            )
            return response.status_code in self._SUCCESS_CODES
        except Exception as e:
            logger.error(f"Failed to update item title: {e}")
            return False

# Initialize API client
# Module-level client; the tagging function reuses its requests session.
api_client = TropyAPIClient(config)
print("✅ API client initialized")

In [None]:
# Cell 9: Fixed Tagging Function

def _tropy_candidate_urls(suffix):
    """Return the URLs to try for *suffix*, in priority order, without duplicates.

    With the default configuration the first and third candidates are the
    same address, so de-duplicating avoids repeating identical requests.
    """
    candidates = [
        f"{config.TROPY_PROJECT_API}{suffix}",
        f"{config.TROPY_API_URL}{suffix}",
        f"{config.TROPY_API_URL}/project{suffix}",
    ]
    return list(dict.fromkeys(candidates))


def _find_existing_tag_id(tag_name):
    """Return the id of the first existing tag named *tag_name*, or None."""
    response = api_client.session.get(f"{config.TROPY_PROJECT_API}/tags")

    if response.status_code == 404:
        print("⚠️  Tags endpoint not found. Trying alternative endpoints...")
        # Try without /project
        response = api_client.session.get(f"{config.TROPY_API_URL}/tags")

    existing_tags = response.json() if response.status_code == 200 else []

    for tag in existing_tags:
        # Check multiple possible field names
        tag_title = tag.get('title') or tag.get('name') or tag.get('value', '')
        if tag_title == tag_name:
            tag_id = tag.get('id')
            print(f"✅ Found existing tag '{tag_name}' (ID: {tag_id})")
            return tag_id
    return None


def _create_tag(tag_name, color):
    """Create the tag, probing endpoint/payload variants; return its id or None."""
    endpoints = _tropy_candidate_urls("/tags")
    json_payloads = [
        {"name": tag_name, "color": color},
        {"title": tag_name, "color": color},
        {"value": tag_name, "color": color},
        {"tag": {"name": tag_name, "color": color}},
    ]

    for endpoint in endpoints:
        for payload in json_payloads:
            try:
                response = api_client.session.post(
                    endpoint,
                    json=payload,
                    headers={'Content-Type': 'application/json'}
                )
                if response.status_code in (200, 201):
                    result = response.json()
                    tag_id = result.get('id') or result.get('tag', {}).get('id')
                    if tag_id:
                        print(f"✅ Successfully created tag with ID: {tag_id}")
                        return tag_id
            except Exception as e:
                # Best-effort probing: note the failure and try the next variant.
                logger.debug(f"Tag creation attempt failed at {endpoint}: {e}")

    # Last resort - try form data
    for endpoint in endpoints:
        try:
            response = api_client.session.post(
                endpoint,
                data={'name': tag_name, 'color': color}
            )
            if response.status_code in (200, 201):
                tag_id = response.json().get('id')
                if tag_id:
                    print(f"✅ Created tag using form data")
                    return tag_id
        except Exception as e:
            logger.debug(f"Form-data tag creation failed at {endpoint}: {e}")

    return None


def _apply_tag_to_item(item_id, tag_id):
    """Try each endpoint and payload style; return True once one is accepted."""
    request_variants = (
        {"json": {"id": tag_id}},    # Try with just id
        {"json": {"tag": tag_id}},   # Try with "tag" wrapper
        {"data": {"tag": tag_id}},   # Try form data
    )
    for endpoint in _tropy_candidate_urls(f"/items/{item_id}/tags"):
        for request_kwargs in request_variants:
            try:
                response = api_client.session.post(endpoint, **request_kwargs)
            except Exception as e:
                # `except Exception` (not a bare except) so Ctrl-C can still
                # interrupt a long tagging run.
                logger.debug(f"Tagging attempt failed at {endpoint}: {e}")
                continue
            if response.status_code in (200, 201, 204):
                return True
    return False


def add_tag_to_items(item_ids, tag_name, color="#4A90E2"):
    """Apply the tag *tag_name* to every item in *item_ids* through the Tropy API.

    An existing tag with that name is reused; otherwise a new one is
    created with *color*. Because the exact API shape is uncertain, several
    endpoint and payload variants are tried for each request.

    Args:
        item_ids: Iterable of Tropy item ids; duplicates are ignored.
        tag_name: Name of the tag to apply.
        color: Color for a newly created tag.

    Returns:
        None. Progress and results are printed.
    """
    if not item_ids:
        return

    # dict.fromkeys drops duplicates while keeping the caller's order.
    unique_ids = list(dict.fromkeys(item_ids))
    print(f"\n🏷️ Preparing to apply tag '{tag_name}' to {len(unique_ids)} unique items...")

    try:
        tag_id = _find_existing_tag_id(tag_name)

        # Create tag if needed
        if not tag_id:
            print(f"Creating new tag '{tag_name}'...")
            tag_id = _create_tag(tag_name, color)

        if not tag_id:
            print(f"❌ Failed to create tag. Please create '{tag_name}' manually in Tropy.")
            return

        # Apply tag to items
        failed_items = [
            item_id
            for item_id in tqdm(unique_ids, desc="Tagging Items")
            if not _apply_tag_to_item(item_id, tag_id)
        ]
        success_count = len(unique_ids) - len(failed_items)

        print(f"\n✅ Tagging complete!")
        print(f"   Successfully tagged: {success_count} items")
        if failed_items:
            print(f"   Failed to tag: {len(failed_items)} items")
            print(f"   Failed IDs: {failed_items[:5]}{'...' if len(failed_items) > 5 else ''}")

    except Exception as e:
        print(f"❌ Error during tagging: {e}")
        print("\n💡 Troubleshooting tips:")
        print("   1. Ensure Tropy is running with REST API enabled")
        print("   2. Check if port 2019 is correct")
        print("   3. Try creating the tag manually in Tropy first")

print("✅ Fixed tagging function ready")

In [None]:
# Cell 10: Intelligent Theme Discovery (Replace the original)

def _categorize_theme_terms(feature_names, scores, boring_words, limit=5):
    """Split a cluster's top TF-IDF terms into entities, concepts and other terms.

    Terms are visited from the highest to the lowest summed TF-IDF score.
    Any term containing one of ``boring_words`` as a substring is dropped.
    Scanning stops as soon as ``limit`` terms have been kept.

    Args:
        feature_names: Vocabulary returned by the vectorizer, indexable by
            column position.
        scores: 1-D NumPy array holding one summed TF-IDF score per column.
        boring_words: Substrings that disqualify a term.
        limit: Maximum number of terms kept across all three lists.

    Returns:
        A ``(entities, concepts, other_terms)`` tuple of lists. Entities are
        title-cased; the other two lists hold the lower-cased term.
    """
    role_words = ('cardinal', 'bishop', 'pope', 'priest', 'sister', 'father', 'monsignor')
    genre_words = ('case', 'letter', 'request', 'petition', 'report', 'investigation')

    entities, concepts, other_terms = [], [], []
    for idx in scores.argsort()[::-1]:
        term = feature_names[idx].lower()

        if any(boring in term for boring in boring_words):
            continue

        if any(word in term for word in role_words):
            entities.append(term.title())
        elif any(word in term for word in genre_words) or len(term.split()) > 1:
            # Multi-word phrases are treated as concepts as well.
            # NOTE: terms are lower-cased above, so capitalisation cannot be
            # used here to recognise names or places.
            concepts.append(term)
        else:
            other_terms.append(term)

        if len(entities) + len(concepts) + len(other_terms) >= limit:
            break

    return entities, concepts, other_terms


def _summary_year_range(texts, sample_size=10):
    """Return a ``" (first-last)"`` year suffix found in the first summaries.

    Only four-digit years from 1900 to 2099 are recognised, and only the
    first ``sample_size`` texts are scanned. Returns ``""`` when no year is
    found and ``" (year)"`` when exactly one distinct year is found.
    """
    import re  # local import: this chunk of the file does not show `re` imported

    years = []
    for text in texts[:sample_size]:
        years.extend(re.findall(r'\b(19\d{2}|20\d{2})\b', text))

    if not years:
        return ""
    if len(set(years)) > 1:
        # Four-digit strings compare the same way as the numbers they spell.
        return f" ({min(years)}-{max(years)})"
    return f" ({years[0]})"


def _print_theme_report(themes):
    """Print every discovered theme with up to three representative excerpts."""
    print(f"\n📊 Discovered {len(themes)} Historical Themes:\n")
    print("=" * 100)

    for theme in themes:
        print(f"\n🏷️  THEME {theme['id']}: {theme['name']}")
        print(f"   📚 Scope: {theme['photo_count']} photos across {theme['item_count']} items")

        if theme['key_entities']:
            print(f"   👥 Key People/Places: {', '.join(theme['key_entities'])}")
        if theme['key_concepts']:
            print(f"   📋 Key Topics: {', '.join(theme['key_concepts'])}")

        # Entities are stored title-cased, so lower-case the key terms before
        # searching the lower-cased sentence; otherwise they never match.
        key_terms = [t.lower() for t in theme['key_entities'] + theme['key_concepts']]

        print(f"\n   📖 Representative Documents:")
        for j, item in enumerate(theme['summaries'][:3], 1):
            print(f"\n   [{j}] Item: {item['item_title']}")
            summary = item['summary']
            # Prefer the first sentence mentioning a key term; otherwise show
            # the start of the whole summary.
            excerpt = next(
                (s for s in summary.split('. ') if any(t in s.lower() for t in key_terms)),
                summary,
            )
            print(f'       "{excerpt[:200]}..."')

        print("\n" + "-" * 100)


def _prompt_theme_selection(themes):
    """Ask which themes to tag; return the chosen themes or ``None`` to skip.

    Accepts a comma-separated list of theme numbers, ``all``, or
    ``details X`` (prints the stored summaries of theme X and asks again).
    An empty answer or an unparsable list returns ``None``.
    """
    print("\n🎯 Select themes to tag (more meaningful tags will be created):")
    print("   • Enter theme numbers (e.g., 1,3,5)")
    print("   • Type 'all' for all themes")
    print("   • Type 'details X' to see more about theme X")
    print("   • Press Enter to skip")

    while True:
        choice = input("\nYour selection: ").strip().lower()

        if choice == '':
            print("Skipping tagging.")
            return None

        if not choice.startswith('details '):
            break

        try:
            theme_num = int(choice.split()[1])
        except (ValueError, IndexError):
            print("Use format: details 3")
            continue

        theme = next((t for t in themes if t['id'] == theme_num), None)
        if theme is None:
            print("Theme not found.")
            continue

        print(f"\n📚 Full details for Theme {theme_num}:")
        for j, item in enumerate(theme['summaries'], 1):
            print(f"\n[{j}] {item['item_title']}")
            print(f"    {item['summary']}\n")

    if choice == 'all':
        return themes

    try:
        wanted_ids = {int(part.strip()) for part in choice.split(',')}
    except ValueError:
        print("❌ Invalid selection.")
        return None

    return [t for t in themes if t['id'] in wanted_ids]


def _theme_tag_name(theme):
    """Build the tag label for a theme from its top concept, entity and dates."""
    concepts = theme['key_concepts']
    entities = theme['key_entities']
    if entities and concepts:
        return f"{concepts[0].title()}: {entities[0]}{theme['date_range']}"
    if concepts:
        return f"{concepts[0].title()}{theme['date_range']}"
    return f"Theme: {theme['name']}"


def discover_semantic_themes_intelligent(n_clusters):
    """
    Discover meaningful historical themes, not just document types.

    Clusters the module-level ``embedding_matrix`` with K-Means, names each
    cluster from the TF-IDF terms of the summaries stored in
    ``embeddings_records``, prints a report, and then interactively offers to
    tag the items of selected themes via ``add_tag_to_items_fixed``.

    Args:
        n_clusters: Number of K-Means clusters (themes) to request.

    Returns:
        None. All results are printed or applied as tags.
    """
    print(f"\n🧠 Discovering {n_clusters} meaningful historical themes...")

    if embedding_matrix is None:
        print("❌ No embeddings loaded")
        return

    # Run clustering. scikit-learn raises ValueError for unusable settings
    # (for example more clusters than samples); report it here so callers do
    # not mistake it for a number-parsing problem.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
    try:
        cluster_labels = kmeans.fit_predict(embedding_matrix)
    except ValueError as e:
        print(f"❌ Clustering failed: {e}")
        return

    # Words to exclude from theme names (too generic)
    boring_words = {
        'archival', 'document', 'page', 'visible', 'text',
        'official', 'formal', 'written',
        'manuscript', 'handwritten', 'typed', 'recorded', 'based',
        'sheet', 'folio', 'recto', 'verso', 'stamp', 'seal'
    }

    themes = []

    for label in range(n_clusters):
        cluster_indices = np.where(cluster_labels == label)[0]
        if len(cluster_indices) == 0:
            continue

        # Collect the summaries of the records that fell into this cluster.
        cluster_summaries = []
        for idx in cluster_indices:
            if idx < len(embeddings_records) and embeddings_records[idx].get('summary'):
                record = embeddings_records[idx]
                cluster_summaries.append({
                    'summary': record['summary'],
                    'item_id': record['item_id'],
                    'item_title': record.get('item_title', 'Untitled')
                })

        if not cluster_summaries:
            continue

        # Best effort per cluster: a failure here skips only this cluster.
        try:
            texts = [item['summary'] for item in cluster_summaries]

            vectorizer = TfidfVectorizer(
                max_features=20,  # Get more terms to work with
                stop_words='english',
                ngram_range=(1, 3),  # Include phrases up to 3 words
                # Require a term in 2+ summaries only when the cluster is
                # large enough for that to be meaningful.
                min_df=2 if len(texts) > 5 else 1
            )

            tfidf_matrix = vectorizer.fit_transform(texts)
            feature_names = vectorizer.get_feature_names_out()
            scores = tfidf_matrix.sum(axis=0).A1

            entities, concepts, other_terms = _categorize_theme_terms(
                feature_names, scores, boring_words
            )

            # Build the theme name, prioritising entities and concepts.
            theme_parts = entities[:2] + concepts[:2]
            if len(theme_parts) < 3:
                theme_parts.extend(other_terms[:3 - len(theme_parts)])

            if theme_parts:
                theme_name = " + ".join(theme_parts[:3])
            else:
                # Every term was generic; fall back to a non-empty label so
                # the report and any resulting tag are still identifiable.
                theme_name = f"Cluster {label + 1}"

            date_range = _summary_year_range(texts)

            # Get unique item IDs
            item_ids = list({item['item_id'] for item in cluster_summaries})

            themes.append({
                'id': len(themes) + 1,
                'name': theme_name + date_range,
                'photo_count': len(cluster_indices),
                'item_count': len(item_ids),
                'item_ids': item_ids,
                'summaries': cluster_summaries[:5],  # Keep full summary objects
                'key_entities': entities[:3],
                'key_concepts': concepts[:3],
                'date_range': date_range
            })

        except Exception as e:
            print(f"Error processing cluster {label}: {e}")
            continue

    if not themes:
        print("❌ No themes could be extracted.")
        return

    _print_theme_report(themes)

    selected_themes = _prompt_theme_selection(themes)
    if selected_themes is None:
        return
    if not selected_themes:
        print("❌ No themes match that selection.")
        return

    # Apply meaningful tags
    print(f"\n🏷️  Creating {len(selected_themes)} meaningful tags...")

    for theme in selected_themes:
        tag_name = _theme_tag_name(theme)
        print(f"\n   Creating tag: {tag_name}")
        add_tag_to_items_fixed(theme['item_ids'], tag_name)

print("✅ Intelligent clustering ready!")

In [None]:
# Cell 11: Fixed Metadata Enhancement Function (Corrected Version)

def enhance_item_titles():
    """Write each AI-suggested title into the item's dcterms:alternative field.

    Reads the module-level ``item_summaries`` list, asks for confirmation,
    checks that the Tropy items endpoint answers with HTTP 200, and then
    POSTs one update per summary that carries a ``suggested_title``.
    Summaries without a suggestion are counted as skipped. A short tally is
    printed at the end. Returns None.
    """
    if not item_summaries:
        print("❌ No item summaries loaded to enhance titles.")
        return

    print(f"✍️ Preparing to update machine-generated titles for {len(item_summaries)} items...")
    print("   This will populate the 'dcterms:alternative' field (Machine-generated title)")

    answer = input("\nContinue? (y/n): ").lower()
    if answer != 'y':
        print("Cancelled.")
        return

    # Probe the API once before attempting any updates.
    try:
        probe = api_client.session.get(f"{config.TROPY_PROJECT_API}/items")
        if probe.status_code != 200:
            print(f"❌ Cannot connect to Tropy API. Status: {probe.status_code}")
            return
    except Exception as e:
        print(f"❌ API connection failed: {e}")
        return

    updated = 0
    failed = 0
    skipped = 0

    for record in tqdm(item_summaries, desc="Updating Alternative Titles"):
        title = record.get('suggested_title')
        if not title:
            skipped += 1
            continue

        item_id = record['item_id']

        try:
            # Update via the /project/data/{id} endpoint.
            reply = api_client.session.post(
                f"{config.TROPY_API_URL}/project/data/{item_id}",
                json={"http://purl.org/dc/terms/alternative": title}
            )

            if reply.status_code in (200, 201, 204):
                updated += 1
                continue

            # Report the failure details to help debugging.
            if reply.status_code == 404:
                print(f"  ⚠️  Item {item_id} not found in project")
            else:
                print(f"  ❌ Failed for item {item_id}: Status {reply.status_code}")
                if reply.text:
                    print(f"     Response: {reply.text[:200]}")
            failed += 1

        except Exception as e:
            logger.error(f"Failed to update item {item_id}: {e}")
            failed += 1

    print(f"\n✅ Title enhancement complete!")
    print(f"   Successfully updated: {updated} items")
    print(f"   Skipped (no suggestion): {skipped} items")
    if failed > 0:
        print(f"   Failed: {failed} items")

    print(f"\n💡 Check your Tropy items - the 'Machine-generated title' field should now be populated!")

print("✅ Metadata enhancement function ready")

# Improved function to check item fields and validate structure
def check_item_fields():
    """Print the metadata fields of the first project item as an API check.

    Fetches the item list, then the data record of the first item, prints
    each field (values truncated to 50 characters) and reports whether the
    dcterms:alternative field is present. Any exception is caught and
    printed. Returns None.
    """
    print("\n🔍 Checking available fields in Tropy items...")

    alt_field = 'http://purl.org/dc/terms/alternative'

    try:
        listing = api_client.session.get(f"{config.TROPY_PROJECT_API}/items")
        if listing.status_code != 200:
            print(f"❌ Failed to get items. Status: {listing.status_code}")
            print(f"Response: {listing.text}")
            return

        items = listing.json()
        if not items:
            print("❌ No items found in the project")
            return

        first_item_id = items[0].get('id')
        print(f"📋 Checking structure of item ID: {first_item_id}")

        # Read the item's current metadata through the data endpoint.
        meta_reply = api_client.session.get(f"{config.TROPY_API_URL}/project/data/{first_item_id}")
        if meta_reply.status_code != 200:
            print(f"⚠️  Could not get metadata. Status: {meta_reply.status_code}")
            print("   This is normal if the item has no metadata yet.")
            return

        current_data = meta_reply.json()
        print(f"\n📋 Current metadata for item {first_item_id}:")

        if not current_data:
            print("   No metadata found for this item")
            return

        for field, value in current_data.items():
            rendered = str(value)
            if len(rendered) > 50:
                rendered = rendered[:50] + "..."
            print(f"   - {field}: {rendered}")

        if alt_field in current_data:
            print("\n✅ Found dcterms:alternative field!")
        else:
            print("\n⚠️  dcterms:alternative field not found in current metadata.")
            print("   This might be normal if no alternative title has been set yet.")

    except Exception as e:
        print(f"❌ Error checking fields: {e}")

# Additional helper function to test a single item update
def test_single_item_update():
    """Try the title update on one item and print every step for debugging.

    Picks the first entry of the module-level ``item_summaries`` that has a
    ``suggested_title``, prints the item's current metadata field names,
    POSTs the alternative title, and on success re-reads the item to confirm
    the field is present. Any exception is caught and printed. Returns None.
    """
    if not item_summaries:
        print("❌ No item summaries loaded.")
        return

    # First summary that actually carries a suggestion, if any.
    candidate = next((s for s in item_summaries if s.get('suggested_title')), None)
    if not candidate:
        print("❌ No items with suggested titles found.")
        return

    item_id = candidate['item_id']
    suggested_title = candidate['suggested_title']
    alt_field = "http://purl.org/dc/terms/alternative"

    print(f"🧪 Testing update for item {item_id}")
    print(f"   Suggested title: '{suggested_title}'")

    payload = {alt_field: suggested_title}

    try:
        endpoint = f"{config.TROPY_API_URL}/project/data/{item_id}"

        # Show what the item holds before the update.
        before = api_client.session.get(endpoint)
        print(f"\n   Current data status: {before.status_code}")

        if before.status_code == 200:
            existing = before.json()
            if existing:
                print("   Current metadata fields:")
                for field in existing.keys():
                    print(f"     - {field}")

        print(f"\n   Attempting update to endpoint: {endpoint}")
        print(f"   Payload: {payload}")

        response = api_client.session.post(
            endpoint,
            json=payload,
            headers={'Content-Type': 'application/json'}
        )

        print(f"\n   Update response status: {response.status_code}")
        if response.text:
            print(f"   Update response text: {response.text[:200]}")

        if response.status_code not in (200, 201, 204):
            print(f"\n❌ Test update failed!")
            print("   Troubleshooting tips:")
            print("   1. Check if the item ID exists in Tropy")
            print("   2. Ensure Tropy is running with REST API enabled")
            print("   3. Verify the dcterms:alternative field is in your template")
            return

        print("\n✅ Test update successful!")

        # Read the item back to confirm the field was stored.
        after = api_client.session.get(endpoint)
        if after.status_code == 200:
            updated_data = after.json()
            if alt_field in updated_data:
                print(f"   ✅ Confirmed: Alternative title is now: '{updated_data[alt_field]}'")
            else:
                print("   ⚠️  Alternative title field not found after update")

    except Exception as e:
        print(f"❌ Test failed with exception: {e}")

# Batch update with better error handling
def batch_update_titles(batch_size=10):
    """Update titles in batches with detailed error reporting.

    POSTs the ``suggested_title`` of every entry in the module-level
    ``item_summaries`` to the item's dcterms:alternative field, pausing for
    half a second between batches, and prints a per-item result plus a final
    summary with details of up to five failures.

    Args:
        batch_size: Number of items per batch; must be at least 1.

    Returns:
        None. Progress and results are printed.
    """
    if not item_summaries:
        print("❌ No item summaries loaded.")
        return

    items_with_titles = [s for s in item_summaries if s.get('suggested_title')]

    if not items_with_titles:
        print("❌ No items with suggested titles found.")
        return

    # A zero batch size would divide by zero below, and a negative one would
    # yield no batches at all while still reporting a "finished" run.
    if batch_size < 1:
        print(f"❌ batch_size must be at least 1 (got {batch_size}).")
        return

    print(f"📊 Found {len(items_with_titles)} items with suggested titles")

    # Ceiling division: the last batch may be smaller than batch_size.
    total_batches = (len(items_with_titles) + batch_size - 1) // batch_size

    success_count = 0
    failed_items = []

    for batch_num in range(total_batches):
        start_idx = batch_num * batch_size
        batch = items_with_titles[start_idx:start_idx + batch_size]

        print(f"\n📦 Processing batch {batch_num + 1}/{total_batches} ({len(batch)} items)...")

        for item in batch:
            item_id = item['item_id']
            payload = {
                "http://purl.org/dc/terms/alternative": item['suggested_title']
            }

            try:
                response = api_client.session.post(
                    f"{config.TROPY_API_URL}/project/data/{item_id}",
                    json=payload
                )

                if response.status_code in (200, 201, 204):
                    success_count += 1
                    print(f"   ✅ {item_id}")
                else:
                    failed_items.append({
                        'id': item_id,
                        'status': response.status_code,
                        'error': response.text[:100] if response.text else 'No error message'
                    })
                    print(f"   ❌ {item_id} (Status: {response.status_code})")

            except Exception as e:
                # No HTTP status is available when the request itself failed.
                failed_items.append({
                    'id': item_id,
                    'error': str(e)
                })
                print(f"   ❌ {item_id} (Exception: {str(e)[:50]}...)")

        # Small delay between batches (not after the last one)
        if batch_num < total_batches - 1:
            time.sleep(0.5)

    print(f"\n📊 Final Results:")
    print(f"   ✅ Success: {success_count}/{len(items_with_titles)}")
    print(f"   ❌ Failed: {len(failed_items)}")

    if failed_items:
        print("\n❌ Failed items details:")
        for item in failed_items[:5]:  # Show first 5
            print(f"   - ID: {item['id']}")
            if 'status' in item:
                print(f"     Status: {item['status']}")
            print(f"     Error: {item.get('error', 'Unknown error')}")

        if len(failed_items) > 5:
            print(f"   ... and {len(failed_items) - 5} more")

# Announce that the metadata helpers are defined and list how to call them.
print("\n".join([
    "✅ Enhanced metadata functions ready",
    "\n💡 Available functions:",
    "   check_item_fields()      - Verify API connectivity and field availability",
    "   test_single_item_update() - Test with one item first",
    "   enhance_item_titles()     - Update all titles",
    "   batch_update_titles(20)   - Update in batches with size control",
]))

In [None]:
# Cell 12: Interactive Main Menu (Complete version)
def main_menu():
    """Run the interactive workbench menu until the user chooses to exit.

    Options: 1 runs ``search_and_tag_workflow``, 2 asks for a cluster count
    and runs theme discovery, 3 runs ``enhance_item_titles``, 4 exits. Any
    other input prints an error and shows the menu again.
    """
    while True:
        print("\n" + "="*50)
        print("   TROPY AI ANALYSIS & TAGGING WORKBENCH")
        print(f"   Model: {selected_model.upper()}")
        print("="*50)
        print("1. 🔎 Search & Tag (Ask a question)")
        print("2. 🧠 Discover Semantic Themes (Clustering)")
        print("3. ✍️ Enhance Metadata with AI Titles")
        print("4. 🚪 Exit")

        choice = input("\nPlease select an option (1-4): ").strip()

        if choice == '1':
            search_and_tag_workflow()

        elif choice == '2':
            raw_count = input("How many themes to discover? (e.g., 8-15 is a good start): ").strip()

            # Only the number parsing is guarded here, so that errors raised
            # later by the clustering are not reported as a bad number.
            try:
                num_clusters = int(raw_count)
            except ValueError:
                print("Invalid number. Please enter an integer.")
                continue

            if num_clusters <= 1:
                print("Please enter a number greater than 1.")
                continue

            try:
                # Use the newer implementation when its cell has been run;
                # otherwise fall back to the original function.
                if 'discover_semantic_themes_intelligent' in globals():
                    discover_semantic_themes_intelligent(num_clusters)
                else:
                    discover_semantic_themes(num_clusters)
            except ValueError as e:
                # Keep the menu alive, but show the real cause.
                print(f"❌ Theme discovery failed: {e}")

        elif choice == '3':
            enhance_item_titles()

        elif choice == '4':
            print("Exiting. Goodbye!")
            break
        else:
            print("Invalid choice. Please try again.")

# Display collection info
# Summarise the loaded collection, then hand control to the interactive menu.
if embeddings_records:
    print(f"\n📊 Collection size: {len(embeddings_records)} photos")
    print("✨ Ready for analysis!")

    # Several photo records can belong to the same Tropy item.
    unique_items = len({r.get('item_id') for r in embeddings_records})
    print(f"📚 Unique items: {unique_items}")

    # Count the photo records that carry a non-empty summary.
    summaries_count = len([r for r in embeddings_records if r.get('summary')])
    print(f"📝 Photos with summaries: {summaries_count}/{len(embeddings_records)}")

# Start the menu loop (blocks until the user exits).
main_menu()