# AIC Video Retrieval System - Search & Evaluation

This notebook provides an interactive interface for searching the video index and evaluating results.
It works with the index built in the previous notebook and provides various search modes.

## Features
- 🔍 Text-based similarity search
- 🖼️ Image-based similarity search  
- 🔀 Hybrid search combining multiple modes
- 📊 Interactive result visualization
- 📈 Performance evaluation metrics
- 💾 Export search results

In [None]:
# Import and setup
import os
import sys
import json
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML, Image as IPImage, clear_output
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
import time

# Set up paths (assuming setup notebook was run)
REPO_NAME = "AIC_FTML_dev"
_colab_repo = Path(f"/content/{REPO_NAME}")
if _colab_repo.exists():
    REPO_DIR = _colab_repo
else:
    # Look for the repo root among the current directory and its ancestors.
    # If none of them is named REPO_NAME, stay where we are rather than
    # drifting up to the filesystem root (which would chdir to "/" and put
    # "/" and "/src" on sys.path).
    _start_dir = Path.cwd()
    REPO_DIR = next(
        (p for p in (_start_dir, *_start_dir.parents) if p.name == REPO_NAME),
        _start_dir,
    )
    if REPO_DIR.name != REPO_NAME:
        print(f"⚠️ Could not locate '{REPO_NAME}' above {_start_dir}; using the current directory")

# Make the repo root the working directory and make both the root and its
# src/ folder importable (src/ ends up first on sys.path).
os.chdir(REPO_DIR)
sys.path.insert(0, str(REPO_DIR))
sys.path.insert(0, str(REPO_DIR / "src"))

print(f"Working from: {REPO_DIR}")

# Import project modules
import config
from src.models.clip_encoder import CLIPEncoder
from src.indexing.vector_index import VectorIndex
from src.pipeline.query_pipeline import QueryProcessingPipeline

## Step 1: Load Index and Initialize Search

In [None]:
# Initialize search system
print("=== Search System Initialization ===")

import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Define these up front: later cells test `if query_pipeline:` and would
# raise NameError if the index is missing and the names were never bound.
query_pipeline = None
metadata_df = None

# Check if index exists
ARTIFACT_DIR = Path(config.ARTIFACT_DIR)
index_file = ARTIFACT_DIR / "vector_index.faiss"
metadata_file = ARTIFACT_DIR / "index_metadata.parquet"

if not index_file.exists() or not metadata_file.exists():
    print("❌ Index not found. Please run the data processing notebook first.")
    print("Looking for:")
    print(f"  Index: {index_file}")
    print(f"  Metadata: {metadata_file}")
else:
    print("✅ Index files found")

    # Initialize query pipeline
    try:
        query_pipeline = QueryProcessingPipeline(
            artifact_dir=ARTIFACT_DIR,
            model_name=config.MODEL_NAME,
            device=device,
            enable_reranking=False  # Start without reranking
        )
        print("✅ Query pipeline initialized")

        # Load metadata for display
        metadata_df = pd.read_parquet(metadata_file)
        print(f"✅ Loaded metadata for {len(metadata_df)} frames")
        print(f"   Videos: {metadata_df['video_id'].nunique()}")

    except Exception as e:
        # Notebook boundary: report the problem and leave the pipeline
        # disabled so the remaining cells degrade gracefully.
        print(f"Error initializing pipeline: {e}")
        print("Falling back to basic search...")
        query_pipeline = None
        metadata_df = None

## Step 2: Basic Search Interface

In [None]:
# Simple search function
def perform_search(query, search_mode="hybrid", k=20, expand_query=False):
    """Run a single query through the global ``query_pipeline``.

    Returns a ``(results, status)`` pair. ``results`` is whatever the
    pipeline's ``search`` call returned, or an empty list when the pipeline
    is unavailable or the search raised. ``status`` is a human-readable line
    with either the hit count and elapsed time or the reason for failure.
    """
    if query_pipeline is None:
        return [], "Query pipeline not available"

    search_kwargs = dict(
        query=query,
        search_mode=search_mode,
        k=k,
        expand_query=expand_query,
    )
    began = time.time()
    try:
        hits = query_pipeline.search(**search_kwargs)
        elapsed = time.time() - began
        message = f"Found {len(hits)} results in {elapsed:.3f}s"
    except Exception as e:
        # Any failure is reported through the status string, not raised.
        return [], f"Search error: {e}"
    return hits, message

def display_results(results, query, max_display=10):
    """Show search results as a table, plus thumbnails for the top hits.

    Args:
        results: Sequence of result objects exposing ``video_id``,
            ``frame_idx``, ``score`` and a dict-like ``metadata``.
        query: The query text, used only for the heading.
        max_display: Maximum number of rows in the results table.

    Prints "No results found" and returns early when ``results`` is empty.
    Thumbnails are attempted for the first five results, and only when a
    ``./keyframes`` directory exists under the working directory.
    """
    if not results:
        print("No results found")
        return

    print(f"\n🔍 Search Results for: '{query}'")
    print("=" * 60)

    # Create results dataframe
    results_df = pd.DataFrame([
        {
            'Rank': rank,
            'Video ID': result.video_id,
            'Frame': result.frame_idx,
            'Score': f"{result.score:.4f}",
            'Search Type': result.metadata.get('search_type', 'unknown'),
        }
        for rank, result in enumerate(results[:max_display], start=1)
    ])
    display(results_df)

    # Try to display images if frame paths exist
    keyframes_dir = Path("./keyframes")
    if not keyframes_dir.exists():
        return

    print("\n📸 Sample Result Images:")
    images_shown = 0

    # Track the real rank: a result without an image must not shift the
    # rank labels of the results that follow it.
    for rank, result in enumerate(results[:5], start=1):  # Show first 5 images
        # Candidate file layouts for the frame image, tried in order
        possible_paths = [
            keyframes_dir / f"{result.video_id}_frame_{result.frame_idx:06d}.jpg",
            keyframes_dir / f"{result.video_id}" / f"frame_{result.frame_idx:06d}.jpg",
            keyframes_dir / f"{result.video_id}_frame_{result.frame_idx}.jpg"
        ]

        for img_path in possible_paths:
            if not img_path.exists():
                continue
            try:
                # Load before printing the heading so a failed load does not
                # leave an orphaned heading behind.
                image = IPImage(filename=str(img_path), width=300)
            except (OSError, ValueError) as exc:
                print(f"⚠️ Could not load {img_path}: {exc}")
                continue
            display(HTML(f"<h4>Rank {rank}: {result.video_id} - Frame {result.frame_idx} (Score: {result.score:.3f})</h4>"))
            display(image)
            images_shown += 1
            break

    if images_shown == 0:
        print("⚠️ No frame images found for display")

# Smoke-test the search path once so problems surface before any widget is built
if not query_pipeline:
    print("⚠️ Search functionality not available")
else:
    print("\n🧪 Testing search functionality...")
    test_results, status = perform_search("news anchor", k=5)
    print(status)
    if test_results:
        first_hit = test_results[0]
        print(f"✅ Search working - sample result: {first_hit.video_id}")

## Step 3: Interactive Search Widget

In [None]:
# Build the interactive search form (only when a pipeline is available)
if not query_pipeline:
    print("⚠️ Interactive search not available - pipeline not initialized")
else:
    print("🎛️ Interactive Search Interface")

    # --- Input controls -------------------------------------------------
    query_widget = widgets.Text(
        description='Query:',
        value='news anchor speaking',
        placeholder='Enter your search query...',
        disabled=False,
        style=dict(description_width='initial'),
    )

    search_mode_widget = widgets.Dropdown(
        description='Search Mode:',
        options=[('Hybrid', 'hybrid'), ('Vector Only', 'vector'), ('Text Only', 'text')],
        value='hybrid',
        style=dict(description_width='initial'),
    )

    k_widget = widgets.IntSlider(
        description='Results:',
        value=20, min=5, max=100, step=5,
        style=dict(description_width='initial'),
    )

    expand_query_widget = widgets.Checkbox(
        description='Expand Query',
        value=False,
        style=dict(description_width='initial'),
    )

    max_display_widget = widgets.IntSlider(
        description='Max Display:',
        value=10, min=5, max=50, step=5,
        style=dict(description_width='initial'),
    )

    # --- Callback driven by the controls above --------------------------
    def interactive_search(query, search_mode, k, expand_query, max_display):
        """Search with the current widget values and render the hits."""
        if query.strip() == "":
            print("Please enter a search query")
            return

        header_lines = (
            f"🔍 Searching for: '{query}'",
            f"Mode: {search_mode}, Results: {k}, Expand: {expand_query}",
            "-" * 50,
        )
        print("\n".join(header_lines))

        results, status = perform_search(query, search_mode, k, expand_query)
        print(status)

        if not results:
            return
        display_results(results, query, max_display)

    # --- Wire the controls to the callback and show the form ------------
    search_widget = interactive(
        interactive_search,
        query=query_widget,
        search_mode=search_mode_widget,
        k=k_widget,
        expand_query=expand_query_widget,
        max_display=max_display_widget,
    )

    display(search_widget)

## Step 4: Predefined Query Examples

In [None]:
# Predefined example queries, grouped by category name.
# Keys are the category labels printed in the output; values are lists of
# query strings (English and Vietnamese) passed unchanged to perform_search.
example_queries = {
    "News & Media": [
        "news anchor speaking",
        "television broadcast",
        "reporter on camera",
        "live news show",
        "studio presentation"
    ],
    "Vietnamese Content": [
        "tin tức mới nhất",
        "bản tin hôm nay",
        "thời sự việt nam",
        "HTV tin tức",
        "báo cáo thông tin"
    ],
    "General Content": [
        "person presenting",
        "people talking",
        "professional broadcast",
        "communication show",
        "information program"
    ],
    "Visual Elements": [
        "person wearing glasses",
        "formal attire",
        "microphone visible",
        "indoor studio setting",
        "text overlay on screen"
    ]
}

def run_example_queries():
    """Run the first two example queries of each category and summarise them.

    For every query the status line and the top hit (if any) are printed.
    Returns a DataFrame with one row per executed query (category, query,
    number of results, top score, top video), or ``None`` when the pipeline
    is unavailable or no query was executed.
    """
    if not query_pipeline:
        print("❌ Query pipeline not available")
        return

    print("🎯 Running Example Queries")
    print("=" * 50)

    summary_rows = []

    for category, queries in example_queries.items():
        print(f"\n📂 {category}:")

        # Only the first 2 queries per category are executed
        for query in queries[:2]:
            print(f"\n🔍 '{query}'")
            results, status = perform_search(query, k=5)
            print(f"   {status}")

            # Start from the "no results" row and fill it in on success
            row = {
                'category': category,
                'query': query,
                'num_results': 0,
                'top_score': 0.0,
                'top_video': None,
            }
            if results:
                best = results[0]
                print(f"   Top result: {best.video_id} frame {best.frame_idx} (score: {best.score:.3f})")
                row.update(
                    num_results=len(results),
                    top_score=best.score,
                    top_video=best.video_id,
                )
            else:
                print("   No results found")
            summary_rows.append(row)

    if not summary_rows:
        return None

    # Create summary
    results_df = pd.DataFrame(summary_rows)

    print("\n📊 Example Query Results Summary:")
    display(results_df)

    # Basic statistics
    answered = (results_df['num_results'] > 0).sum()
    print("\nStatistics:")
    print(f"  Average results per query: {results_df['num_results'].mean():.1f}")
    print(f"  Average top score: {results_df['top_score'].mean():.3f}")
    print(f"  Queries with results: {answered}/{len(results_df)}")

    return results_df

# Button that runs the example queries; without a pipeline, just list them
if not query_pipeline:
    print("📋 Example queries available but pipeline not initialized")
    for category, queries in example_queries.items():
        print(f"\n{category}:")
        for query in queries:
            print(f"  - {query}")
else:
    # Output area that captures everything the click handler prints
    output = widgets.Output()

    def on_run_examples_clicked(b):
        """Clear the output area and re-run the example queries into it."""
        with output:
            clear_output()
            run_example_queries()

    run_examples_button = widgets.Button(
        description="Run Example Queries",
        button_style='info',
        tooltip='Run predefined example queries',
    )
    run_examples_button.on_click(on_run_examples_clicked)

    display(run_examples_button)
    display(output)

## Step 5: Search Performance Analysis

In [None]:
# Performance benchmarking
def benchmark_search_performance():
    """Time ``perform_search`` over a fixed query set for each mode and k.

    Every combination of search mode ('vector', 'hybrid') and k (10, 50, 100)
    runs the same five test queries. Average latency, average result count
    and queries-per-second are printed, tabulated and plotted.

    Returns:
        A DataFrame with one row per (search_mode, k) combination, or
        ``None`` when the query pipeline is not available.
    """
    if not query_pipeline:
        print("❌ Query pipeline not available for benchmarking")
        return

    print("⚡ Search Performance Benchmark")
    print("=" * 40)

    test_queries = [
        "news anchor",
        "person speaking",
        "television broadcast",
        "tin tức",
        "studio presentation"
    ]

    search_modes = ['vector', 'hybrid']
    k_values = [10, 50, 100]

    benchmark_results = []

    for mode in search_modes:
        for k in k_values:
            print(f"\nTesting {mode} search with k={k}")

            times = []
            result_counts = []

            for query in test_queries:
                # perf_counter is monotonic and high-resolution; time.time()
                # can tick coarsely enough to report 0 elapsed.
                start_time = time.perf_counter()
                results, _ = perform_search(query, search_mode=mode, k=k)
                times.append(time.perf_counter() - start_time)
                result_counts.append(len(results))

            total_time = sum(times)
            avg_time = np.mean(times)
            avg_results = np.mean(result_counts)
            # Guard the division: a zero total would raise ZeroDivisionError.
            # NaN marks the rate as unmeasurable.
            qps = len(test_queries) / total_time if total_time > 0 else float("nan")

            benchmark_results.append({
                'search_mode': mode,
                'k': k,
                'avg_time_ms': avg_time * 1000,
                'avg_results': avg_results,
                'queries_per_second': qps
            })

            print(f"  Average time: {avg_time*1000:.1f}ms")
            print(f"  Average results: {avg_results:.1f}")
            print(f"  QPS: {qps:.1f}")

    # Create benchmark summary
    benchmark_df = pd.DataFrame(benchmark_results)

    print("\n📊 Performance Summary:")
    display(benchmark_df)

    # Visualize performance
    if not benchmark_df.empty:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

        # Response time by mode and k
        sns.barplot(data=benchmark_df, x='k', y='avg_time_ms', hue='search_mode', ax=ax1)
        ax1.set_title('Average Response Time by Search Mode')
        ax1.set_ylabel('Response Time (ms)')

        # Queries per second
        sns.barplot(data=benchmark_df, x='k', y='queries_per_second', hue='search_mode', ax=ax2)
        ax2.set_title('Queries Per Second by Search Mode')
        ax2.set_ylabel('QPS')

        plt.tight_layout()
        plt.show()

    return benchmark_df

# Button that triggers the performance benchmark
if not query_pipeline:
    print("⚠️ Performance benchmark not available - pipeline not initialized")
else:
    # Output area that captures everything the click handler prints/plots
    benchmark_output = widgets.Output()

    def on_benchmark_clicked(b):
        """Clear the output area and run the benchmark into it."""
        with benchmark_output:
            clear_output()
            benchmark_search_performance()

    benchmark_button = widgets.Button(
        description="Run Performance Benchmark",
        button_style='warning',
        tooltip='Benchmark search performance',
    )
    benchmark_button.on_click(on_benchmark_clicked)

    display(benchmark_button)
    display(benchmark_output)

## Step 6: Export Search Results

In [None]:
# Export functionality
def export_search_results(query, results, output_format="csv"):
    """Write search results to a timestamped file under ``./output``.

    Args:
        query: Query text; stored in every row and used, sanitised and
            truncated to 30 characters, in the file name.
        results: Sequence of result objects exposing ``video_id``,
            ``frame_idx``, ``score`` and a dict-like ``metadata``.
        output_format: "csv" or "json" (case-insensitive); any other value
            is written as parquet.

    Returns:
        The ``Path`` of the written file, or ``None`` when ``results`` is
        empty.
    """
    if not results:
        print("No results to export")
        return None

    # Read the clock once so every row and the file name agree; formatting
    # the time per row could straddle a second boundary.
    now = time.localtime()
    exported_at = time.strftime('%Y-%m-%d %H:%M:%S', now)

    # Create results dataframe
    export_df = pd.DataFrame([
        {
            'rank': rank,
            'video_id': result.video_id,
            'frame_idx': result.frame_idx,
            'score': result.score,
            'search_type': result.metadata.get('search_type', 'unknown'),
            'query': query,
            'timestamp': exported_at,
        }
        for rank, result in enumerate(results, start=1)
    ])

    # Generate filename: keep alphanumerics, spaces, '-' and '_' only
    safe_query = "".join(c for c in query if c.isalnum() or c in (' ', '-', '_'))
    safe_query = safe_query.replace(' ', '_')[:30]
    timestamp = time.strftime('%Y%m%d_%H%M%S', now)

    output_dir = Path("./output")
    output_dir.mkdir(parents=True, exist_ok=True)

    stem = f"search_results_{safe_query}_{timestamp}"
    fmt = output_format.lower()
    if fmt == "csv":
        filename = output_dir / f"{stem}.csv"
        export_df.to_csv(filename, index=False)
    elif fmt == "json":
        filename = output_dir / f"{stem}.json"
        export_df.to_json(filename, orient='records', indent=2)
    else:
        # Any other format name falls back to parquet
        filename = output_dir / f"{stem}.parquet"
        export_df.to_parquet(filename, index=False)

    print(f"✅ Exported {len(results)} results to {filename}")
    return filename

# Export widget
if query_pipeline:
    print("💾 Export Search Results")

    export_query_widget = widgets.Text(
        value='news anchor',
        placeholder='Query to search and export...',
        description='Query:',
        style={'description_width': 'initial'}
    )

    export_format_widget = widgets.Dropdown(
        options=['csv', 'json', 'parquet'],
        value='csv',
        description='Format:',
        style={'description_width': 'initial'}
    )

    export_k_widget = widgets.IntSlider(
        value=100,
        min=10,
        max=500,
        step=10,
        description='Results:',
        style={'description_width': 'initial'}
    )

    def do_export(query, format_type, k):
        """Search for ``query`` with ``k`` results and export the hits."""
        if not query.strip():
            print("Please enter a query")
            return

        print(f"🔍 Searching for '{query}' (k={k})...")
        results, status = perform_search(query, k=k)
        print(status)

        if results:
            export_search_results(query, results, format_type)
        else:
            print("❌ No results to export")

    # Exporting writes a file, so run it only on an explicit button click.
    # Without manual mode, every keystroke or slider move would trigger a
    # search and write another file.
    export_widget = interactive(
        do_export,
        {'manual': True, 'manual_name': 'Search & Export'},
        query=export_query_widget,
        format_type=export_format_widget,
        k=export_k_widget
    )

    display(export_widget)
else:
    print("⚠️ Export functionality not available - pipeline not initialized")

## Step 7: Search Quality Analysis

In [None]:
# Analyze search quality and diversity
def analyze_search_quality():
    """Measure diversity and score spread of the results for a fixed query set.

    Each of eight built-in queries is searched with k=50. For every query
    that returns hits, the share of distinct videos, the score statistics
    and the frames-per-video counts are recorded, printed and plotted.

    Returns:
        A DataFrame with one row per query that produced results, or
        ``None`` when the pipeline is unavailable or no query returned hits.
    """
    if not query_pipeline:
        print("❌ Query pipeline not available for analysis")
        return

    print("🔬 Search Quality Analysis")
    print("=" * 40)

    # Test diverse queries
    analysis_queries = [
        "news anchor speaking",
        "person wearing glasses",
        "studio setting",
        "microphone visible",
        "formal attire",
        "tin tức việt nam",
        "television broadcast",
        "reporter interview",
    ]

    rows = []

    for query in analysis_queries:
        print(f"\nAnalyzing: '{query}'")

        results, _ = perform_search(query, k=50)
        if not results:
            print("  No results found")
            continue

        total = len(results)

        # Frames returned per video; the key count is the number of
        # distinct videos among the hits.
        frames_by_video = {}
        for hit in results:
            frames_by_video[hit.video_id] = frames_by_video.get(hit.video_id, 0) + 1
        unique_videos = len(frames_by_video)
        video_diversity = unique_videos / total

        # Score distribution (first/last scores are reported as top/bottom)
        scores = [hit.score for hit in results]
        score_std = np.std(scores) if total > 1 else 0
        score_range = max(scores) - min(scores)

        frame_counts = list(frames_by_video.values())
        max_frames_per_video = max(frame_counts)
        avg_frames_per_video = np.mean(frame_counts)

        rows.append({
            'query': query,
            'total_results': total,
            'unique_videos': unique_videos,
            'video_diversity': video_diversity,
            'score_std': score_std,
            'score_range': score_range,
            'top_score': scores[0],
            'bottom_score': scores[-1],
            'max_frames_per_video': max_frames_per_video,
            'avg_frames_per_video': avg_frames_per_video,
        })

        print(f"  Results: {total}, Unique videos: {unique_videos} ({video_diversity:.2%})")
        print(f"  Score range: {scores[0]:.3f} - {scores[-1]:.3f} (std: {score_std:.3f})")
        print(f"  Max frames per video: {max_frames_per_video}")

    if not rows:
        return None

    analysis_df = pd.DataFrame(rows)

    print("\n📊 Quality Analysis Summary:")
    display(analysis_df.round(3))

    # Summary statistics
    print("\nOverall Statistics:")
    print(f"  Average video diversity: {analysis_df['video_diversity'].mean():.2%}")
    print(f"  Average score std: {analysis_df['score_std'].mean():.3f}")
    print(f"  Average max frames per video: {analysis_df['max_frames_per_video'].mean():.1f}")

    # Visualizations: three per-query bar charts plus one scatter plot
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    positions = range(len(analysis_df))

    bar_panels = [
        (axes[0, 0], 'video_diversity', 'Video Diversity by Query',
         'Diversity (unique_videos/total_results)'),
        (axes[1, 0], 'max_frames_per_video', 'Max Frames per Video', 'Max Frames'),
        (axes[1, 1], 'total_results', 'Total Results per Query', 'Result Count'),
    ]
    for ax, column, title, ylabel in bar_panels:
        ax.bar(positions, analysis_df[column])
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.set_xticks(positions)
        ax.set_xticklabels(analysis_df['query'], rotation=45, ha='right')

    # Score distribution
    scatter_ax = axes[0, 1]
    scatter_ax.scatter(analysis_df['top_score'], analysis_df['score_std'])
    scatter_ax.set_title('Score Distribution')
    scatter_ax.set_xlabel('Top Score')
    scatter_ax.set_ylabel('Score Standard Deviation')

    plt.tight_layout()
    plt.show()

    return analysis_df

# Button that triggers the search quality analysis
if not query_pipeline:
    print("⚠️ Quality analysis not available - pipeline not initialized")
else:
    # Output area that captures everything the click handler prints/plots
    analysis_output = widgets.Output()

    def on_analysis_clicked(b):
        """Clear the output area and run the quality analysis into it."""
        with analysis_output:
            clear_output()
            analyze_search_quality()

    analysis_button = widgets.Button(
        description="Analyze Search Quality",
        button_style='success',
        tooltip='Analyze search result quality and diversity',
    )
    analysis_button.on_click(on_analysis_clicked)

    display(analysis_button)
    display(analysis_output)

# Closing banner with pointers to the follow-up notebooks
for _line in (
    "\n" + "="*50,
    "🎉 SEARCH & EVALUATION READY!",
    "="*50,
    "\nNext steps:",
    "1. Use 04_training_and_reranking.ipynb to improve search results",
    "2. Use 05_end_to_end_pipeline.ipynb for complete workflow",
):
    print(_line)

## Summary & Usage Guide

This notebook provides comprehensive search and evaluation capabilities:

### Available Search Modes:
- **Vector Search**: Pure similarity search using CLIP embeddings
- **Hybrid Search**: Combines multiple search strategies (recommended)
- **Text Search**: Text-based matching (if available)

### Interactive Features:
- 🎛️ **Interactive Search**: Real-time query interface with parameter controls
- 🎯 **Example Queries**: Predefined queries for different content types
- ⚡ **Performance Benchmark**: Speed and efficiency analysis
- 💾 **Export Results**: Save search results in multiple formats
- 🔬 **Quality Analysis**: Analyze result diversity and score distributions

### Tips for Better Results:
1. Use specific, descriptive queries
2. Try both English and Vietnamese queries if applicable
3. Use hybrid search mode for best results
4. Experiment with query expansion for broader results