# AIC Video Retrieval System - Data Processing & Indexing

This notebook handles dataset download, video processing, frame extraction, and vector indexing.
Once the setup notebook has been run, this notebook can be run on its own on any cloud platform.

## Features
- 📥 Download AIC dataset with progress tracking
- 🎥 Intelligent video frame sampling
- 🖼️ CLIP-based image encoding
- 🔍 FAISS vector index construction
- 💾 Optimized storage and retrieval

In [None]:
# Import and setup
import os
import sys
import json
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML, clear_output
import time

# Set up paths (assuming setup notebook was run)
REPO_NAME = "AIC_FTML_dev"
_content_checkout = Path(f"/content/{REPO_NAME}")
if _content_checkout.exists():
    # A checkout under /content takes priority when it exists.
    REPO_DIR = _content_checkout
else:
    # Otherwise look for a directory named REPO_NAME among the current
    # directory and its ancestors (nearest match wins).
    _start_dir = Path.cwd()
    REPO_DIR = next(
        (p for p in (_start_dir, *_start_dir.parents) if p.name == REPO_NAME),
        None,
    )
    if REPO_DIR is None:
        # Previously the upward walk ended at the filesystem root, so the
        # notebook would chdir to "/" and create its data folders there.
        # Staying in the starting directory is the safer fallback.
        REPO_DIR = _start_dir
        print(f"⚠️ No '{REPO_NAME}' directory found above {_start_dir}; using the current directory")

# Run everything relative to the repo and make its modules importable.
os.chdir(REPO_DIR)
sys.path.insert(0, str(REPO_DIR))
sys.path.insert(0, str(REPO_DIR / "src"))

print(f"Working from: {REPO_DIR}")

# Import project modules
import config
from src.models.clip_encoder import CLIPEncoder
from src.indexing.vector_index import VectorIndex
from src.sampling.frames_auto import VideoFrameSampler

## Step 1: Dataset Download

In [None]:
# Dataset download settings
DOWNLOAD_SUBSET = True  # set to False to work with the full dataset
SUBSET_SIZE = 50  # number of videos kept when DOWNLOAD_SUBSET is True
DATA_DIR = Path("./data")
DATASET_DIR = DATA_DIR / "aic2025"
METADATA_DIR = DATA_DIR / "dataset_metadata"

# Make sure the output folders exist (parent first, then its two children)
for _folder in (DATA_DIR, DATASET_DIR, METADATA_DIR):
    _folder.mkdir(exist_ok=True)

# Echo the effective settings
_subset_label = SUBSET_SIZE if DOWNLOAD_SUBSET else 'Full dataset'
print("Dataset configuration:")
print(f"  Download subset: {DOWNLOAD_SUBSET}")
print(f"  Subset size: {_subset_label}")
print(f"  Dataset directory: {DATASET_DIR}")
print(f"  Metadata directory: {METADATA_DIR}")

In [None]:
# Download dataset metadata and links
import requests
import urllib.parse

def download_with_progress(url, filename, timeout=30):
    """Download ``url`` to ``filename`` while showing a tqdm progress bar.

    Args:
        url: Address fetched with ``requests.get`` in streaming mode.
        filename: Destination ``pathlib.Path``; its ``.name`` labels the bar.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot hang the notebook indefinitely.

    Returns:
        ``filename``, once the whole body has been written.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    # Stream into a sibling ".part" file and rename only on success, so an
    # interrupted transfer never leaves a truncated file at ``filename``.
    partial = filename.with_name(filename.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Without this, an HTTP error page would be saved as the payload.
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))

            with open(partial, 'wb') as f, tqdm(
                desc=filename.name,
                total=total_size,
                unit='B',
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
                        bar.update(len(chunk))
        partial.replace(filename)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        partial.unlink(missing_ok=True)
    return filename

# Look for the CSV that lists the dataset download links
csv_file = Path("AIC_2025_dataset_download_link.csv")
if csv_file.exists():
    print("✅ Dataset links CSV found")

    # Load the link table and report how many rows it has
    df_links = pd.read_csv(csv_file)
    _n_listed = len(df_links)
    print(f"Found {_n_listed} videos in dataset")

    # Optionally keep only the first SUBSET_SIZE rows
    if DOWNLOAD_SUBSET:
        df_links = df_links.head(SUBSET_SIZE)
        _n_kept = len(df_links)
        print(f"Processing subset of {_n_kept} videos")

    # Preview the first rows
    _preview = df_links.head()
    display(_preview)
else:
    print("❌ Dataset links CSV not found")
    print("Please ensure AIC_2025_dataset_download_link.csv is available")

In [None]:
# Download dataset using the built-in downloader
#
# Relies on names defined by earlier cells: csv_file, df_links, DATASET_DIR,
# DOWNLOAD_SUBSET, SUBSET_SIZE and download_with_progress.
print("=== Starting Dataset Download ===")

if csv_file.exists():
    # Run the dataset downloader
    download_script = Path("scripts/dataset_downloader.py")
    if download_script.exists():
        # Build the command line for the repo's downloader script.
        cmd = f"python {download_script} --output_dir {DATASET_DIR}"
        if DOWNLOAD_SUBSET:
            # presumably caps how many videos the script fetches — verify against the script
            cmd += f" --max_videos {SUBSET_SIZE}"
        
        print(f"Running: {cmd}")
        # IPython shell escape: runs the string held in `cmd` as a shell command.
        # This line is notebook syntax, not plain Python.
        !{cmd}
    else:
        print("⚠️ Dataset downloader script not found, using manual download...")
        
        # Manual download fallback
        # df_links only exists when the CSV was found by the previous cell,
        # which the enclosing csv_file.exists() check mirrors.
        for idx, row in tqdm(df_links.iterrows(), total=len(df_links), desc="Downloading videos"):
            video_id = row['video_id']
            video_url = row['video_url']
            
            video_file = DATASET_DIR / f"{video_id}.mp4"
            # Files already on disk are not downloaded again.
            if not video_file.exists():
                try:
                    download_with_progress(video_url, video_file)
                except Exception as e:
                    # Best effort: report the failure and move on to the next video.
                    print(f"Failed to download {video_id}: {e}")
                    continue
else:
    print("⚠️ Skipping download - CSV file not found")
    print("Please place AIC_2025_dataset_download_link.csv in the repo root")

# Check what we have
# Counts every .mp4 currently in DATASET_DIR, including files from earlier runs.
video_files = list(DATASET_DIR.glob("*.mp4"))
print(f"\n✅ Downloaded {len(video_files)} video files")

## Step 2: Frame Extraction and Sampling

In [None]:
# Frame sampler setup
print("=== Frame Extraction Setup ===")

# Folder that receives the extracted frames
FRAMES_DIR = Path("./keyframes")
FRAMES_DIR.mkdir(exist_ok=True)

# Sampling knobs
SAMPLING_MODE = "intelligent"  # one of "uniform", "intelligent", "adaptive"
FRAMES_PER_VIDEO = 10  # frames to extract for each video
MIN_INTERVAL = 2.0  # minimum gap between frames, in seconds

# Echo the effective settings
print("Frame extraction configuration:")
print(f"  Output directory: {FRAMES_DIR}")
print(f"  Sampling mode: {SAMPLING_MODE}")
print(f"  Frames per video: {FRAMES_PER_VIDEO}")
print(f"  Min interval: {MIN_INTERVAL}s")

# Build the project sampler; on any failure leave sampler = None so the
# extraction cell can use its own fallback.
_sampler_options = {
    "output_dir": FRAMES_DIR,
    "sampling_mode": SAMPLING_MODE,
    "frames_per_video": FRAMES_PER_VIDEO,
    "min_interval": MIN_INTERVAL,
}
try:
    sampler = VideoFrameSampler(**_sampler_options)
    print("✅ Frame sampler initialized")
except Exception as e:
    sampler = None
    print(f"⚠️ Could not initialize frame sampler: {e}")
    print("Using fallback frame extraction...")

In [None]:
# Extract frames from videos
print("=== Frame Extraction ===")


def _extract_uniform_frames(video_file, video_id):
    """Fallback sampler: save up to FRAMES_PER_VIDEO evenly spaced frames.

    Args:
        video_file: Path of the video to read with OpenCV.
        video_id: Identifier used in the output file names and metadata.

    Returns:
        A list of dicts with the keys ``video_id``, ``frame_idx``,
        ``timestamp`` and ``frame_path``, one per frame actually written to
        FRAMES_DIR. Empty when the video reports no frames.
    """
    import cv2  # only needed when the project sampler is unavailable

    frames_info = []
    cap = cv2.VideoCapture(str(video_file))
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        if total_frames <= 0:
            return frames_info

        step = max(1, total_frames // FRAMES_PER_VIDEO)
        for i in range(0, total_frames, step):
            if len(frames_info) >= FRAMES_PER_VIDEO:
                break

            cap.set(cv2.CAP_PROP_POS_FRAMES, i)
            ret, frame = cap.read()
            if not ret:
                continue

            frame_filename = FRAMES_DIR / f"{video_id}_frame_{i:06d}.jpg"
            # imwrite signals failure through its return value; do not record
            # metadata for a frame that never reached the disk.
            if not cv2.imwrite(str(frame_filename), frame):
                continue

            frames_info.append({
                'video_id': video_id,
                'frame_idx': i,
                'timestamp': i / fps if fps > 0 else 0,
                'frame_path': str(frame_filename)
            })
    finally:
        # Always free the capture, even if reading or writing raised.
        cap.release()
    return frames_info


# Sorted so the processing (and metadata row) order is reproducible.
video_files = sorted(DATASET_DIR.glob("*.mp4"))
if not video_files:
    print("❌ No video files found. Please run the download step first.")
else:
    print(f"Processing {len(video_files)} videos...")

    frame_metadata = []  # one dict per extracted frame, across all videos

    for video_file in tqdm(video_files, desc="Extracting frames"):
        video_id = video_file.stem

        try:
            if sampler is not None:
                # Use the project sampler set up in the previous cell
                frames_info = sampler.sample_video(video_file, video_id)
            else:
                # Fallback: extract frames uniformly with OpenCV
                frames_info = _extract_uniform_frames(video_file, video_id)

            frame_metadata.extend(frames_info)

        except Exception as e:
            # Best effort: one unreadable video must not stop the whole run.
            print(f"Error processing {video_id}: {e}")
            continue

    print(f"\n✅ Extracted {len(frame_metadata)} frames total")

    # Save frame metadata
    if frame_metadata:
        metadata_df = pd.DataFrame(frame_metadata)
        metadata_file = METADATA_DIR / "frame_metadata.parquet"
        metadata_df.to_parquet(metadata_file, index=False)
        print(f"✅ Saved frame metadata to {metadata_file}")

        # Show statistics
        n_videos = metadata_df['video_id'].nunique()
        print("\nFrame extraction statistics:")
        print(f"  Total frames: {len(metadata_df)}")
        print(f"  Videos processed: {n_videos}")
        print(f"  Avg frames per video: {len(metadata_df) / n_videos:.1f}")

        display(metadata_df.head())

## Step 3: CLIP Encoding

In [None]:
# CLIP encoder setup
print("=== CLIP Encoder Setup ===")

import torch

# Use CUDA when it is available, otherwise the CPU
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print(f"Using device: {device}")

# Try the project's encoder first; on any error fall back to the
# sentence-transformers CLIP checkpoint.
try:
    encoder = CLIPEncoder(model_name=config.MODEL_NAME, device=device)
    print(f"✅ CLIP encoder loaded: {config.MODEL_NAME}")
except Exception as e:
    print(f"Error loading CLIP encoder: {e}")
    from sentence_transformers import SentenceTransformer
    _fallback_model = 'clip-ViT-B-32'
    encoder = SentenceTransformer(_fallback_model)
    print("✅ Fallback CLIP encoder loaded")

# Echo the effective encoding settings (batch size defaults to 32)
print("Encoding configuration:")
print(f"  Batch size: {getattr(config, 'BATCH_SIZE', 32)}")
print(f"  Device: {device}")

In [None]:
# Encode all extracted frames
print("=== Frame Encoding ===")

# Imported once here instead of once per frame inside the loading loop.
from PIL import Image

# Load frame metadata
metadata_file = METADATA_DIR / "frame_metadata.parquet"
if not metadata_file.exists():
    print("❌ Frame metadata not found. Please run frame extraction first.")
else:
    metadata_df = pd.read_parquet(metadata_file)
    print(f"Loading {len(metadata_df)} frames for encoding...")

    # Batch encoding for efficiency
    BATCH_SIZE = getattr(config, 'BATCH_SIZE', 32)
    all_embeddings = []
    # Row POSITIONS (not index labels) of frames that were encoded; they are
    # used with .iloc below, so positions keep embeddings and metadata aligned
    # whatever index the DataFrame carries.
    valid_frames = []

    frame_paths = metadata_df['frame_path'].tolist() if len(metadata_df) else []

    for start_idx in tqdm(range(0, len(frame_paths), BATCH_SIZE), desc="Encoding frames"):
        # Load this batch of images
        batch_images = []
        batch_valid_positions = []

        batch_paths = frame_paths[start_idx:start_idx + BATCH_SIZE]
        for position, raw_path in enumerate(batch_paths, start=start_idx):
            frame_path = Path(raw_path)
            if not frame_path.exists():
                continue
            try:
                # The context manager closes the file handle right away;
                # convert() returns an independent, fully loaded image.
                with Image.open(frame_path) as source_image:
                    img = source_image.convert('RGB')
            except Exception as e:
                print(f"Error loading {frame_path}: {e}")
                continue
            batch_images.append(img)
            batch_valid_positions.append(position)

        if not batch_images:
            continue

        try:
            # Encode batch
            if hasattr(encoder, 'encode_images'):
                batch_embeddings = encoder.encode_images(batch_images)
            else:
                # Fallback for sentence-transformers
                batch_embeddings = encoder.encode(batch_images)
        except Exception as e:
            print(f"Error encoding batch: {e}")
            continue

        # One embedding per image is required, otherwise rows would shift.
        if len(batch_embeddings) != len(batch_images):
            print(
                f"Error encoding batch: got {len(batch_embeddings)} embeddings "
                f"for {len(batch_images)} images"
            )
            continue

        all_embeddings.extend(batch_embeddings)
        valid_frames.extend(batch_valid_positions)

    if all_embeddings:
        # Convert to numpy array
        # NOTE(review): assumes the encoder returns array-likes that np.array
        # can stack (e.g. numpy rows) — confirm for CLIPEncoder.encode_images.
        embeddings_array = np.array(all_embeddings)
        print(f"\n✅ Encoded {len(embeddings_array)} frames")
        print(f"Embedding shape: {embeddings_array.shape}")

        # Filter metadata to only valid frames (same order as the embeddings)
        valid_metadata_df = metadata_df.iloc[valid_frames].reset_index(drop=True)

        # Save embeddings and metadata
        embeddings_file = METADATA_DIR / "frame_embeddings.npy"
        np.save(embeddings_file, embeddings_array)

        valid_metadata_file = METADATA_DIR / "valid_frame_metadata.parquet"
        valid_metadata_df.to_parquet(valid_metadata_file, index=False)

        print(f"✅ Saved embeddings to {embeddings_file}")
        print(f"✅ Saved valid metadata to {valid_metadata_file}")
    else:
        print("❌ No embeddings generated")

## Step 4: Build Vector Index

In [None]:
# Build FAISS index
print("=== Vector Index Construction ===")

# Load embeddings
embeddings_file = METADATA_DIR / "frame_embeddings.npy"
valid_metadata_file = METADATA_DIR / "valid_frame_metadata.parquet"

# Both files are written together by the encoding step; require both.
if not (embeddings_file.exists() and valid_metadata_file.exists()):
    print("❌ Embeddings not found. Please run encoding first.")
else:
    embeddings = np.load(embeddings_file)
    metadata_df = pd.read_parquet(valid_metadata_file)

    # Row i of the metadata must describe vector i of the index.
    if len(metadata_df) != len(embeddings):
        raise ValueError(
            f"Embeddings ({len(embeddings)}) and metadata ({len(metadata_df)}) "
            "are out of sync; please re-run the encoding step."
        )

    dim = embeddings.shape[1]
    print(f"Building index for {len(embeddings)} embeddings...")
    print(f"Embedding dimension: {dim}")

    # Normalize embeddings for cosine similarity
    from sklearn.preprocessing import normalize
    embeddings_normalized = normalize(embeddings, norm='l2')

    # Build FAISS index
    import faiss

    # Choose index type based on dataset size
    if len(embeddings) < 10000:
        # Use flat index for small datasets
        index = faiss.IndexFlatIP(dim)  # Inner product for normalized vectors
        print("Using IndexFlatIP for small dataset")
    else:
        # Use HNSW for larger datasets. The metric is passed explicitly
        # because IndexHNSWFlat defaults to L2; with inner product both
        # branches return scores where larger means more similar.
        index = faiss.IndexHNSWFlat(dim, 32, faiss.METRIC_INNER_PRODUCT)
        index.hnsw.efConstruction = 200
        print("Using IndexHNSWFlat for larger dataset")

    # Add embeddings to index (FAISS expects float32)
    index.add(embeddings_normalized.astype(np.float32))

    print(f"✅ Index built with {index.ntotal} vectors")

    # Save index and metadata to artifacts directory
    ARTIFACT_DIR = Path(config.ARTIFACT_DIR)
    # parents=True so a nested artifact path does not fail on first run.
    ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

    index_file = ARTIFACT_DIR / "vector_index.faiss"
    metadata_file = ARTIFACT_DIR / "index_metadata.parquet"

    faiss.write_index(index, str(index_file))
    metadata_df.to_parquet(metadata_file, index=False)

    print(f"✅ Saved index to {index_file}")
    print(f"✅ Saved metadata to {metadata_file}")

## Step 5: Index Validation & Statistics

In [None]:
# Validate the built index
print("=== Index Validation ===")

# Imported here as well: this cell reads the index back from disk, so it
# should also work when the index-building cell was not run in this kernel.
import faiss

ARTIFACT_DIR = Path(config.ARTIFACT_DIR)
index_file = ARTIFACT_DIR / "vector_index.faiss"
metadata_file = ARTIFACT_DIR / "index_metadata.parquet"

if index_file.exists() and metadata_file.exists():
    # Load index and metadata
    index = faiss.read_index(str(index_file))
    metadata_df = pd.read_parquet(metadata_file)

    print("Index validation:")
    print(f"  Index size: {index.ntotal} vectors")
    print(f"  Metadata size: {len(metadata_df)} entries")
    print(f"  Dimension: {index.d}")
    print(f"  Index type: {type(index).__name__}")

    # Search results are row positions into the metadata, so the sizes must agree.
    if index.ntotal != len(metadata_df):
        print("⚠️ Index and metadata sizes differ - search results will not map to the right frames")

    # Test search functionality
    print("\nTesting search functionality...")
    if index.ntotal > 0:
        query_vector = np.random.randn(1, index.d).astype(np.float32)
        query_vector = query_vector / np.linalg.norm(query_vector)  # Normalize

        k = min(5, index.ntotal)
        distances, indices = index.search(query_vector, k)

        print(f"✅ Search test successful - returned {len(indices[0])} results")
        print(f"Sample distances: {distances[0][:3]}")
    else:
        # k would be 0 here, which is not a valid search request.
        print("⚠️ Index is empty - search test skipped")

    # Show statistics
    print("\n=== Dataset Statistics ===")
    n_videos = metadata_df['video_id'].nunique()
    print(f"Total frames indexed: {len(metadata_df)}")
    print(f"Unique videos: {n_videos}")

    if n_videos:
        print(f"Avg frames per video: {len(metadata_df) / n_videos:.1f}")

        # Video distribution
        video_counts = metadata_df['video_id'].value_counts()
        print("\nFrames per video distribution:")
        print(f"  Min: {video_counts.min()}")
        print(f"  Max: {video_counts.max()}")
        print(f"  Mean: {video_counts.mean():.1f}")
        print(f"  Median: {video_counts.median():.1f}")
    else:
        # Avoids dividing by zero when the metadata has no rows.
        print("⚠️ Metadata is empty - per-video statistics skipped")

    # Show sample entries
    print("\nSample index entries:")
    display(metadata_df.head())

else:
    print("❌ Index files not found. Please run the indexing step.")

## Step 6: Create Training Data (Optional)

In [None]:
# Generate training data for reranking
print("=== Training Data Generation ===")

from itertools import islice

CREATE_TRAINING_DATA = True  # Set to False to skip training data generation


def load_sample_lines(path, limit=3):
    """Return the non-blank lines among the first ``limit`` lines of ``path``.

    Unlike calling ``next()`` a fixed number of times, this does not raise
    when the file holds fewer than ``limit`` lines.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [line for line in islice(f, limit) if line.strip()]


if CREATE_TRAINING_DATA:
    try:
        from utils.create_training_data import create_training_data_from_metadata

        success = create_training_data_from_metadata(
            dataset_root=str(DATASET_DIR.parent),
            output_file="data/train.jsonl",
            num_examples=50
        )

        if success:
            print("✅ Training data generated successfully")

            # Show sample training data
            training_file = Path("data/train.jsonl")
            if training_file.exists():
                sample_lines = load_sample_lines(training_file, 3)

                print("\nSample training examples:")
                for i, line in enumerate(sample_lines, 1):
                    data = json.loads(line)
                    print(f"{i}. Query: '{data['query']}'")
                    print(f"   Positives: {len(data['positives'])} frames")
        else:
            print("⚠️ Training data generation failed")

    except ImportError:
        # The helper module is optional; the rest of the pipeline works without it.
        print("⚠️ Training data utility not available")
else:
    print("⚠️ Training data generation skipped")

print("\n" + "="*50)
print("🎉 DATA PROCESSING COMPLETE!")
print("="*50)
print("\nNext steps:")
print("1. Use 03_search_and_evaluation.ipynb to test search")
print("2. Use 04_training_and_reranking.ipynb to improve results")
print("3. Use 05_end_to_end_pipeline.ipynb for complete workflow")

## Summary & Artifacts

This notebook has processed your video dataset and created the following artifacts:

### Generated Files:
- `artifacts/vector_index.faiss` - FAISS vector index for similarity search
- `artifacts/index_metadata.parquet` - Frame metadata with video_id, frame_idx, timestamps
- `data/dataset_metadata/frame_embeddings.npy` - CLIP embeddings for all successfully encoded frames
- `data/train.jsonl` - Training data for reranker (if generated)
- `keyframes/` - Extracted video frames

### Index Statistics:
Index size, dimension, and the frames-per-video distribution are printed by Step 5 above. The index is now ready for similarity search using CLIP embeddings. You can search using text queries or image queries.