## System Check & Setup

Run this cell first to check your Colab environment and system capabilities.

# AIC 2024/2025 Retrieval – Automated Google Colab Pipeline

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nqvu-daniel/AIC_FTML_dev/blob/main/notebooks/colab_pipeline.ipynb)

**Quick Start:**
1. **Enable GPU**: Runtime → Change runtime type → Hardware accelerator: T4/L4/A100 (recommended)
2. **Run Setup**: Execute the "Setup" cell below to automatically clone repo and install dependencies
3. **Choose Your Path**:
   - Host Inference (recommended): Use pre-built artifacts to run queries instantly
   - Development Pipeline: Build your own artifacts from scratch (requires dataset)

**File Downloads**: Results are saved to `/content/AIC_FTML_dev/submissions/` - you can download them from Colab's file browser.

---

## Two Usage Modes

### 1. Host Inference (Recommended - Fast)
- No dataset required
- Uses pre-built artifacts and models
- Ready in ~2 minutes
- Perfect for running queries and getting CSV results

### 2. Development Pipeline (Advanced - Slow)
- Downloads the full dataset (many GBs)
- Builds search index from scratch + custom reranker models

---

In [None]:
# Show GPU details; `|| true` forces a zero exit status when nvidia-smi is unavailable.
!nvidia-smi || true
# Report the version of the `python` found on the shell PATH.
!python --version
import sys, os, pathlib
# Print the notebook's current working directory.
print('CWD:', os.getcwd())

In [None]:
# Setup: Clone repo and install dependencies automatically (GPU-ready)
import os
import pathlib
import subprocess
import sys

# Git URL passed to `git clone` by setup_repository().
REPO_URL = 'https://github.com/nqvu-daniel/AIC_FTML_dev.git'
# Local directory setup_repository() looks for and changes into.
REPO_NAME = 'AIC_FTML_dev'

def setup_repository():
    """Clone the project repository (if needed) and install its dependencies.

    Side effects: changes the process working directory to the repository
    root, installs ``requirements.txt`` and a FAISS build with pip, and adds
    the repository root to ``sys.path``.

    Re-running is safe: when the current directory already is the repository
    (the state a previous successful run leaves behind), no nested clone is
    created.

    Returns:
        bool: True when every step succeeded, False otherwise. Errors are
        printed rather than raised.
    """
    try:
        cwd = pathlib.Path.cwd()
        if cwd.name == REPO_NAME and (cwd / 'requirements.txt').exists():
            # A previous run already chdir'ed here; cloning again would nest
            # a second copy of the repository inside the first one.
            print(f"Already inside repository '{REPO_NAME}'")
        elif pathlib.Path(REPO_NAME).exists():
            print(f"Repository '{REPO_NAME}' already exists")
            os.chdir(REPO_NAME)
        else:
            print(f"Cloning repository from {REPO_URL}")
            subprocess.run(['git', 'clone', REPO_URL], check=True)
            os.chdir(REPO_NAME)
            print("Repository cloned successfully")

        # Install dependencies into the interpreter running this notebook
        print("Installing dependencies...")
        subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', '-r', 'requirements.txt'], check=True)

        # Install FAISS based on CUDA availability
        try:
            import torch
            if torch.cuda.is_available():
                print("GPU detected, installing faiss-gpu-cu12...")
                subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'faiss-gpu-cu12'], check=True)
                print('Installed faiss-gpu-cu12 (CUDA 12 compatible)')
            else:
                print("No GPU detected, installing faiss-cpu...")
                subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'faiss-cpu'], check=True)
                print('Installed faiss-cpu')
        except Exception as e:
            # Covers a missing torch as well as a failed GPU wheel install;
            # a failure of the fallback itself is handled by the outer handler.
            print(f'FAISS install error: {e}')
            subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'faiss-cpu'], check=True)
            print('Fallback: Installed faiss-cpu')

        # Use the absolute repository path: a relative '.' entry stops
        # pointing at the repository as soon as the working directory changes.
        repo_root = os.getcwd()
        if repo_root not in sys.path:
            sys.path.append(repo_root)

        print("Setup complete! Ready to run AIC FTML pipeline")
        print(f"Current directory: {repo_root}")

        return True

    except subprocess.CalledProcessError as e:
        print(f"Error during setup: {e}")
        return False
    except Exception as e:
        print(f"Unexpected error: {e}")
        return False

# Kick off the setup and tell the user whether it is safe to continue.
setup_ok = setup_repository()
print(
    "\nYou can now proceed with the pipeline!"
    if setup_ok
    else "\nSetup failed. Please check the errors above."
)

## Host Inference – One-shot
Set `ARTIFACTS_BUNDLE_URL` and/or `RERANKER_MODEL_URL` if the artifacts bundle or reranker model is not already present in `./artifacts`.
This writes a Top-100 CSV into `submissions/`.

In [None]:
# Host Inference - Automated Setup and Query Execution
import os
import subprocess
import pathlib

# Configuration - Update these URLs with your hosted models
# These values are passed to run_inference_query() at the bottom of this cell.
QUERY = 'a person opening a laptop'  # Change this to your search query
QUERY_ID = 'q1'  # Official query id for filename submissions/{query_id}.csv
TASK = 'kis'     # 'kis' or 'vqa'
ANSWER = ''      # Required if TASK='vqa'
# Empty strings mean the corresponding download flag is not passed to the script.
ARTIFACTS_BUNDLE_URL = ''  # e.g., 'https://your-host.com/artifacts_bundle.tar.gz'
RERANKER_MODEL_URL = ''    # e.g., 'https://your-host.com/reranker.joblib'

def run_inference_query(query, bundle_url='', model_url='', query_id='', task='kis', answer=''):
    """Run ``src/retrieval/use.py`` for one query and preview the CSV results.

    Args:
        query: Search text forwarded as ``--query``.
        bundle_url: Optional URL forwarded as ``--bundle_url`` when non-empty.
        model_url: Optional URL forwarded as ``--model_url`` when non-empty.
        query_id: Optional id forwarded as ``--query_id`` when non-empty.
        task: Task name forwarded as ``--task``; ``'vqa'`` requires ``answer``.
        answer: Answer text forwarded as ``--answer`` when ``task == 'vqa'``.

    Returns:
        bool: True if the script exited with status 0; False if the script is
        missing, a VQA answer is missing, the script failed, or an unexpected
        error occurred. Problems are printed rather than raised.
    """
    # Imported here so the cell does not depend on an earlier cell's imports.
    import sys

    try:
        # Ensure we're in the right directory
        if not pathlib.Path('src/retrieval/use.py').exists():
            print('Missing use.py script. Make sure setup completed successfully.')
            return False

        # Use the interpreter running this notebook: it is the one the
        # dependencies were installed into, whereas a bare 'python' depends
        # on whatever happens to be first on PATH.
        cmd = [sys.executable, 'src/retrieval/use.py', '--query', query, '--task', task]
        if query_id:
            cmd.extend(['--query_id', query_id])
        if task == 'vqa':
            if not answer:
                print('For TASK=vqa you must set ANSWER.')
                return False
            cmd.extend(['--answer', answer])

        if bundle_url:
            cmd.extend(['--bundle_url', bundle_url])
            print(f'Will download artifacts bundle from: {bundle_url}')

        if model_url:
            cmd.extend(['--model_url', model_url])
            print(f'Will download reranker model from: {model_url}')

        # Create submissions directory if it doesn't exist
        os.makedirs('submissions', exist_ok=True)

        print(f"Running query: '{query}' (task={task}, qid={query_id})")
        print('Command:', ' '.join(cmd))

        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode != 0:
            print('Query execution failed!')
            print('Error output:\n' + result.stderr)
            # Scripts often report the real cause on stdout; do not hide it.
            if result.stdout:
                print('Output:\n' + result.stdout)
            return False

        print('Query execution successful!')
        if result.stdout:
            print('Output:\n' + result.stdout)

        # List every CSV currently in submissions/ (not only this query's).
        csv_files = list(pathlib.Path('submissions').glob('*.csv'))
        if csv_files:
            print('\nGenerated ' + str(len(csv_files)) + ' result file(s):')
            for csv_file in csv_files:
                print(f'  - {csv_file}')
                try:
                    with open(csv_file, 'r', encoding='utf-8') as f:
                        # zip() stops after five lines, so large result
                        # files are not loaded into memory for a preview.
                        preview = [line for _, line in zip(range(5), f)]
                    print('    Preview (first 5 lines):')
                    for i, line in enumerate(preview, 1):
                        print(f'    {i}: {line.strip()}')
                except (OSError, UnicodeError) as e:
                    print(f'    (Could not preview: {e})')
        return True

    except Exception as e:
        print(f'Error running inference: {e}')
        return False

# Execute the configured query and summarise the outcome for the user.
print('Starting AIC FTML Host Inference...')
success = run_inference_query(
    query=QUERY,
    bundle_url=ARTIFACTS_BUNDLE_URL,
    model_url=RERANKER_MODEL_URL,
    query_id=QUERY_ID,
    task=TASK,
    answer=ANSWER,
)

print(
    '\nInference completed! Check the submissions/ folder for results.'
    if success
    else '\nInference failed. Check the error messages above.'
)

## Dev Pipeline – Build Artifacts (Optional)
Downloads dataset archives using `AIC_2025_dataset_download_link.csv`, builds index/corpus, optionally trains reranker, and assembles `my_pipeline/`.

In [None]:
# Dev Pipeline - Automated Dataset Download and Processing
import os
import subprocess
import pathlib
import time
import csv
import tempfile

# Settings for the development pipeline
DATASET_ROOT = '/content/aic2025'
CSV_FILE = 'AIC_2025_dataset_download_link.csv'  # Update path if different
# TEST_MODE = True  # Uncomment to enable test mode (only downloads L21-L24)
VIDEOS = [f'L{number}' for number in range(21, 31)]  # L21 .. L30; adjust if needed

# Shrink the video list only when TEST_MODE has been defined and is truthy;
# an undefined TEST_MODE leaves the full list in place.
if globals().get('TEST_MODE', False):
    VIDEOS = ['L21', 'L22', 'L23', 'L24']
    print("TEST MODE ENABLED: Only downloading L21-L24")

def filter_csv_for_videos(csv_path, video_list, output_path):
    """Write a copy of ``csv_path`` that keeps only rows for the given videos.

    The first row is treated as the header and copied through when non-empty.
    A data row is kept when its second-to-last column contains any entry of
    ``video_list`` as a case-insensitive substring. Blank rows are dropped.

    Args:
        csv_path: Path of the CSV file to read.
        video_list: Video identifiers to look for (e.g. ``['L21', 'L22']``).
        output_path: Path the filtered CSV is written to (overwritten).

    Returns:
        bool: False when ``csv_path`` does not exist (nothing is written),
        True otherwise.
    """
    if not pathlib.Path(csv_path).exists():
        return False

    # Upper-case the targets once so ids such as 'l21' match as well; the
    # filename is upper-cased before comparison.
    targets = [str(vid).upper() for vid in video_list]

    kept_rows = []
    # newline='' is what the csv module expects for correct quoted-newline handling.
    with open(csv_path, 'r', newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        header = next(reader, None)

        for row in reader:
            if not row:
                continue
            # NOTE(review): the filename is assumed to live in the
            # second-to-last column - confirm against the CSV layout.
            filename = row[-2].strip().upper() if len(row) >= 2 else ""
            if any(target in filename for target in targets):
                kept_rows.append(row)

    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if header:
            writer.writerow(header)
        writer.writerows(kept_rows)

    # Count data rows directly; deriving the count from "rows minus header"
    # reports -1 for an empty input file.
    print(f"Filtered CSV: {len(kept_rows)} entries for videos {video_list}")
    return True

def _run_pipeline_step(cmd, failure_label):
    """Run one pipeline command; on a non-zero exit print the label with stderr and return False."""
    print(f"Command: {' '.join(cmd)}")
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print(f"{failure_label}: {result.stderr}")
        return False
    return True


def run_dev_pipeline():
    """Run the development pipeline: download dataset, build index, build text corpus.

    Reads the module-level ``CSV_FILE``, ``VIDEOS`` and ``DATASET_ROOT``
    settings. The download step is skipped when ``CSV_FILE`` does not exist.
    When a CUDA device is detected through torch, ``--flat`` is passed to the
    index script.

    Returns:
        bool: True when all executed steps succeeded, False otherwise.
        Failures are printed rather than raised.
    """
    # Imported here so the cell does not depend on an earlier cell's imports.
    import sys

    steps = [
        "Download dataset",
        "Build search index",
        "Build text corpus"
    ]
    total = len(steps)
    # Spawn helper scripts with the interpreter running this notebook, i.e.
    # the one the dependencies were installed into.
    python = sys.executable

    # Check if GPU is available for flat indexing
    use_gpu = False
    try:
        import torch
        use_gpu = torch.cuda.is_available()
        if use_gpu:
            print("GPU detected - will build flat index for GPU acceleration")
        else:
            print("No GPU detected - building HNSW index for CPU")
    except Exception:
        print("Could not detect GPU - building HNSW index for CPU")

    try:
        # Step 1: Download dataset
        print(f"Step 1/{total}: {steps[0]}")
        if pathlib.Path(CSV_FILE).exists():
            # Reserve a temp path for the filtered CSV; the file is closed
            # right away and rewritten by filter_csv_for_videos.
            with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as tmp_csv:
                filtered_csv_path = tmp_csv.name

            try:
                if not filter_csv_for_videos(CSV_FILE, VIDEOS, filtered_csv_path):
                    print("Failed to filter CSV file")
                    return False
                downloaded = _run_pipeline_step(
                    [
                        python, 'scripts/dataset_downloader.py',
                        '--dataset_root', DATASET_ROOT,
                        '--csv', filtered_csv_path,
                        '--skip-existing'
                    ],
                    "Dataset download failed",
                )
            finally:
                # Always remove the temp file, including on the early return
                # and when the subprocess call raises.
                try:
                    os.unlink(filtered_csv_path)
                except OSError:
                    pass

            if not downloaded:
                return False
            print("Dataset download completed")
        else:
            print(f"CSV file {CSV_FILE} not found. Skipping dataset download.")

        # Step 2: Build index
        print(f"\nStep 2/{total}: {steps[1]}")
        cmd = [
            python, 'scripts/index.py',
            '--dataset_root', DATASET_ROOT,
            '--videos'
        ] + VIDEOS

        # Add --flat flag for GPU compatibility
        if use_gpu:
            cmd.append('--flat')
            print("Building flat index for GPU acceleration")

        if not _run_pipeline_step(cmd, "Index building failed"):
            return False
        print("Search index built successfully")

        # Step 3: Build text corpus
        print(f"\nStep 3/{total}: {steps[2]}")
        cmd = [
            python, 'scripts/build_text.py',
            '--dataset_root', DATASET_ROOT,
            '--videos'
        ] + VIDEOS
        if not _run_pipeline_step(cmd, "Text corpus building failed"):
            return False
        print("Text corpus built successfully")

        # Report what ended up in ./artifacts (directories are shown as 0 bytes)
        artifacts_dir = pathlib.Path('./artifacts')
        if artifacts_dir.exists():
            artifact_files = list(artifacts_dir.glob('*'))
            print(f"\nGenerated {len(artifact_files)} artifact files:")
            for artifact in artifact_files[:10]:  # Show first 10
                size = artifact.stat().st_size if artifact.is_file() else 0
                print(f"  - {artifact.name} ({size:,} bytes)")
            if len(artifact_files) > 10:
                print(f"  ... and {len(artifact_files) - 10} more files")

        return True

    except Exception as e:
        print(f"Pipeline error: {e}")
        return False

# Time one full run of the development pipeline and report how it went.
print("Starting AIC FTML Development Pipeline...")
start_time = time.time()
success = run_dev_pipeline()
elapsed_time = time.time() - start_time

print(f"\nPipeline completed in {elapsed_time:.1f} seconds")

if not success:
    print("Pipeline failed. Check error messages above.")
else:
    print(
        "Development pipeline completed successfully!\n"
        "Artifacts are ready in ./artifacts/"
    )

In [None]:
# Optional: Train Reranker Model
import os
import pathlib
import subprocess

# Configuration
# Candidate training files; train_reranker() uses the first one that exists.
TRAIN_JSONL_PATHS = ['data/train.jsonl', 'data/train_dev.jsonl', 'train.jsonl']  # Check multiple possible locations
# Path train_reranker() checks for an existing / freshly trained model.
MODEL_OUTPUT_PATH = './artifacts/reranker.joblib'

def train_reranker():
    """Train the reranker with ``src/training/train_reranker.py`` when data is available.

    Uses the first existing path from ``TRAIN_JSONL_PATHS``. If a model file
    already exists at ``MODEL_OUTPUT_PATH`` the user is asked whether to
    retrain; anything but ``y`` skips training.

    Returns:
        bool: True when training succeeded or was skipped (no training data,
        or the user kept the existing model); False when the training script
        failed or an unexpected error occurred.
    """
    # Imported here so the cell does not depend on an earlier cell's imports.
    import sys

    try:
        # Find training data: first candidate path that exists
        train_file = next(
            (path for path in TRAIN_JSONL_PATHS if pathlib.Path(path).exists()),
            None,
        )

        if train_file is None:
            print("No training JSONL found in:")
            for path in TRAIN_JSONL_PATHS:
                print(f"  - {path}")
            print("Skipping reranker training (will use fusion baseline)")
            return True

        print(f"Found training data: {train_file}")

        # Count training samples; blank lines are not samples
        try:
            with open(train_file, 'r', encoding='utf-8') as f:
                num_samples = sum(1 for line in f if line.strip())
            print(f"Training samples: {num_samples}")
        except (OSError, UnicodeError) as e:
            print(f"Could not count samples: {e}")

        # Check if model already exists
        if pathlib.Path(MODEL_OUTPUT_PATH).exists():
            try:
                response = input(f"Model already exists at {MODEL_OUTPUT_PATH}. Retrain? (y/N): ")
            except EOFError:
                # No interactive stdin: fall back to the advertised default (N)
                response = ''
            if response.strip().lower() != 'y':
                print("Skipping reranker training")
                return True

        # Train with the interpreter running this notebook, i.e. the one the
        # dependencies were installed into.
        print("Training reranker model...")
        cmd = [
            sys.executable, 'src/training/train_reranker.py',
            '--index_dir', './artifacts',
            '--train_jsonl', train_file
        ]
        print(f"Command: {' '.join(cmd)}")

        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode != 0:
            print("Reranker training failed!")
            print(f"Error: {result.stderr}")
            return False

        print("Reranker training completed successfully!")

        # The output path is not passed to the script, so verify the file is
        # really there before claiming it was saved.
        model_path = pathlib.Path(MODEL_OUTPUT_PATH)
        if model_path.exists():
            print(f"Model saved to: {MODEL_OUTPUT_PATH}")
            print(f"Model size: {model_path.stat().st_size:,} bytes")
        else:
            print(f"Warning: no model file found at {MODEL_OUTPUT_PATH}")

        if result.stdout:
            print(f"Training output:\n{result.stdout}")

        return True

    except Exception as e:
        print(f"Training error: {e}")
        return False

# Run training
print("Checking for reranker training...")
train_success = train_reranker()

if train_success:
    print("Reranker training step completed!")
else:
    print("Reranker training failed, but pipeline can continue with fusion baseline.")

In [None]:
# Assemble Pipeline and Test Query
import os
import subprocess
import pathlib
import shutil
import time

# Configuration
# Output directory passed to scripts/prepare_pipeline_dir.py via --outdir.
PIPELINE_DIR = 'my_pipeline'
# Query run inside PIPELINE_DIR after assembly.
TEST_QUERY = 'a person opening a laptop'

def assemble_and_test_pipeline():
    """Assemble ``PIPELINE_DIR`` and run ``TEST_QUERY`` inside it.

    Step 1 calls ``scripts/prepare_pipeline_dir.py`` and lists the resulting
    directory. Step 2 temporarily changes into ``PIPELINE_DIR``, runs
    ``src/retrieval/use.py`` with the test query and previews any CSV files
    under ``submissions/``. The original working directory is always restored.

    Returns:
        bool: True when assembly and the test query both exited with status 0,
        False otherwise. Failures are printed rather than raised.
    """
    # Imported here so the cell does not depend on an earlier cell's imports.
    import sys

    try:
        # Step 1: Prepare pipeline directory
        print("Assembling minimal pipeline directory...")

        # sys.executable is the interpreter the dependencies were installed
        # into; a bare 'python' depends on PATH.
        cmd = [
            sys.executable, 'scripts/prepare_pipeline_dir.py',
            '--outdir', PIPELINE_DIR,
            '--artifact_dir', './artifacts',
            '--include_model',
            '--force'
        ]
        print(f"Command: {' '.join(cmd)}")

        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            print(f"Pipeline assembly failed: {result.stderr}")
            return False

        print("Pipeline directory assembled successfully!")

        # Show pipeline contents
        pipeline_path = pathlib.Path(PIPELINE_DIR)
        if pipeline_path.exists():
            print(f"\nPipeline directory contents ({PIPELINE_DIR}/):")
            # rglob never yields the root itself, so no extra check is needed
            for item in sorted(pipeline_path.rglob('*')):
                rel_path = item.relative_to(pipeline_path)
                if item.is_file():
                    print(f"  FILE {rel_path} ({item.stat().st_size:,} bytes)")
                elif item.is_dir():
                    # Counts direct children of any kind
                    entry_count = len(list(item.glob('*')))
                    print(f"  DIR  {rel_path}/ ({entry_count} files)")

        # Step 2: Test the pipeline
        print(f"\nTesting pipeline with query: '{TEST_QUERY}'")

        # The script path and submissions/ are relative to the pipeline dir
        original_dir = os.getcwd()
        os.chdir(PIPELINE_DIR)

        try:
            cmd = [sys.executable, 'src/retrieval/use.py', '--query', TEST_QUERY]
            print(f"Command: {' '.join(cmd)}")

            start_time = time.time()
            result = subprocess.run(cmd, capture_output=True, text=True)
            elapsed_time = time.time() - start_time

            if result.returncode != 0:
                print(f"Test query failed after {elapsed_time:.1f}s!")
                print(f"Error: {result.stderr}")
                return False

            print(f"Test query completed successfully in {elapsed_time:.1f}s!")

            # Show results
            csv_files = list(pathlib.Path('submissions').glob('*.csv'))
            if csv_files:
                print(f"\nGenerated {len(csv_files)} result file(s):")
                for csv_file in csv_files:
                    print(f"  - {csv_file}")
                    try:
                        # Single streaming pass: keep the first three lines
                        # and count the rest without loading the whole file.
                        sample = []
                        total_lines = 0
                        with open(csv_file, 'r', encoding='utf-8') as f:
                            for line in f:
                                if total_lines < 3:
                                    sample.append(line)
                                total_lines += 1
                        print("    Sample results (first 3 lines):")
                        for i, line in enumerate(sample, 1):
                            print(f"    {i}: {line.strip()}")
                        print(f"    Total results: {total_lines}")
                    except (OSError, UnicodeError) as e:
                        print(f"    (Could not read file: {e})")

            if result.stdout:
                print(f"\nQuery output:\n{result.stdout}")

            return True

        finally:
            # Return to original directory
            os.chdir(original_dir)

    except Exception as e:
        print(f"Assembly/test error: {e}")
        return False

# Assemble the pipeline, run the smoke-test query, then print guidance.
print("Starting pipeline assembly and testing...")
success = assemble_and_test_pipeline()

if not success:
    print("\nPipeline assembly or testing failed. Check errors above.")
else:
    print(
        "\nPipeline assembled and tested successfully!\n"
        f"Ready-to-deploy pipeline is in: {PIPELINE_DIR}/\n"
        "\nNext steps:\n"
        f"  1. Upload {PIPELINE_DIR}/ to your deployment environment\n"
        "  2. Run queries using: python src/retrieval/use.py --query 'your search'\n"
        "  3. Find results in submissions/ folder"
    )

# Closing summary, printed regardless of the outcome above.
print(
    "\nDevelopment Summary:\n"
    f"  Pipeline directory: {PIPELINE_DIR}/\n"
    f"  Test query: '{TEST_QUERY}'\n"
    f"  Results location: {PIPELINE_DIR}/submissions/"
)
if (pathlib.Path(PIPELINE_DIR) / 'artifacts' / 'reranker.joblib').exists():
    print("  Model included: Yes")
else:
    print("  Model included: No (using fusion baseline)")

## Official Evaluation
Provide your ground truth JSON path and task.

In [None]:
# Configure evaluation
GT_PATH = 'ground_truth.json'   # update path (e.g., /content/drive/MyDrive/gt.json)
TASK_EVAL = 'kis'               # 'kis' or 'vqa' or 'trake'
NORMALIZE_ANS = False           # True to casefold VQA answers

import subprocess
import sys

# Run the evaluator with the interpreter executing this notebook (the one the
# dependencies were installed into) instead of whatever 'python' is on PATH.
cmd = [sys.executable, 'eval/evaluate.py', '--gt', GT_PATH, '--pred_dir', 'submissions', '--task', TASK_EVAL]
if NORMALIZE_ANS:
    cmd.append('--normalize_answer')
print('Evaluating:', ' '.join(cmd))
res = subprocess.run(cmd, capture_output=True, text=True)
print(res.stdout)
# A non-zero exit stops the cell with the evaluator's own status code.
if res.returncode:
    print(res.stderr)
    raise SystemExit(res.returncode)
