# Batch Processing Example
This notebook demonstrates how to use the CloudProcessor to batch-process data with a single worker or with multiple workers.

## 1. Configuration
Locate the project root and add it to the Python path:

In [None]:
import os
from pathlib import Path
import json
from typing import Dict
import sys

def setup_project_path():
    """Locate the project root and add it to the Python import path.

    Walks upward from the current working directory and picks the first
    directory that contains a ``.git`` entry or a ``pyproject.toml`` file.
    The directory is appended to ``sys.path`` only if it is not already
    there, so re-running the cell does not accumulate duplicate entries.

    Returns:
        Path: The detected project root.

    Raises:
        RuntimeError: If no directory from the working directory up to and
            including the filesystem root contains a root indicator.
    """
    root_indicators = ('.git', 'pyproject.toml')
    start_dir = Path.cwd()

    # Path.parents runs all the way up to the filesystem root, so the root
    # directory itself is also examined.
    for candidate in (start_dir, *start_dir.parents):
        if any((candidate / indicator).exists() for indicator in root_indicators):
            root_entry = str(candidate)
            # Keep the cell idempotent: never register the same path twice.
            if root_entry not in sys.path:
                sys.path.append(root_entry)
            return candidate

    raise RuntimeError(
        "Could not find project root. "
        "Please run this notebook from within the project directory."
    )

# Detect the project root and report it. setup_project_path() also adds the
# root to sys.path, which is presumably what lets later cells import
# project-local modules.
project_root = setup_project_path()
print(f"Project root detected at: {project_root}")

Set your Vertex AI configuration:

In [None]:
from models.vertex_ai import CloudProcessor, MachineConfig, JobConfig, get_config

# Load the Vertex AI settings for the 'dev' environment
config = get_config("dev")

# Build the processor. Both bucket names are derived from the same
# "<project_id>-<environment>-<purpose>" pattern.
processor = CloudProcessor(
    data_bucket=f"{config.project_id}-{config.environment}-data",
    staging_bucket=f"{config.project_id}-{config.environment}-staging",
    location=config.region,
    project_id=config.project_id,
)


## 2. Define Processing Function
First, let's define a sample processing function that will be executed on each worker:

In [3]:
# Source code of the per-file processing function, held as a plain string and
# passed as `processing_fn` to `processor.submit_job(...)` in the job cells.
# The imports sit inside the function body so the snippet is self-contained,
# presumably because it is executed remotely on the workers (confirm against
# CloudProcessor).
processing_function = """
def process_single_video(input_path: str, temp_dir: str) -> str:
    '''Sample processing function that simulates video processing.
    In practice, replace this with your actual processing logic.'''
    import time
    import os
    
    # Simulate processing time
    time.sleep(2)
    
    # Create a dummy output file
    output_path = os.path.join(temp_dir, 'processed_' + os.path.basename(input_path))
    with open(output_path, 'w') as f:
        f.write('Processed content')
    
    return output_path
"""

## 3. Single Worker Example
Process all files sequentially with a single worker using a given batch size:

In [None]:
# Configure machine resources for the worker
machine_config = MachineConfig(
    machine_type="n1-standard-4",
    disk_size_gb=100
)

# Configure job parameters.
# "SPOT" means the job will run on preemptible instances, which are cheaper
# but may be interrupted at any time. For time-critical jobs you may use
# "DEDICATED".
job_config = JobConfig(
    provisioning_model="SPOT",
    restart_on_failure=True,
    timeout_days=1.0
)

# Submit single-worker job. No `workers` argument is given, so the default
# worker count applies (presumably one; confirm against submit_job).
single_worker_job = processor.submit_job(
    processing_fn=processing_function,
    input_folder="raw-data/",  # Folder for input
    output_folder="landmarks/",  # Folder for output
    machine_config=machine_config,  # Apply the machine resources configured above
    job_config=job_config,
    batch_size=10
)


## 4. Multiple Workers Example
Process files in parallel using multiple workers:

In [None]:
# Spot provisioning keeps costs down; the job restarts on failure and may
# run for up to two days.
spot_job_config = JobConfig(provisioning_model="SPOT", restart_on_failure=True, timeout_days=2.0)

# Larger machine for the parallel workers.
# NOTE: despite the variable name, only the machine type and disk size are
# set here; no accelerator is configured in this cell.
gpu_machine_config = MachineConfig(machine_type="n1-standard-8", disk_size_gb=200)

# Multi-worker job: four workers, reading from and writing to custom buckets
multi_worker_job = processor.submit_job(
    processing_fn=processing_function,
    workers=4,
    batch_size=5,
    machine_config=gpu_machine_config,
    job_config=spot_job_config,
    input_bucket="custom-input-bucket",    # Optional override
    input_folder="raw-data/",
    output_bucket="custom-output-bucket",  # Optional override
    output_folder="landmarks/",
)
