# Batch Processing Example
This notebook demonstrates how to use the CloudProcessor for batch processing data with single or multiple workers.

## Configuration
Set the project root path:

In [1]:
import os
from pathlib import Path
import json
from typing import Dict
import sys

def setup_project_path():
    """Locate the project root and make it importable.

    Starting at the current working directory, every directory on the way up
    (including the filesystem root) is checked for a ``.git`` entry or a
    ``pyproject.toml`` file. The first match is treated as the project root
    and appended to ``sys.path`` unless it is already listed, so re-running
    the cell does not accumulate duplicate entries.

    Returns:
        Path: The detected project root directory.

    Raises:
        RuntimeError: If no directory between the working directory and the
            filesystem root contains one of the root indicators.
    """
    root_indicators = ('.git', 'pyproject.toml')
    start = Path.cwd()

    # `parents` ends with the filesystem root, so the root itself is checked too.
    for candidate in (start, *start.parents):
        if any((candidate / indicator).exists() for indicator in root_indicators):
            root = str(candidate)
            if root not in sys.path:  # keep the cell idempotent across re-runs
                sys.path.append(root)
            return candidate

    raise RuntimeError(
        "Could not find project root. "
        "Please run this notebook from within the project directory."
    )

# Detect the project root once and keep it in `project_root`.
# setup_project_path() also puts that directory on sys.path
# (presumably so project-local packages can be imported later; confirm).
project_root = setup_project_path()
print(f"Project root detected at: {project_root}")

Project root detected at: /home/steffen/sign-language-translator


Set your Vertex AI configuration:

In [2]:
from models.vertex_ai.cloud_processor import CloudProcessor, MachineConfig, JobConfig
from models.vertex_ai import get_config

# Environment name passed to get_config() and used as the staging bucket suffix.
ENVIRONMENT = "dev"

try:
    vertex_ai_config = get_config(ENVIRONMENT)
except ValueError as e:
    print(f"Error: {e}")
    print("\nPlease set the required environment variables before continuing.")
    # Stop the cell here: without a configuration, `config` would be undefined
    # and the code below would fail with an unrelated NameError.
    raise

# Plain-dict view of the settings used in this notebook
config = {
    'environment': vertex_ai_config.environment,
    'project_id': vertex_ai_config.project_id,
    'region': vertex_ai_config.region,
    'staging_bucket': f"gs://{vertex_ai_config.project_id}-training-data-{ENVIRONMENT}"
}

print("Configuration loaded successfully!")
print("\nEnvironment settings:")
print(json.dumps(config, indent=2))

# Initialize the processor with the training data bucket as staging bucket
processor = CloudProcessor(
    project_id=config['project_id'],
    location=config['region'],
    staging_bucket=config['staging_bucket']
)

Environment variables loaded from .env
Loading configuration from /home/steffen/sign-language-translator/models/vertex_ai/config/dev.yaml
Configuration loaded successfully!

Environment settings:
{
  "environment": "dev",
  "project_id": "sign-lang-translator-20241029",
  "region": "europe-west3",
  "staging_bucket": "gs://sign-lang-translator-20241029-training-data-dev"
}


## Define Processing Function
First, let's define a sample processing function that will be executed on each worker:

In [3]:
# Source code of the per-file processing step, kept as a string and handed to
# processor.submit_job(processing_fn=...) in the job cells below. Because it is
# only a string, process_single_video is NOT defined in this kernel, and the
# modules it needs are imported inside the function body.
# NOTE(review): presumably CloudProcessor ships this text to the workers and
# calls process_single_video(input_path, temp_dir) there; confirm.
processing_function = """
def process_single_video(input_path: str, temp_dir: str) -> str:
    '''Sample processing function that simulates video processing.
    In practice, replace this with your actual processing logic.'''
    import time
    import os
    
    # Simulate processing time
    time.sleep(2)
    
    # Create a dummy output file
    output_path = os.path.join(temp_dir, 'processed_' + os.path.basename(input_path))
    with open(output_path, 'w') as f:
        f.write('Processed content')
    
    return output_path
"""

## Single Worker Example
Process all files sequentially with a single worker:

In [None]:
# Configure machine resources (CPU-only machine, 100 GB disk)
machine_config = MachineConfig(
    machine_type="n1-standard-4",
    disk_size_gb=100
)

# Configure job parameters.
# "DEDICATED" is used here for time-critical jobs; "SPOT" runs on preemptible
# instances, which are cheaper but may be interrupted at any time.
job_config = JobConfig(
    provisioning_model="DEDICATED",
    restart_on_failure=True,
    timeout_days=1.0
)

# Submit single-worker job.
# NOTE(review): the ${PROJECT_ID}/${ENV} placeholders in the bucket names are not
# expanded by Python; presumably CloudProcessor resolves them, or they are meant
# to be replaced by hand. Confirm.
single_worker_job = processor.submit_job(
    processing_fn=processing_function,
    input_bucket="${PROJECT_ID}-${ENV}-raw-data",  # The name of the input bucket where files to be processed are stored
    output_bucket="${PROJECT_ID}-${ENV}-landmarks",  # The name of the output bucket where processed files will be saved
    machine_config=machine_config,  # Without this, the machine settings above are never applied
    job_config=job_config,
    batch_size=10  # Process 10 files at a time
)

## Multiple Workers Example with GPU Usage
Process files in parallel using multiple GPU-equipped workers running on spot instances:

In [5]:
# Worker machine: 8 vCPUs, one T4 GPU and a 200 GB disk.
# The accelerator settings are optional and only speed up processing.
gpu_machine_config = MachineConfig(
    machine_type="n1-standard-8",
    disk_size_gb=200,
    accelerator_type="NVIDIA_TESLA_T4",
    accelerator_count=1,
)

# Spot (preemptible) capacity keeps the cost down; the job restarts on failure.
spot_job_config = JobConfig(
    provisioning_model="SPOT",
    timeout_days=2.0,
    restart_on_failure=True,
)

# Fan the work out over four parallel workers, five files per batch.
# NOTE(review): the output recorded for this cell shows Vertex AI rejecting the
# submission with "Replica count for master worker pool
# (worker_pool_specs[0].replica_count) should be 1". Presumably CloudProcessor
# puts all `workers` replicas into the first worker pool; confirm and fix there.
multi_worker_job = processor.submit_job(
    processing_fn=processing_function,
    # Bucket holding the files to be processed
    input_bucket="${PROJECT_ID}-${ENV}-raw-data",
    # Bucket receiving the processed files
    output_bucket="${PROJECT_ID}-${ENV}-landmarks",
    machine_config=gpu_machine_config,
    job_config=spot_job_config,
    workers=4,
    batch_size=5,
)

Creating CustomJob


InvalidArgument: 400 List of found errors:	1.Field: job_spec.worker_pool_specs; Message: Replica count for master worker pool (worker_pool_specs[0].replica_count) should be 1.	 [field_violations {
  field: "job_spec.worker_pool_specs"
  description: "Replica count for master worker pool (worker_pool_specs[0].replica_count) should be 1."
}
]