# Process Tagesschau Sign Language Videos

This notebook demonstrates how to process videos from the `tagesschau_sign_language_video_links.csv` file using the cloud processing pipeline.

## Workflow:
1. Upload CSV file to Google Cloud Storage
2. Prepare individual row files for batch processing
3. Run cloud processing job to extract landmarks and transcripts
4. Monitor results


In [1]:
import sys
import os
import inspect
from pathlib import Path

# Put the repository root at the front of sys.path so the `models` package
# can be imported. The root is taken to be three levels above the current
# working directory (assumes the kernel was started in this notebook's folder).
project_root = Path.cwd().parent.parent.parent
sys.path.insert(0, os.fspath(project_root))

from models.vertex_ai import CloudProcessor, MachineConfig, JobConfig, get_config

# Import the new processing functions
from csv_processor import upload_csv_and_prepare_batch_data, list_csv_rows
from process_csv_row import process_csv_row

# Load the Vertex AI settings for the "dev" environment and echo the target
# project and region so it is obvious where the job will be submitted.
config = get_config("dev")
print(
    f"Project ID: {config.project_id}",
    f"Region: {config.region}",
    sep="\n",
)


Environment variables loaded from .env
Loading configuration from /home/steffen/sign-language-translator/models/vertex_ai/config/dev.yaml
Project ID: sign-lang-translator-20241029
Region: europe-west3


In [None]:
# Location of the video-link CSV, relative to the kernel's working directory
csv_file_path = "../tagesschau_sign_language_video_links.csv"

# Hand the CSV to the upload helper together with the project/environment it
# should target; its return value is reported below as the number of videos
# prepared for processing.
print("Uploading CSV file and preparing batch data...")
upload_args = {
    "csv_file_path": csv_file_path,
    "project_id": config.project_id,
    "environment": config.environment,
}
num_videos = upload_csv_and_prepare_batch_data(**upload_args)

print(f"\n✅ Prepared {num_videos} videos for processing!")


In [2]:
# Cloud processor bound to this environment's staging and data buckets
# (bucket names follow the "<project>-<environment>-<purpose>" pattern).
processor = CloudProcessor(
    project_id=config.project_id,
    location=config.region,
    staging_bucket=f"{config.project_id}-{config.environment}-staging",
    data_bucket=f"{config.project_id}-{config.environment}-data",
)

# Job-level settings: spot provisioning for cost savings, automatic restart on
# failure, and a generous timeout because large videos take a long time.
job_config = JobConfig(
    provisioning_model="SPOT",
    restart_on_failure=True,
    timeout_days=7.0,
)

# Worker hardware: n1-standard-8 (8 vCPUs, 30GB RAM) because Whisper needs the
# extra power, plus a large disk to hold downloaded videos.
machine_config = MachineConfig(
    machine_type="n1-standard-8",
    disk_size_gb=200,
)

# Flip to False to process all videos.
# NOTE(review): in this cell the flag only changes the worker count; nothing
# here restricts how many inputs are processed - confirm that the intended
# limit is applied elsewhere.
TEST_MODE = True

if TEST_MODE:
    print("🧪 Running in TEST MODE - will process limited videos")
    workers, batch_size = 1, 1
else:
    print("🚀 Running in FULL MODE - processing all videos")
    # More workers for the full run; still one video per batch (videos are large)
    workers, batch_size = 10, 1

print(f"Workers: {workers}, Batch size: {batch_size}")


🧪 Running in TEST MODE - will process limited videos
Workers: 1, Batch size: 1


In [None]:

# Submit the processing job
print("🚀 Submitting cloud processing job...")

# The job is given the processing function as source text rather than as a
# callable. `inspect` is already imported in the notebook's first cell, so it
# is not re-imported here.
# NOTE(review): inspect.getsource returns only the text of the function itself;
# confirm process_csv_row does not rely on module-level imports or helpers
# from its defining file.
processing_fn_source = inspect.getsource(process_csv_row)

# Packages requested for the job. They are unpinned, so the versions resolved
# may differ between runs.
worker_requirements = [
    "mediapipe",
    "opencv-python",
    "pandas",
    "natsort",
    "openai-whisper",
    "ffmpeg-python",
    "requests",
]

job = processor.submit_job(
    processing_fn=processing_fn_source,
    input_folder="csv-rows/",  # Where the CSV row files are stored
    output_folder="processed-videos/",  # Where to save results
    workers=workers,
    machine_config=machine_config,
    job_config=job_config,
    batch_size=batch_size,
    requirements=worker_requirements,
)

print("\n✅ Job submitted successfully!")
print(f"Job resource name: {job.resource_name}")


🚀 Submitting cloud processing job...


Creating CustomJob
CustomJob created. Resource name: projects/788230573749/locations/europe-west3/customJobs/8941867923085983744
To use this CustomJob in another session:
custom_job = aiplatform.CustomJob.get('projects/788230573749/locations/europe-west3/customJobs/8941867923085983744')
View Custom Job:
https://console.cloud.google.com/ai/platform/locations/europe-west3/training/8941867923085983744?project=788230573749
CustomJob projects/788230573749/locations/europe-west3/customJobs/8941867923085983744 current state:
JobState.JOB_STATE_PENDING


Processing files: 100%|██████████| 5243/5243 [00:03<00:00, 1489.59it/s]

CustomJob projects/788230573749/locations/europe-west3/customJobs/8941867923085983744 current state:
JobState.JOB_STATE_PENDING


Processing files: 100%|██████████| 5243/5243 [00:08<00:00, 614.12it/s] 


CustomJob projects/788230573749/locations/europe-west3/customJobs/8941867923085983744 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/788230573749/locations/europe-west3/customJobs/8941867923085983744 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/788230573749/locations/europe-west3/customJobs/8941867923085983744 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/788230573749/locations/europe-west3/customJobs/8941867923085983744 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/788230573749/locations/europe-west3/customJobs/8941867923085983744 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/788230573749/locations/europe-west3/customJobs/8941867923085983744 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/788230573749/locations/europe-west3/customJobs/8941867923085983744 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/788230573749/locations/europe-west3/customJobs/8941867923085983744 current state:
Job