In [9]:
import sys
import os
from pathlib import Path

# Make the project root importable so that `models.*` resolves from this notebook.
# The root is taken to be three directory levels above the current working
# directory, so this cell depends on where the kernel was started.
project_root = Path.cwd().parent.parent.parent

# Re-running this cell must not pile up duplicate entries: drop any earlier copy
# of the path, then put it at the front so it keeps import priority.
_project_root_entry = str(project_root)
sys.path[:] = [entry for entry in sys.path if entry != _project_root_entry]
sys.path.insert(0, _project_root_entry)

from models.vertex_ai import CloudProcessor, MachineConfig, JobConfig, get_config

# Import the processing functions from local files
from extract_landmarks import extract_landmarks
from extract_transcripts import extract_transcripts
from csv_processor import upload_csv_and_prepare_batch_data

# Load the Vertex AI settings for the "dev" environment
config = get_config("dev")

# Bucket names are derived from the project id and environment
staging_bucket_name = f"{config.project_id}-{config.environment}-staging"
data_bucket_name = f"{config.project_id}-{config.environment}-data"

# Processor bound to the configured project, region and buckets
processor = CloudProcessor(
    project_id=config.project_id,
    location=config.region,
    staging_bucket=staging_bucket_name,
    data_bucket=data_bucket_name,
)

# Job settings reused by the job submissions further down
job_config = JobConfig(
    provisioning_model="SPOT",  # Use spot instances for cost savings
    restart_on_failure=True,
    timeout_days=2.0,
)


Loading configuration from /home/steffen/sign-language-translator/models/vertex_ai/config/dev.yaml


In [10]:
# Upload the CSV and create one row file per video (~3,800 files) for batch processing
csv_file_path = "../tagesschau_sign_language_video_links.csv"

num_videos = upload_csv_and_prepare_batch_data(
    csv_file_path=csv_file_path,
    project_id=config.project_id,
    environment=config.environment,
)

print(f"\n✅ Prepared {num_videos} videos for processing!")

Reading CSV file: ../tagesschau_sign_language_video_links.csv
Found 3812 rows with valid video URLs
Uploaded CSV to gs://sign-lang-translator-20241029-dev-data/raw-data/tagesschau_sign_language_video_links.csv
Creating individual row files for batch processing...
Uploaded row 0
Uploaded row 100
Uploaded row 200
Uploaded row 300
Uploaded row 400
Uploaded row 500
Uploaded row 600
Uploaded row 700
Uploaded row 800
Uploaded row 900
Uploaded row 1000
Uploaded row 1100
Uploaded row 1200
Uploaded row 1300
Uploaded row 1400
Uploaded row 1500
Uploaded row 1600
Uploaded row 1700
Uploaded row 1800
Uploaded row 1900
Uploaded row 2000
Uploaded row 2100
Uploaded row 2200
Uploaded row 2300
Uploaded row 2400
Uploaded row 2500
Uploaded row 2600
Uploaded row 2700
Uploaded row 2800
Uploaded row 2900
Uploaded row 3000
Uploaded row 3100
Uploaded row 3200
Uploaded row 3300
Uploaded row 3400
Uploaded row 3500
Uploaded row 3600
Uploaded row 3700
Uploaded row 3800
Successfully uploaded 3812 row files to gs://s

In [None]:
# Fix existing JSON files that have NaN values
def _nonfinite_to_null(content: str):
    """Re-serialize ``content`` with bare NaN/Infinity constants turned into null.

    Args:
        content: JSON text, possibly containing the non-standard constants
            ``NaN``, ``Infinity`` or ``-Infinity``.

    Returns:
        The strict-JSON text, or ``None`` when ``content`` contains none of
        those constants and therefore needs no rewrite.

    Raises:
        json.JSONDecodeError: if ``content`` cannot be parsed at all.
    """
    import json

    replaced = []

    def _to_null(token):
        # The decoder calls this hook only for bare NaN / Infinity / -Infinity
        # tokens, so the same text inside a string value is left untouched.
        replaced.append(token)
        return None

    data = json.loads(content, parse_constant=_to_null)
    if not replaced:
        return None
    # allow_nan=False makes the encoder refuse to emit non-standard constants.
    return json.dumps(data, ensure_ascii=False, allow_nan=False)


def fix_existing_json_files(project_id: str, environment: str = "dev") -> int:
    """Rewrite JSON row files under ``csv-rows/`` so NaN/Infinity become null.

    Each ``.json`` blob in the ``{project_id}-{environment}-data`` bucket is
    parsed with a real JSON decoder rather than patched by text replacement.
    That way constants are replaced wherever they occur (object values, arrays,
    any spacing) and text inside string values is never modified. Blobs that
    contain no such constants are left alone and not re-uploaded.

    Args:
        project_id: Google Cloud project id; also the bucket name prefix.
        environment: Environment name used in the bucket name.

    Returns:
        Number of blobs that were actually rewritten and uploaded.

    Errors on a single blob are printed and the loop moves on to the next one.
    """
    import json
    from google.cloud import storage

    client = storage.Client(project=project_id)
    bucket_name = f"{project_id}-{environment}-data"
    bucket = client.bucket(bucket_name)

    # List all JSON files in csv-rows/
    blobs = list(bucket.list_blobs(prefix="csv-rows/"))
    json_blobs = [blob for blob in blobs if blob.name.endswith('.json')]

    print(f"Found {len(json_blobs)} JSON files to fix")

    fixed_count = 0
    for blob in json_blobs:
        try:
            content = blob.download_as_text()

            try:
                fixed_content = _nonfinite_to_null(content)
            except json.JSONDecodeError as e:
                print(f"Could not fix {blob.name}: {e}")
                continue

            # Nothing to repair: skip the upload and do not count the file.
            if fixed_content is None:
                continue

            blob.upload_from_string(fixed_content, content_type='application/json')
            fixed_count += 1

            if fixed_count % 100 == 0:
                print(f"Fixed {fixed_count} files")

        except Exception as e:
            # Best effort: one failing blob must not abort the whole run.
            print(f"Error processing {blob.name}: {e}")

    print(f"✅ Fixed {fixed_count} JSON files")
    return fixed_count

# Run the fix if you have existing files with NaN values
# fixed_count = fix_existing_json_files(config.project_id, config.environment)


In [None]:
# Verify JSON files are now valid
def verify_json_files(project_id: str, environment: str = "dev", sample_size: int = 5):
    """Check a random sample of ``csv-rows/*.json`` blobs for strict JSON validity.

    Python's ``json.loads`` accepts the non-standard constants ``NaN``,
    ``Infinity`` and ``-Infinity`` by default, so a plain parse would report
    such files as valid. The decoder is therefore given a hook that rejects
    those constants.

    Args:
        project_id: Google Cloud project id; also the bucket name prefix.
        environment: Environment name used in the bucket name.
        sample_size: Maximum number of blobs to check; must be at least 1.

    Returns:
        True when every sampled blob is strictly valid JSON. False when any
        sampled blob is invalid or unreadable, or when no JSON blobs exist.

    Raises:
        ValueError: if ``sample_size`` is smaller than 1.
    """
    import json
    from google.cloud import storage
    import random

    if sample_size < 1:
        raise ValueError(f"sample_size must be at least 1, got {sample_size}")

    def _reject_constant(token):
        # Invoked by the decoder for bare NaN / Infinity / -Infinity tokens.
        raise ValueError(f"non-standard JSON constant {token}")

    client = storage.Client(project=project_id)
    bucket_name = f"{project_id}-{environment}-data"
    bucket = client.bucket(bucket_name)

    # List JSON files
    blobs = list(bucket.list_blobs(prefix="csv-rows/"))
    json_blobs = [blob for blob in blobs if blob.name.endswith('.json')]

    if not json_blobs:
        print("No JSON files found")
        return False

    # Sample some files
    sample_blobs = random.sample(json_blobs, min(sample_size, len(json_blobs)))

    valid_count = 0
    for blob in sample_blobs:
        try:
            content = blob.download_as_text()
        except Exception as e:
            print(f"❌ {blob.name} - Error: {e}")
            continue

        try:
            data = json.loads(content, parse_constant=_reject_constant)
        except ValueError as e:
            # Covers json.JSONDecodeError (a ValueError subclass) and the
            # rejection raised by the constant hook above.
            print(f"❌ {blob.name} - Invalid JSON: {e}")
            continue
        except Exception as e:
            print(f"❌ {blob.name} - Error: {e}")
            continue

        valid_count += 1
        print(f"✅ {blob.name} - Valid JSON")

        # Show sample data from the first valid file; only an object has keys.
        if valid_count == 1 and isinstance(data, dict):
            print(f"   Sample keys: {list(data.keys())}")
            print(f"   Video URL: {data.get('webm', 'N/A')}")

    success_rate = valid_count / len(sample_blobs)
    print(f"\nResult: {valid_count}/{len(sample_blobs)} files are valid JSON ({success_rate:.1%})")
    return success_rate == 1.0

# Test the JSON files
# verify_json_files(config.project_id, config.environment)


In [5]:
# Run landmark extraction
landmark_machine = MachineConfig(
    machine_type="n1-standard-8",  # More powerful machine for video processing
    disk_size_gb=200,              # More disk space for video files
)
landmark_requirements = ["mediapipe", "opencv-python", "pandas", "natsort"]

# NOTE(review): `__code__.co_code` is only the raw bytecode bytes of the function
# (no constants, names or globals). Confirm that `submit_job` expects that and
# not the callable itself.
job = processor.submit_job(
    processing_fn=extract_landmarks.__code__.co_code,
    input_folder="raw-videos/",   # Where your videos are stored
    output_folder="landmarks/",   # Where to save the parquet files
    workers=4,                    # Number of parallel workers
    machine_config=landmark_machine,
    job_config=job_config,
    batch_size=1,                 # Process one video at a time per worker
    requirements=landmark_requirements,
)

Creating CustomJob
CustomJob created. Resource name: projects/788230573749/locations/europe-west3/customJobs/6187776013460570112
To use this CustomJob in another session:
custom_job = aiplatform.CustomJob.get('projects/788230573749/locations/europe-west3/customJobs/6187776013460570112')
View Custom Job:
https://console.cloud.google.com/ai/platform/locations/europe-west3/training/6187776013460570112?project=788230573749


Processing files:   0%|          | 0/4 [00:00<?, ?it/s]

CustomJob projects/788230573749/locations/europe-west3/customJobs/6187776013460570112 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/788230573749/locations/europe-west3/customJobs/6187776013460570112 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/788230573749/locations/europe-west3/customJobs/6187776013460570112 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/788230573749/locations/europe-west3/customJobs/6187776013460570112 current state:
JobState.JOB_STATE_QUEUED
CustomJob projects/788230573749/locations/europe-west3/customJobs/6187776013460570112 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/788230573749/locations/europe-west3/customJobs/6187776013460570112 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/788230573749/locations/europe-west3/customJobs/6187776013460570112 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/788230573749/locations/europe-west3/customJobs/6187776013460570112 current state:
JobS

KeyboardInterrupt: 

CustomJob projects/788230573749/locations/europe-west3/customJobs/6187776013460570112 current state:
JobState.JOB_STATE_PENDING


In [None]:
# Run transcript extraction
transcript_machine = MachineConfig(
    machine_type="n1-standard-4",
    disk_size_gb=100,
    accelerator_type="NVIDIA_TESLA_T4",  # GPU for Whisper
    accelerator_count=1,
)
transcript_requirements = ["openai-whisper", "ffmpeg-python"]

# NOTE(review): `__code__.co_code` is only the raw bytecode bytes of the function
# (no constants, names or globals). Confirm that `submit_job` expects that and
# not the callable itself.
job = processor.submit_job(
    processing_fn=extract_transcripts.__code__.co_code,
    input_folder="raw-videos/",
    output_folder="transcripts/",
    workers=4,                    # Number of parallel workers
    machine_config=transcript_machine,
    job_config=job_config,
    batch_size=1,                 # Process one video at a time per worker
    requirements=transcript_requirements,
)