# Building Enrichment Pipeline - Simple Job Creator

**Instructions:**
1. Edit the configuration values in Step 2 below
2. Run all cells
3. Monitor job progress

**The job will automatically:**
- Create {ISO3}/input/, {ISO3}/output/, {ISO3}/logs/ folders
- Copy files to correct locations
- Generate full config.json with ISO3 suffixes
- Run the complete pipeline

## Step 1: Install Required Packages

In [None]:
# Auto-install notebook dependencies
# Probe for the two third-party packages that the later cells import; the
# install path below runs only when at least one of them is missing.
try:
    import databricks.sdk
    import yaml
    print("‚úÖ All dependencies available")
except ImportError:
    print("Installing packages...")
    # %pip is a notebook line magic, not Python syntax, so this cell can only
    # be executed by a notebook kernel.
    %pip install databricks-sdk pyyaml --quiet
    # NOTE(review): restarting Python presumably discards variables defined so
    # far, so this cell should run before the configuration cell — confirm.
    dbutils.library.restartPython()

## Step 2: Configuration (EDIT THIS!)

In [None]:
# ============================================================================
# USER CONFIGURATION - Edit these values
# ============================================================================

# Country code: three ASCII letters. It is embedded in the config file name,
# the job name, the volume sub-folder and the output table name further down.
ISO3 = "IND"
if len(ISO3) != 3 or not (ISO3.isascii() and ISO3.isalpha()):
    # Fail here rather than after a job has been created with a malformed name.
    raise ValueError(f"ISO3 must be a three-letter country code, got {ISO3!r}")

# Databricks settings
CATALOG = "prp_mr_bdap_projects"
SCHEMA = "geospatialsolutions"
# Derived from CATALOG/SCHEMA so the volume root follows them when they change.
VOLUME_BASE = f"/Volumes/{CATALOG}/{SCHEMA}/external/jrc/data"

# Input file paths (full paths)
PROPORTIONS_CSV = "/Workspace/Users/npokkiri@munichre.com/inventory_nos_db/data/IND_NOS_storey_mapping_041125.csv"
# Shared inputs live under the volume root; replace with a literal full path
# if they are stored somewhere else.
TSI_CSV = f"{VOLUME_BASE}/inputs/multipliers/tsi.csv"
ADMIN_BOUNDARIES = f"{VOLUME_BASE}/inputs/admin/RMS_Admin0_geozones.gpkg"

# Workspace path (where these scripts are located)
WORKSPACE_BASE = "/Workspace/Users/npokkiri@munichre.com/inventory_nos_db/code-for-copilot-main/mre/job1"

# Optional: Email for notifications (empty string disables notifications)
EMAIL = "npokkiri@munichre.com"

# Optional: Cluster ID (leave empty to auto-detect)
CLUSTER_ID = ""  # Will auto-detect current cluster if empty

# ============================================================================
# Processing parameters (optional - defaults provided)
# ============================================================================
CELL_SIZE = 2000              # Grid cell size in meters
DOWNLOAD_CONCURRENCY = 3      # Parallel tile downloads
MAX_WORKERS = 8               # Raster processing threads
TILE_PARALLELISM = 4          # Concurrent tile processing

## Step 3: Initialize & Auto-Detect Cluster

In [None]:
import json
import time
import yaml
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.jobs import Task, TaskDependency, SparkPythonTask, Library
from pyspark.sql import SparkSession

# Initialize the Spark session and the Databricks API client used by later cells
spark = SparkSession.builder.getOrCreate()
w = WorkspaceClient()

# Auto-detect cluster if not specified
if not CLUSTER_ID:
    # Supplying a default means an unset key yields "" instead of raising, so
    # the failure can be reported with an actionable message below.
    CLUSTER_ID = spark.conf.get("spark.databricks.clusterUsageTags.clusterId", "")
    if not CLUSTER_ID:
        raise RuntimeError(
            "Could not auto-detect the cluster ID; "
            "set CLUSTER_ID in the configuration cell."
        )
    print(f"üîç Auto-detected cluster ID: {CLUSTER_ID}")
else:
    print(f"üìå Using specified cluster ID: {CLUSTER_ID}")

print(f"‚úÖ Configuration loaded for {ISO3}")

## Step 4: Generate Minimal Config

In [None]:
# Create minimal config (like config.yaml)
# Everything here comes from the configuration cell or is a fixed default; the
# file is handed to task0_setup.py through its --minimal_config parameter.
minimal_config = {
    "project": {
        "catalog": CATALOG,
        "schema": SCHEMA,
        "volume_root": VOLUME_BASE
    },
    "country": {
        "iso3": ISO3
    },
    "inputs": {
        "proportions_csv": PROPORTIONS_CSV,
        "tsi_csv": TSI_CSV,
        "admin_boundaries": ADMIN_BOUNDARIES,
        "tile_footprint": f"{WORKSPACE_BASE}/ghsl2_0_mwd_l1_tile_schema_land.gpkg"
    },
    "params": {
        "cell_size": CELL_SIZE,
        "export_crs": "EPSG:4326",
        "target_crs": "ESRI:54009",
        "datasets": "built_c,smod",
        "download_concurrency": DOWNLOAD_CONCURRENCY,
        "download_retries": 2,
        "use_smod": True,
        "use_boundary_mask": True,
        "include_nodata": True,
        "add_percentages": False,
        "chunk_size": 10000,
        "max_workers": MAX_WORKERS,
        "tile_parallelism": TILE_PARALLELISM,
        "sample_size": 10000,
        "stage_to_local": True,
        "local_dir": "/local_disk0/raster_cache",
        "spark_tmp_dir": "/tmp/job3_grid_tmp"
    },
    "flags": {
        "dry_run": False,
        "preview": True,
        "preview_rows": 5,
        "overwrite_schema": True,
        "write_mode": "overwrite",
        "csv_infer_schema": True,
        "save_temp_csv": False,
        "save_per_tile": False
    },
    "workspace_base": WORKSPACE_BASE
}

# Save minimal config to temp location
temp_config_path = f"/tmp/minimal_config_{ISO3}.yaml"
# safe_dump refuses anything but plain YAML types, so an accidental non-plain
# value in the dict fails here instead of producing a Python-specific tag.
with open(temp_config_path, "w", encoding="utf-8") as f:
    yaml.safe_dump(minimal_config, f)

# Upload to workspace
workspace_config_path = f"{WORKSPACE_BASE}/temp_minimal_config_{ISO3}.yaml"
dbutils.fs.cp(f"file:{temp_config_path}", f"file:{workspace_config_path}", recurse=True)

print("‚úÖ Minimal config created")
print(f"üìç Location: {workspace_config_path}")
print("\nThis will be used by Task 0 to generate full config.json")

## Step 5: Create Databricks Job

In [None]:
# Imported in this cell so the notification fix below is self-contained.
from databricks.sdk.service.jobs import JobEmailNotifications

print("üî® Creating Databricks job...")

job_name = f"Building_Enrichment_{ISO3}"
requirements_path = f"{WORKSPACE_BASE}/requirements.txt"

# Config path that Task 0 will generate
generated_config_path = f"{VOLUME_BASE}/{ISO3}/config.json"

# Pipeline stages in execution order. Each key is also the script's file name
# (without ".py") under WORKSPACE_BASE, and each stage depends on the one
# listed before it.
pipeline_task_keys = [
    "task1_proportions_to_delta",
    "task2_grid_generation",
    "task3_tile_downloader",
    "task4_raster_stats",
    "task5_post_processing",
    "task6_create_views",
    "task7_export",
]

# Command-line arguments passed in addition to --config_path, per stage.
extra_parameters = {
    "task7_export": ["--iso3", ISO3],
}

# Task 0: Setup & config generation. It is the only task that receives the
# minimal YAML config; every later task reads the config.json it generates.
tasks = [
    Task(
        task_key="task0_setup",
        existing_cluster_id=CLUSTER_ID,
        spark_python_task=SparkPythonTask(
            python_file=f"{WORKSPACE_BASE}/task0_setup.py",
            parameters=["--minimal_config", workspace_config_path],
        ),
        libraries=[Library(requirements=requirements_path)],
    )
]

# Task 1-7: Pipeline tasks (use config from Task 0), chained linearly.
for stage_key in pipeline_task_keys:
    tasks.append(
        Task(
            task_key=stage_key,
            # tasks[-1] is the stage appended just before this one.
            depends_on=[TaskDependency(task_key=tasks[-1].task_key)],
            existing_cluster_id=CLUSTER_ID,
            spark_python_task=SparkPythonTask(
                python_file=f"{WORKSPACE_BASE}/{stage_key}.py",
                parameters=["--config_path", generated_config_path]
                + extra_parameters.get(stage_key, []),
            ),
            libraries=[Library(requirements=requirements_path)],
        )
    )

# Create job
# NOTE(review): this creates a new job on every run of the cell; re-running
# presumably leaves several jobs with the same name — confirm and clean up.
job = w.jobs.create(
    name=job_name,
    tasks=tasks,
    max_concurrent_runs=1,
    timeout_seconds=0,
    # The SDK serialises this argument itself, so it must be the SDK dataclass
    # rather than a plain dict.
    email_notifications=(
        JobEmailNotifications(on_success=[EMAIL], on_failure=[EMAIL])
        if EMAIL
        else None
    ),
)

JOB_ID = job.job_id
print("‚úÖ Job created successfully!")
print(f"   Job ID: {JOB_ID}")
print(f"   Job Name: {job_name}")
print(f"   Tasks: {len(tasks)} (including Task 0 setup)")

## Step 6: Run Job & Monitor Progress

In [None]:
print(f"üöÄ Starting job {JOB_ID}...")

# Seconds to wait between two status polls.
POLL_INTERVAL_SECONDS = 30

# Run-level lifecycle states after which the run will not change any more.
FINAL_LIFE_CYCLE_STATES = ("TERMINATED", "INTERNAL_ERROR", "SKIPPED")

# Run the job
run = w.jobs.run_now(job_id=JOB_ID)
RUN_ID = run.run_id

print("‚úÖ Job started!")
print(f"   Run ID: {RUN_ID}")
print("")
print("‚è≥ Monitoring job progress...")
print(f"   (Updates every {POLL_INTERVAL_SECONDS} seconds)")
print("")

# Monitor job progress: poll the run and print only what changed since the
# previous poll (run-level state and per-task state).
start_time = time.time()
last_state = None
last_task_status = {}

while True:
    run_info = w.jobs.get_run(run_id=RUN_ID)
    state = run_info.state
    # Guard against a missing state object, mirroring the per-task handling.
    life_cycle_state = (
        state.life_cycle_state.value
        if state and state.life_cycle_state
        else "PENDING"
    )
    elapsed = int(time.time() - start_time)

    # Print state changes
    if life_cycle_state != last_state:
        print(f"[{elapsed}s] Job status: {life_cycle_state}")
        last_state = life_cycle_state

    # Print task progress
    for task in run_info.tasks or []:
        task_key = task.task_key
        task_run_state = task.state
        task_state = (
            task_run_state.life_cycle_state.value
            if task_run_state and task_run_state.life_cycle_state
            else "PENDING"
        )
        task_result = (
            task_run_state.result_state.value
            if task_run_state and task_run_state.result_state
            else None
        )

        if last_task_status.get(task_key) == task_state:
            continue

        # TERMINATED only means the task stopped; the result state says
        # whether it succeeded, so a failed task must not get the success icon.
        task_failed = task_state == "TERMINATED" and task_result not in (None, "SUCCESS")
        if task_state == "RUNNING":
            status_icon = "‚è≥"
        elif task_failed:
            status_icon = "‚ùå"
        elif task_state == "TERMINATED":
            status_icon = "‚úÖ"
        else:
            status_icon = "‚è∏Ô∏è"

        detail = f" ({task_result})" if task_failed else ""
        print(f"[{elapsed}s] {status_icon} {task_key}: {task_state}{detail}")
        last_task_status[task_key] = task_state

    # Check if job is done
    if life_cycle_state in FINAL_LIFE_CYCLE_STATES:
        result_state = state.result_state.value if state.result_state else "UNKNOWN"

        print("")
        if result_state == "SUCCESS":
            print("‚úÖ Job completed successfully!")
            print(f"   Duration: {elapsed // 60}m {elapsed % 60}s")
        else:
            print(f"‚ùå Job failed with state: {result_state}")
            print(f"   Duration: {elapsed // 60}m {elapsed % 60}s")
            if state.state_message:
                print(f"   Error: {state.state_message}")
        break

    # Wait before next check
    time.sleep(POLL_INTERVAL_SECONDS)

## Step 7: Verify Outputs

In [None]:
print("üîç Verifying outputs...\n")

# Fully qualified name of the table checked below: catalog.schema.table
output_table = ".".join([CATALOG, SCHEMA, f"building_enrichment_output_{ISO3}"])

# Best-effort check: any failure while reading or showing the table is
# reported as a warning instead of stopping the notebook.
try:
    df = spark.table(output_table)
    count = df.count()
    print(
        f"‚úÖ Main output table exists: {output_table}",
        f"   Row count: {count:,}\n",
        "   Sample data:",
        sep="\n",
    )
    display(df.limit(5))
except Exception as e:
    print(f"‚ö†Ô∏è  Could not verify table: {e}")

# The export folder is only printed here, not inspected.
exports_location = f"{VOLUME_BASE}/{ISO3}/output/exports/FULL_{ISO3}/"
print(f"\nüìä Export files location: {exports_location}")

## Summary

In [None]:
# Recap of the identifiers and locations produced by the cells above,
# assembled line by line and printed in one go.
summary_rule = "=" * 60
summary_lines = [
    summary_rule,
    "PIPELINE EXECUTION SUMMARY",
    summary_rule,
    f"Country: {ISO3}",
    f"Job ID: {JOB_ID}",
    f"Run ID: {RUN_ID}",
    "",
    f"üìÅ Data Location: {VOLUME_BASE}/{ISO3}",
    f"üìä Main Output Table: {output_table}",
    f"üìÇ Exports: {VOLUME_BASE}/{ISO3}/output/exports/FULL_{ISO3}/",
    f"‚öôÔ∏è  Config: {generated_config_path}",
    "",
    f"View job in Databricks UI: Workflows ‚Üí Jobs ‚Üí {job_name}",
    summary_rule,
]
print("\n".join(summary_lines))