# Building Enrichment Pipeline - Simple Job Creator

**Quick Start:**
1. Replace `data/NOS_storey_mapping.csv` with your country-specific file
2. Update `ISO3` in Step 2 (Configuration) to your three-letter country code (e.g. `IND`)
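3. Run all cells (`RUN_MODE` defaults to `"test"`, which processes only 1 tile; set it to `"full"` in Step 2 for complete country coverage)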
4. Monitor job progress

**The job will automatically:**
- Use files from `data/` folder (tsi.csv, admin boundaries already included)
- Create {ISO3}/input/, {ISO3}/output/, {ISO3}/logs/ folders
- Copy files to correct locations
- Generate full config.json with ISO3 suffixes
- Run the complete pipeline

**Note:** All required data files (except your NOS file) are already in the `data/` folder!

## Step 1: Install Required Packages

In [None]:
# Auto-install notebook dependencies
#
# Probe for the two packages this cell checks (`databricks.sdk` and `yaml`).
# If either import fails, install both with the notebook `%pip` magic and
# restart the Python process.
try:
    import databricks.sdk
    import yaml
    print("‚úÖ All dependencies available")
except ImportError:
    # Only a missing package triggers the install; any other import-time error
    # propagates unchanged.
    print("Installing packages...")
    %pip install databricks-sdk pyyaml --quiet
    # NOTE(review): restarting Python presumably clears notebook state, so
    # later cells must be (re-)run after this branch executes - confirm against
    # the Databricks documentation for `dbutils.library.restartPython`.
    dbutils.library.restartPython()

## Step 2: Configuration (EDIT THIS!)

In [None]:
# ============================================================================
# USER CONFIGURATION - Edit these values
# ============================================================================

# Run mode: "test" or "full" (case-insensitive)
# - test: Process only 1 tile with 10k grid cells for quick validation
# - full: Process all tiles for complete country coverage
RUN_MODE = "test"  # Change to "full" for production run

# Country code (CHANGE THIS for your country)
ISO3 = "IND"

# Databricks settings
CATALOG = "prp_mr_bdap_projects"
SCHEMA = "geospatialsolutions"
VOLUME_BASE = "/Volumes/prp_mr_bdap_projects/geospatialsolutions/external/jrc/data"

# Workspace path (where these scripts are located)
WORKSPACE_BASE = "/Workspace/Users/npokkiri@munichre.com/inventory_nos_db/code-for-copilot-main/mre/job1"

# ============================================================================
# Input files from data/ folder
# Just replace NOS_storey_mapping.csv in the data/ folder with your file!
# ============================================================================
PROPORTIONS_CSV = f"{WORKSPACE_BASE}/data/NOS_storey_mapping.csv"
TSI_CSV = f"{WORKSPACE_BASE}/data/tsi.csv"
ADMIN_BOUNDARIES = f"{WORKSPACE_BASE}/data/RMS_Admin0_geozones.json.gz"

# Optional: Email for notifications
EMAIL = "npokkiri@munichre.com"

# Optional: Cluster ID (leave empty to auto-detect)
CLUSTER_ID = ""  # Will auto-detect current cluster if empty

# ============================================================================
# Processing parameters (optional - defaults provided)
# ============================================================================
CELL_SIZE = 2000              # Grid cell size in meters (2km default)
DOWNLOAD_CONCURRENCY = 3      # Parallel tile downloads
MAX_WORKERS = 8               # Raster processing threads
TILE_PARALLELISM = 4          # Concurrent tile processing

# ============================================================================
# Validation - fail fast on values that would otherwise be misinterpreted
# ============================================================================
# Without this check any misspelled RUN_MODE (e.g. "tset") would silently fall
# through to the FULL branch below and launch a complete country run.
_VALID_RUN_MODES = ("test", "full")
if RUN_MODE.lower() not in _VALID_RUN_MODES:
    raise ValueError(
        f"RUN_MODE must be one of {_VALID_RUN_MODES} (case-insensitive), got {RUN_MODE!r}"
    )

# ISO3 is embedded in folder names and the output table name, so reject values
# that are not exactly three ASCII letters (stray whitespace, ISO2 codes, ...).
if not (len(ISO3) == 3 and ISO3.isascii() and ISO3.isalpha()):
    raise ValueError(f"ISO3 must be a three-letter country code, got {ISO3!r}")

# Test mode overrides (automatically set if RUN_MODE="test")
if RUN_MODE.lower() == "test":
    SAMPLE_SIZE = 10000        # Limit to 10k grid cells
    MAX_TILES = 1              # Process only 1 tile
    print("‚ö†Ô∏è  TEST MODE: Will process only 1 tile with 10k grid cells")
else:
    SAMPLE_SIZE = None         # No limit - process all
    MAX_TILES = None           # Process all tiles
    print("‚úÖ FULL MODE: Will process all tiles for complete coverage")

## Step 3: Initialize & Auto-Detect Cluster

In [None]:
import sys
from pyspark.sql import SparkSession

# Make the helper modules imported by later cells (config_generator,
# job_creator, job_monitor) resolvable from WORKSPACE_BASE.
# The membership check keeps sys.path free of duplicate entries when this cell
# is re-run; the previous `.replace("/Workspace", "/Workspace")` was a no-op
# and has been dropped.
if WORKSPACE_BASE not in sys.path:
    sys.path.insert(0, WORKSPACE_BASE)

# Initialize Spark (returns the already-running session when one exists)
spark = SparkSession.builder.getOrCreate()

# Auto-detect cluster if not specified
if not CLUSTER_ID:
    # Pass an explicit default so a missing key yields None instead of an
    # opaque Spark error, then report the problem in terms of this notebook.
    CLUSTER_ID = spark.conf.get("spark.databricks.clusterUsageTags.clusterId", None)
    if not CLUSTER_ID:
        raise RuntimeError(
            "Could not auto-detect the cluster ID from the Spark configuration; "
            "set CLUSTER_ID explicitly in Step 2."
        )
    print(f"üîç Auto-detected cluster ID: {CLUSTER_ID}")
else:
    print(f"üìå Using specified cluster ID: {CLUSTER_ID}")

print(f"‚úÖ Configuration loaded for {ISO3}")

## Step 4: Generate Minimal Config

In [None]:
from config_generator import generate_minimal_config, save_config_to_workspace

print("‚öôÔ∏è  Generating minimal configuration...")

# Gather every setting from the configuration cell into one keyword mapping
# and hand it to the helper in a single call.
config_kwargs = {
    "iso3": ISO3,
    "catalog": CATALOG,
    "schema": SCHEMA,
    "volume_base": VOLUME_BASE,
    "workspace_base": WORKSPACE_BASE,
    "proportions_csv": PROPORTIONS_CSV,
    "tsi_csv": TSI_CSV,
    "admin_boundaries": ADMIN_BOUNDARIES,
    "run_mode": RUN_MODE,
    "cell_size": CELL_SIZE,
    "download_concurrency": DOWNLOAD_CONCURRENCY,
    "max_workers": MAX_WORKERS,
    "tile_parallelism": TILE_PARALLELISM,
    "sample_size": SAMPLE_SIZE,
    "max_tiles": MAX_TILES,
}
minimal_config = generate_minimal_config(**config_kwargs)

# Persist the config through the helper; the returned path is reused when the
# job is created in the next step.
workspace_config_path = save_config_to_workspace(
    minimal_config,
    ISO3,
    WORKSPACE_BASE,
    dbutils,
)

# Report what was written and which limits apply to this run.
print("‚úÖ Minimal config created")
print(f"üìç Location: {workspace_config_path}")
print(f"üîß Run mode: {RUN_MODE.upper()}")
running_in_test_mode = RUN_MODE.lower() == "test"
if running_in_test_mode:
    # The limits are only set to numbers in test mode.
    print(f"   - Sample size: {SAMPLE_SIZE:,} grid cells")
    print(f"   - Max tiles: {MAX_TILES}")
print("\nThis will be used by Task 0 to generate full config.json")

## Step 5: Create Databricks Job

In [None]:
from job_creator import create_databricks_job

print("üî® Creating Databricks job...")

# Everything the helper needs to define the job, collected in one mapping.
job_settings = {
    "iso3": ISO3,
    "cluster_id": CLUSTER_ID,
    "workspace_base": WORKSPACE_BASE,
    "catalog": CATALOG,
    "schema": SCHEMA,
    "volume_base": VOLUME_BASE,
    "minimal_config_path": workspace_config_path,
    "email": EMAIL,
}

# The helper returns the new job's ID and name; JOB_ID is used by the
# run/monitor and summary cells.
JOB_ID, job_name = create_databricks_job(**job_settings)

for report_line in (
    "‚úÖ Job created successfully!",
    f"   Job ID: {JOB_ID}",
    f"   Job Name: {job_name}",
    "   Tasks: 8 (including Task 0 setup)",
):
    print(report_line)

## Step 6: Run Job & Monitor Progress

In [None]:
from databricks.sdk import WorkspaceClient
from job_monitor import monitor_job_progress

# Client used to trigger the job created in the previous step.
w = WorkspaceClient()

print(f"üöÄ Starting job {JOB_ID}...")

# Trigger a run and keep its ID for monitoring and for the summary cell.
run = w.jobs.run_now(job_id=JOB_ID)
RUN_ID = run.run_id

# Start confirmation followed by one blank line.
print("‚úÖ Job started!", f"   Run ID: {RUN_ID}", "", sep="\n")

# Hand over to the monitoring helper (called with a 30 update interval) and
# unpack the three values it returns.
monitor_result = monitor_job_progress(RUN_ID, update_interval=30)
success, duration, result_state = monitor_result

## Step 7: Verify Outputs

In [None]:
print("üîç Verifying outputs...\n")

# Fully qualified name of the main output table; also used by the summary cell.
output_table = f"{CATALOG}.{SCHEMA}.building_enrichment_output_{ISO3}"
exports_dir = f"{VOLUME_BASE}/{ISO3}/output/exports/FULL_{ISO3}/"

# Best-effort check: any failure while reading, counting or displaying the
# table is reported as a warning instead of stopping the notebook.
try:
    df = spark.table(output_table)
    row_count = df.count()
    print(
        f"‚úÖ Main output table exists: {output_table}",
        f"   Row count: {row_count:,}\n",
        "   Sample data:",
        sep="\n",
    )
    display(df.limit(5))
except Exception as exc:
    print(f"‚ö†Ô∏è  Could not verify table: {exc}")

print(f"\nüìä Export files location: {exports_dir}")

## Summary

In [None]:
generated_config_path = f"{VOLUME_BASE}/{ISO3}/config.json"

# Build the whole report first, then emit it line by line.
banner = "=" * 60
country_root = f"{VOLUME_BASE}/{ISO3}"
summary_lines = [
    banner,
    "PIPELINE EXECUTION SUMMARY",
    banner,
    f"Country: {ISO3}",
    f"Job ID: {JOB_ID}",
    f"Run ID: {RUN_ID}",
    "",
    f"üìÅ Data Location: {country_root}",
    f"üìä Main Output Table: {output_table}",
    f"üìÇ Exports: {country_root}/output/exports/FULL_{ISO3}/",
    f"‚öôÔ∏è  Config: {generated_config_path}",
    "",
    f"View job in Databricks UI: Workflows ‚Üí Jobs ‚Üí Building_Enrichment_{ISO3}",
    banner,
]
for summary_line in summary_lines:
    print(summary_line)