# 01 - Raw (Bronze) Layer Ingestion

Ingest earthquake data from the USGS API into the bronze layer.

**Key Concepts:**
- Custom PySpark DataSource for USGS API
- Idempotent writes using Delta Lake merge
- Parameterized notebooks with widgets

**Table:** `{catalog}.{schema}.bronze_events`

## Setup

In [None]:
from datetime import datetime, timedelta, timezone

# Default ingestion window: from two days ago to one day ago, as YYYY-MM-DD.
# Both dates come from a single UTC clock reading so that
#   * they cannot straddle midnight and end up inconsistent, and
#   * they line up with the USGS event API, which treats times without an
#     explicit zone as UTC (independent of the cluster's local time zone).
_now_utc = datetime.now(timezone.utc)
default_start_date = (_now_utc - timedelta(days=2)).strftime('%Y-%m-%d')
default_end_date = (_now_utc - timedelta(days=1)).strftime('%Y-%m-%d')

In [None]:
# Notebook parameters, exposed as Databricks widgets.
# Free-text widgets are declared from a table of (name, default, label).
_text_widget_specs = [
    ("catalog", "earthquakes_dev", "Catalog"),
    ("schema", "usgs", "Schema"),
    ("start_date", "", "Start Date (YYYY-MM-DD)"),
    ("end_date", "", "End Date (YYYY-MM-DD)"),
    ("num_partitions", "8", "Number of Partitions"),
]
for _widget_name, _widget_default, _widget_label in _text_widget_specs:
    dbutils.widgets.text(_widget_name, _widget_default, _widget_label)

# The write mode is limited to a fixed set of choices, defaulting to "merge".
_write_mode_choices = ["merge", "overwrite", "append"]
dbutils.widgets.dropdown("write_mode", "merge", _write_mode_choices, "Write Mode")

In [None]:
# Read the widget values into plain variables.
# Free-text values are stripped so that stray whitespace (e.g. from a pasted
# value) neither reaches the API/table names nor defeats the blank-date check.
catalog = dbutils.widgets.get("catalog").strip()
schema = dbutils.widgets.get("schema").strip()

# A blank (or whitespace-only) date falls back to the default_* value that
# the notebook defines before the widgets are declared.
start_date = dbutils.widgets.get("start_date").strip() or default_start_date
end_date = dbutils.widgets.get("end_date").strip() or default_end_date

# Kept as a string: it is only forwarded as a DataSource option.
num_partitions = dbutils.widgets.get("num_partitions").strip()
write_mode = dbutils.widgets.get("write_mode")

# Echo the effective parameters so they show up in the run output.
print(f"Catalog: {catalog}")
print(f"Schema: {schema}")
print(f"Start Date: {start_date}")
print(f"End Date: {end_date}")
print(f"Write Mode: {write_mode}")
print(f"Partitions: {num_partitions}")

In [None]:
# Import utilities from the installed wheel
from utils.helpers import (
            get_or_create_catalog_schema,
            get_table_path,
            write_delta_table_with_cdf,
            add_metadata_columns,
            print_table_stats,
        )

from utils.datasource import register_usgs_datasource

In [None]:
# Ensure the target catalog and schema exist before any table is written.
# NOTE(review): the creation logic lives in utils.helpers; presumably this is
# a no-op when both already exist — confirm against the helper.
get_or_create_catalog_schema(spark, catalog, schema)

In [None]:
# Register the USGS DataSource on this Spark session.
# NOTE(review): presumably this is what makes the "usgs" format name used by
# the spark.read call further down resolvable — confirm in utils.datasource.
register_usgs_datasource(spark)

## Fetch Data from USGS API

In [None]:
# Log the effective ingestion date range (widget value, or the default when
# the widget was left blank) before calling the API.
print(f"Ingesting data from {start_date} to {end_date}")

In [None]:
# Configure a reader for the custom "usgs" source with the requested date
# range and partition count, then load it into a DataFrame.
usgs_reader = (
    spark.read.format("usgs")
    .option("starttime", start_date)
    .option("endtime", end_date)
    .option("numPartitions", num_partitions)
)
df_raw = usgs_reader.load()

# count() is a Spark action, so this is the point where the read is executed.
fetched_events = df_raw.count()
print(f"Fetched {fetched_events:,} events from USGS API")

In [None]:
# Preview the first 5 rows; string values longer than 50 characters are
# truncated in the printed output.
df_raw.show(5, truncate=50)

In [None]:
# Print the schema of the DataFrame returned by the DataSource.
# Per the original note, `properties` and `geometry` are expected to appear
# as MapType columns — confirm in the output below.
df_raw.printSchema()

In [None]:
# Add metadata columns via the shared helper, then print the schema again so
# the new columns are visible.
# NOTE(review): the validation query at the end reads `_ingested_at`, which is
# presumably one of the columns added here — confirm in utils.helpers.
df_raw = add_metadata_columns(df_raw)
df_raw.printSchema()

## Write to Delta Table

In [None]:
# Resolve the identifier of the target table ("bronze_events") within the
# selected catalog and schema, and log it.
# NOTE(review): the exact format of the returned value is defined by
# get_table_path; it is used below both as the write target and inside SQL.
table_path = get_table_path(catalog, schema, "bronze_events")
print(f"Target table: {table_path}")

In [None]:
# Write to Delta table (idempotent with merge, CDF enabled for downstream incremental processing).
# The write mode comes from the `write_mode` widget (merge / overwrite / append).
# The helper's return value is kept as `record_count` and is passed back to
# the caller when the notebook exits.
record_count = write_delta_table_with_cdf(
    df=df_raw,
    table_path=table_path,
    mode=write_mode,
    merge_keys=["id"],  # Event ID is unique, so it serves as the merge key
    enable_cdf=True
)

In [None]:
# Show statistics for the table that was just written.
# NOTE(review): what exactly gets printed is decided by print_table_stats in
# utils.helpers.
print_table_stats(spark, table_path)

## Verify Results

In [None]:
# Sanity check on the target table: total number of rows plus the earliest
# and latest ingestion timestamps it contains.
validation_sql = f"""
    SELECT 
        COUNT(*) as total_events,
        MIN(_ingested_at) as earliest_ingestion,
        MAX(_ingested_at) as latest_ingestion
    FROM {table_path}
"""
validation_df = spark.sql(validation_sql)
validation_df.show()

In [None]:
# Return record count for job orchestration.
# The count is passed as a string exit value; dbutils.notebook.exit ends the
# notebook run, so nothing placed after this cell would execute.
dbutils.notebook.exit(str(record_count))