# 02 - Silver (Enriched) Layer Transformation

Transform raw earthquake data into a typed, enriched format.

**Key Concepts:**
- Schema enforcement and type casting
- Extracting from MapType columns
- Data quality and deduplication
- Incremental processing

**Source:** `{catalog}.{schema}.bronze_events`  
**Target:** `{catalog}.{schema}.silver_events`

## Setup

In [None]:
# Parameters
# Text widgets are declared as (name, default value, label).
for widget_name, default_value, widget_label in (
    ("catalog", "earthquakes_dev", "Catalog"),
    ("schema", "usgs", "Schema"),
):
    dbutils.widgets.text(widget_name, default_value, widget_label)

# The write mode dropdown offers "merge" (default) and "overwrite".
dbutils.widgets.dropdown("write_mode", "merge", ["merge", "overwrite"], "Write Mode")

In [None]:
# Read the current widget values into notebook variables
catalog, schema, write_mode = (
    dbutils.widgets.get(name) for name in ("catalog", "schema", "write_mode")
)

# Echo the effective configuration, one setting per line
print(
    f"Catalog: {catalog}",
    f"Schema: {schema}",
    f"Write Mode: {write_mode}",
    sep="\n",
)

In [None]:
from pyspark.sql.functions import (
    col, from_unixtime, current_timestamp,
    when, regexp_extract, trim, element_at
)
from pyspark.sql.types import (
    DoubleType, IntegerType, TimestampType, LongType
)

from utils.helpers import (
    get_table_path,
    write_delta_table_with_cdf,
    print_table_stats,
    table_exists,
    read_incremental_or_full,
    save_checkpoint,
    get_current_table_version
)

In [None]:
# Build the identifiers of the source, target and checkpoint tables
# via get_table_path for the selected catalog and schema
source_table, target_table, checkpoint_table = (
    get_table_path(catalog, schema, table_name)
    for table_name in ("bronze_events", "silver_events", "_checkpoints")
)

print(
    f"Source: {source_table}",
    f"Target: {target_table}",
    f"Checkpoint: {checkpoint_table}",
    sep="\n",
)

## Read Bronze Data

In [None]:
# Read incrementally using CDF (only new/changed records since last run)
# Falls back to full read on first run or if CDF not available
df, source_version, is_incremental = read_incremental_or_full(
    spark, source_table, checkpoint_table
)

# An overwrite replaces the whole silver table, so it must be fed the whole
# bronze table. Writing only the incremental slice would silently discard
# every event that did not change since the last run.
if is_incremental and write_mode == "overwrite":
    print("Write mode is 'overwrite': reading full source table instead of incremental changes")
    df = spark.table(source_table)
    is_incremental = False

# A change data feed also carries 'update_preimage' (stale row versions) and
# 'delete' rows. Only current row images may flow into silver.
# NOTE(review): read_incremental_or_full may already apply this filter; in
# that case this is a no-op.
if "_change_type" in df.columns:
    df = df.filter(col("_change_type").isin("insert", "update_postimage"))

record_count = df.count()
print(f"Records to process: {record_count:,}")

if record_count == 0:
    print("No new records to process, exiting")
    dbutils.notebook.exit("0")

# Drop CDF metadata columns if present (from incremental read).
# DataFrame.drop ignores names that are not in the schema.
cdf_cols = ["_change_type", "_commit_version", "_commit_timestamp"]
df = df.drop(*cdf_cols)

df.printSchema()

## Transform: Extract Fields from MapType Columns

The bronze layer stores `properties` and `geometry` as MapType columns. We extract specific fields.

In [None]:
# Extract all fields from properties and geometry MapType columns
# using direct map access with ["key"], casting each to its target type.
from pyspark.sql.functions import expr

# Timing - USGS uses milliseconds since epoch.
# timestamp_millis() converts the value exactly. from_unixtime() would go
# through a second-resolution string in the session time zone, which drops
# the milliseconds and is ambiguous during the DST fall-back hour.
event_time_col = expr("timestamp_millis(CAST(properties['time'] AS BIGINT))")
updated_time_col = expr("timestamp_millis(CAST(properties['updated'] AS BIGINT))")

df_typed = df.select(
    # Primary key
    col("id").alias("event_id"),

    # Magnitude info - extract from properties map
    col("properties")["mag"].cast(DoubleType()).alias("magnitude"),
    col("properties")["magType"].alias("magnitude_type"),

    # Location info
    col("properties")["place"].alias("place"),

    # Timing (see conversion note above)
    event_time_col.alias("event_time"),
    updated_time_col.alias("updated_time"),
    col("properties")["tz"].cast(IntegerType()).alias("timezone_offset"),

    # Quality metrics
    col("properties")["rms"].cast(DoubleType()).alias("rms"),
    col("properties")["gap"].cast(DoubleType()).alias("gap"),
    col("properties")["dmin"].cast(DoubleType()).alias("dmin"),
    col("properties")["nst"].cast(IntegerType()).alias("station_count"),
    col("properties")["sig"].cast(IntegerType()).alias("significance"),

    # Impact metrics
    col("properties")["felt"].cast(IntegerType()).alias("felt_reports"),
    col("properties")["cdi"].cast(DoubleType()).alias("cdi"),
    col("properties")["mmi"].cast(DoubleType()).alias("mmi"),
    col("properties")["alert"].alias("alert_level"),
    col("properties")["tsunami"].cast(IntegerType()).alias("tsunami_flag"),

    # Source info
    col("properties")["net"].alias("network"),
    col("properties")["code"].alias("event_code"),
    col("properties")["status"].alias("review_status"),

    # URLs
    col("properties")["url"].alias("detail_url"),
    col("properties")["detail"].alias("api_detail_url"),

    # Coordinates from geometry map - coordinates is a string like "[-122.5, 37.8, 10.0]"
    # It is kept as a raw string here and parsed in the next step
    col("geometry")["coordinates"].alias("coordinates_str"),
)

In [None]:
# Parse the coordinates string "[lon, lat, depth]" into three numeric columns
from pyspark.sql.functions import regexp_extract, split, expr

# Take the text between the square brackets and split it on commas.
# Example input: "[-122.5, 37.8, 10.0]"
coord_parts = split(
    regexp_extract(col("coordinates_str"), r"\[([^\]]+)\]", 1),
    ",",
)

# Position 0 = longitude, 1 = latitude, 2 = depth; each part is trimmed
# before the cast. The raw string column is dropped once it has been parsed.
df_coords = (
    df_typed
    .withColumn("longitude", trim(coord_parts[0]).cast(DoubleType()))
    .withColumn("latitude", trim(coord_parts[1]).cast(DoubleType()))
    .withColumn("depth_km", trim(coord_parts[2]).cast(DoubleType()))
    .drop("coordinates_str")
)

In [None]:
# Print the schema after coordinate parsing to verify the typed columns
df_coords.printSchema()

## Enrich: Add Derived Columns

In [None]:
# Add magnitude category
# Thresholds are checked from highest to lowest; the first match wins and
# anything that matches none of them (below 2.0, or null) is labelled "micro".
magnitude = col("magnitude")
magnitude_label = when(magnitude >= 8.0, "great")
for lower_bound, label in (
    (7.0, "major"),
    (6.0, "strong"),
    (5.0, "moderate"),
    (4.0, "light"),
    (2.0, "minor"),
):
    magnitude_label = magnitude_label.when(magnitude >= lower_bound, label)

df_enriched = df_coords.withColumn(
    "magnitude_category", magnitude_label.otherwise("micro")
)

In [None]:
# Add depth category
# Every branch is an explicit comparison, so a null depth_km matches none of
# them and depth_category stays null. With a catch-all otherwise("deep"),
# events whose depth is unknown would be reported as deep earthquakes; depth
# is not among the fields the quality filter requires to be non-null.
df_enriched = df_enriched.withColumn(
    "depth_category",
    when(col("depth_km") < 70, "shallow")
    .when(col("depth_km") < 300, "intermediate")
    .when(col("depth_km") >= 300, "deep")
)

In [None]:
# Extract region from place string
# Place format: "10km NW of San Francisco, California" -> "San Francisco, California"
# Places without " of " are used unchanged as the region.
place_col = col("place")
text_after_of = trim(regexp_extract(place_col, r" of (.+)$", 1))

df_enriched = df_enriched.withColumn(
    "region",
    when(place_col.contains(" of "), text_after_of).otherwise(place_col),
)

In [None]:
# Add alert level numeric (for sorting/filtering)
# Known levels map to 4..1; any other value, including null, maps to 0.
alert_level = col("alert_level")
alert_rank = when(alert_level == "red", 4)
for level_name, rank in (("orange", 3), ("yellow", 2), ("green", 1)):
    alert_rank = alert_rank.when(alert_level == level_name, rank)

df_enriched = df_enriched.withColumn(
    "alert_level_numeric", alert_rank.otherwise(0)
)

In [None]:
# Add boolean flags
# Each flag is a plain comparison, so it is null when its input column is null.
tsunami_flag_set = col("tsunami_flag") == 1
status_is_reviewed = col("review_status") == "reviewed"
significance_is_high = col("significance") >= 500

df_enriched = (
    df_enriched
    .withColumn("has_tsunami_warning", tsunami_flag_set)
    .withColumn("is_reviewed", status_is_reviewed)
    .withColumn("is_significant", significance_is_high)
)

In [None]:
# Add processing timestamp
processed_at = current_timestamp()
df_enriched = df_enriched.withColumn("_processed_at", processed_at)

## Data Quality: Filter and Deduplicate

In [None]:
# Filter out records with missing critical fields
# A row is kept only when every column listed here is non-null.
required_columns = ("event_id", "magnitude", "latitude", "longitude", "event_time")

all_present = col(required_columns[0]).isNotNull()
for column_name in required_columns[1:]:
    all_present = all_present & col(column_name).isNotNull()

df_clean = df_enriched.filter(all_present)

print(f"Records after filtering: {df_clean.count():,}")

In [None]:
# Deduplicate by event_id, keeping the most recently updated
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, desc

# Number the rows of each event from newest to oldest update
window = Window.partitionBy("event_id").orderBy(col("updated_time").desc())
ranked = df_clean.withColumn("_row_num", row_number().over(window))

# Row number 1 is the latest version; the helper column is dropped afterwards
df_deduped = ranked.where(col("_row_num") == 1).drop("_row_num")

print(f"Records after deduplication: {df_deduped.count():,}")

In [None]:
# Preview enriched data: the first 10 rows of the key columns, untruncated
preview_columns = [
    "event_id",
    "magnitude",
    "magnitude_category",
    "place",
    "region",
    "latitude",
    "longitude",
    "depth_km",
    "depth_category",
    "event_time",
]
df_deduped.select(*preview_columns).show(10, truncate=False)

## Write to Silver Table

In [None]:
# Write to silver table (with CDF enabled for downstream gold processing)
# Rows are keyed on event_id; the write mode comes from the notebook widget.
write_options = {
    "df": df_deduped,
    "table_path": target_table,
    "mode": write_mode,
    "merge_keys": ["event_id"],
    "enable_cdf": True,
}
record_count = write_delta_table_with_cdf(**write_options)

# Save checkpoint so next run only processes new data.
# This runs after the write, so the checkpoint is only recorded once the
# write call has returned.
save_checkpoint(spark, checkpoint_table, source_table, source_version, record_count)

In [None]:
# Show table statistics for the silver table that was just written
print_table_stats(spark, target_table)

## Verify Results

In [None]:
# Summary statistics over the whole silver table: event and region counts,
# average and maximum magnitude, and how many events are flagged as
# significant or carry a tsunami warning
summary_query = f"""
    SELECT 
        COUNT(*) as total_events,
        COUNT(DISTINCT region) as unique_regions,
        ROUND(AVG(magnitude), 2) as avg_magnitude,
        MAX(magnitude) as max_magnitude,
        SUM(CASE WHEN is_significant THEN 1 ELSE 0 END) as significant_events,
        SUM(CASE WHEN has_tsunami_warning THEN 1 ELSE 0 END) as tsunami_warnings
    FROM {target_table}
"""
summary_df = spark.sql(summary_query)
summary_df.show()

In [None]:
# Distribution by magnitude category: event count and average significance
# per category, ordered by average significance (highest first)
distribution_query = f"""
    SELECT 
        magnitude_category,
        COUNT(*) as count,
        ROUND(AVG(significance), 0) as avg_significance
    FROM {target_table}
    GROUP BY magnitude_category
    ORDER BY avg_significance DESC
"""
distribution_df = spark.sql(distribution_query)
distribution_df.show()

In [None]:
# Return record count to the caller as the notebook's exit value.
# record_count is the value returned by write_delta_table_with_cdf above
# (presumably the number of rows written - confirm against the helper).
dbutils.notebook.exit(str(record_count))