### Parámetros y referencias

In [0]:
# [02][1] Parameters and references (consistent with 01)
# Declares the notebook widgets, reads them back into module-level constants
# and derives the fully qualified table names and the checkpoint path.
dbutils.widgets.text("CATALOGO",        "workspace")
dbutils.widgets.text("ESQUEMA_BRONCE",  "bronze_mb")
dbutils.widgets.text("ESQUEMA_PLATA",   "silver_mb")
dbutils.widgets.text("PREFIJO_TABLA",   "mb_")
dbutils.widgets.text("VOLUMEN_CATALOGO","workspace")
dbutils.widgets.text("VOLUMEN_ESQUEMA", "default")
dbutils.widgets.text("VOLUMEN_LANDING", "landing")
dbutils.widgets.text("WATERMARK_MIN",   "0")                  # 0 = no watermark
dbutils.widgets.dropdown("QUARANTINE",  "true", ["true","false"])
dbutils.widgets.dropdown("MODO_TRIGGER","once", ["once","availableNow"])

CATALOGO        = dbutils.widgets.get("CATALOGO")
ESQ_BRONCE      = dbutils.widgets.get("ESQUEMA_BRONCE")
ESQ_PLATA       = dbutils.widgets.get("ESQUEMA_PLATA")
PREFIJO_TABLA   = dbutils.widgets.get("PREFIJO_TABLA")
V_CAT           = dbutils.widgets.get("VOLUMEN_CATALOGO")
V_ESQ           = dbutils.widgets.get("VOLUMEN_ESQUEMA")
V_LAND          = dbutils.widgets.get("VOLUMEN_LANDING")

# WATERMARK_MIN is a free-text widget, so it may be blank or non-numeric.
# A blank value is treated as 0 (no watermark); anything that is not an
# integer fails with a message that names the widget instead of a bare
# int() traceback.
_watermark_raw = dbutils.widgets.get("WATERMARK_MIN").strip()
try:
    WATERMARK_MIN = int(_watermark_raw) if _watermark_raw else 0
except ValueError as err:
    raise ValueError(
        f"WATERMARK_MIN must be an integer number of minutes, got {_watermark_raw!r}"
    ) from err

# Dropdown values are the strings "true"/"false"; convert to a real bool.
USE_QUAR        = dbutils.widgets.get("QUARANTINE") == "true"
MODO_TRIGGER    = dbutils.widgets.get("MODO_TRIGGER")

# Three-part table names: <catalog>.<schema>.<prefix><table>.
T_BRONCE = f"{CATALOGO}.{ESQ_BRONCE}.{PREFIJO_TABLA}events_bronce"
T_PLATA  = f"{CATALOGO}.{ESQ_PLATA}.{PREFIJO_TABLA}events_silver"
T_QUAR   = f"{CATALOGO}.{ESQ_PLATA}.{PREFIJO_TABLA}events_quarantine"
# Streaming checkpoint directory, placed inside the landing volume.
CHK_PLATA= f"/Volumes/{V_CAT}/{V_ESQ}/{V_LAND}/_chk_plata"

# Show the resolved names so the run configuration is visible in the output.
display({"T_BRONCE": T_BRONCE, "T_PLATA": T_PLATA, "T_QUAR": T_QUAR, "CHK_PLATA": CHK_PLATA})


### Transformaciones: cast y validación

In [0]:
# [02][2] Quality transformations:
# - Convert timestamp -> ts (TIMESTAMP), price -> DOUBLE, user_id -> STRING
# - Define a minimal validity rule and split valid / invalid rows
#   (writing the invalid rows to quarantine is optional)

from pyspark.sql.functions import (
    col,
    current_timestamp,
    expr,
    isnan,
    to_timestamp,
    when,
)

# Stream the bronze table, keeping the business columns plus the ingestion
# metadata columns that are carried through to both outputs.
src = (spark.readStream.table(T_BRONCE)
       .select("timestamp","price","user_id","_ingest_ts","_source_file","_batch_id"))

# try_cast returns NULL for values that cannot be converted, regardless of the
# session's ANSI setting. A plain cast / to_timestamp raises on malformed input
# when ANSI mode is enabled, which would fail the stream instead of routing the
# row to quarantine.
df = (src
      .withColumn("ts", expr("try_cast(`timestamp` AS TIMESTAMP)"))
      .withColumn("price_d", expr("try_cast(`price` AS DOUBLE)"))
      .withColumn("user_id_s", col("user_id").cast("string")))

# Spark orders NaN above every other numeric value, so `price_d > 0` alone
# accepts NaN; it has to be rejected explicitly.
precio_invalido = (col("price_d").isNull()
                   | isnan(col("price_d"))
                   | (col("price_d") <= 0))

# A row is valid when the timestamp parsed, the price is a real positive
# number and the user id is present.
valido = (col("ts").isNotNull()
          & col("price_d").isNotNull()
          & ~isnan(col("price_d"))
          & (col("price_d") > 0)
          & col("user_id_s").isNotNull())

# Valid rows: expose the typed columns under their final names.
df_validos = (df.where(valido)
                .select(col("ts").alias("ts"),
                        col("price_d").alias("price"),
                        col("user_id_s").alias("user_id"),
                        "_ingest_ts","_source_file","_batch_id"))

# Invalid rows: keep the raw values for inspection and record the first
# failing rule (checked in the order ts, price, user_id) as `reason`.
df_invalidos = (df.where(~valido)
                  .select(col("timestamp").alias("timestamp_raw"),
                          col("price").alias("price_raw"),
                          col("user_id").alias("user_id_raw"),
                          when(col("ts").isNull(), "ts_invalid")
                           .when(precio_invalido, "price_invalid")
                           .when(col("user_id_s").isNull(), "user_missing")
                           .otherwise("unknown").alias("reason"),
                          "_ingest_ts","_source_file","_batch_id"))

# The watermark is only declared when the widget asks for one (0 = disabled).
# NOTE(review): no stateful operator is visible in this section, so the
# watermark presumably only matters if a later step adds one - confirm.
if WATERMARK_MIN > 0:
    df_validos = df_validos.withWatermark("ts", f"{WATERMARK_MIN} minutes")


### Escrituras CE-safe (trigger once / availableNow)

In [0]:
# [02][3] CE-safe writes: process whatever is pending, then stop.
# Note: ProcessingTime triggers are not available on Community, so the run
# uses `once` or `availableNow` depending on the MODO_TRIGGER widget.

# Resolve the trigger arguments a single time; both streams share them.
_trigger_kwargs = {"availableNow": True} if MODO_TRIGGER == "availableNow" else {"once": True}

# Valid rows -> silver table, appended to Delta with its own checkpoint.
writer_ok = (
    df_validos.writeStream
    .format("delta")
    .option("checkpointLocation", CHK_PLATA)
    .outputMode("append")
)
q_ok = writer_ok.trigger(**_trigger_kwargs).toTable(T_PLATA)

# Block until the silver load has finished before starting anything else.
q_ok.awaitTermination()

# Invalid rows -> quarantine table, only when the widget enables it.
# The checkpoint path is a sibling of the silver one, with a "_quar" suffix.
if USE_QUAR:
    writer_bad = (
        df_invalidos.writeStream
        .format("delta")
        .option("checkpointLocation", CHK_PLATA + "_quar")
        .outputMode("append")
    )
    q_bad = writer_bad.trigger(**_trigger_kwargs).toTable(T_QUAR)
    q_bad.awaitTermination()

print("[INFO] Carga a Plata completada (y Quarantine si está habilitado).")
