###Parámetros y referencias

In [0]:
# [02][1] Parameters and references: the checkpoint path is tied to the ID of the Bronze table.

# Free-text widgets as (name, default) pairs; declared in a loop to avoid repetition.
_TEXT_WIDGETS = (
    ("CATALOGO",         "workspace"),
    ("ESQUEMA_BRONCE",   "bronze_mb"),
    ("ESQUEMA_PLATA",    "silver_mb"),
    ("PREFIJO_TABLA",    "mb_"),
    ("VOLUMEN_CATALOGO", "workspace"),
    ("VOLUMEN_ESQUEMA",  "default"),
    ("VOLUMEN_LANDING",  "landing"),
    ("WATERMARK_MIN",    "0"),
)
for _name, _default in _TEXT_WIDGETS:
    dbutils.widgets.text(_name, _default)

# Closed-choice widgets.
dbutils.widgets.dropdown("QUARANTINE", "true", ["true", "false"])
dbutils.widgets.dropdown("MODO_TRIGGER", "once", ["once", "availableNow"])

# Read the widget values back into module-level settings.
_get = dbutils.widgets.get
CATALOGO      = _get("CATALOGO")
ESQ_BRONCE    = _get("ESQUEMA_BRONCE")
ESQ_PLATA     = _get("ESQUEMA_PLATA")
PREFIJO_TABLA = _get("PREFIJO_TABLA")
V_CAT         = _get("VOLUMEN_CATALOGO")
V_ESQ         = _get("VOLUMEN_ESQUEMA")
V_LAND        = _get("VOLUMEN_LANDING")
WATERMARK_MIN = int(_get("WATERMARK_MIN"))   # minutes; 0 disables the watermark downstream
USE_QUAR      = _get("QUARANTINE") == "true"
MODO_TRIGGER  = _get("MODO_TRIGGER")

# Fully-qualified table names: <catalog>.<schema>.<prefix><table>.
_bronze_ns = f"{CATALOGO}.{ESQ_BRONCE}"
_silver_ns = f"{CATALOGO}.{ESQ_PLATA}"
T_BRONCE = f"{_bronze_ns}.{PREFIJO_TABLA}events_bronce"
T_PLATA  = f"{_silver_ns}.{PREFIJO_TABLA}events_silver"
T_QUAR   = f"{_silver_ns}.{PREFIJO_TABLA}events_quarantine"

# Checkpoint versioned by the Bronze table ID: the directory name embeds the
# "id" value reported by DESCRIBE DETAIL for the Bronze table.
_bronze_detail = spark.sql(f"DESCRIBE DETAIL {T_BRONCE}").select("id").first()
bronze_id = _bronze_detail["id"]
CHK_BASE  = f"/Volumes/{V_CAT}/{V_ESQ}/{V_LAND}"
CHK_PLATA = f"{CHK_BASE}/_chk_plata_{bronze_id}"

# Show the resolved names for a quick visual check.
display({
    "T_BRONCE": T_BRONCE,
    "T_PLATA": T_PLATA,
    "T_QUAR": T_QUAR,
    "CHK_PLATA": CHK_PLATA,
})


###Transformaciones: cast y validación

In [0]:
# [02][2] Robust transformations: repair collapsed rows, cast, and split valid/invalid.
from pyspark.sql.functions import col, expr, coalesce, when, split, size, trim

# Stream the Bronze table, keeping the three raw payload columns plus ingestion metadata.
src = (spark.readStream.table(T_BRONCE)
       .select("timestamp","price","user_id","_ingest_ts","_source_file","_batch_id"))

# --- Optional repair of "collapsed" rows (for when the delimiter used in step 01 was wrong) ---
# A row counts as collapsed when price and user_id are both null and the
# timestamp column contains a ',' or ';', i.e. the whole record appears to have
# landed in the first column.
collapsed = (col("price").isNull() & col("user_id").isNull() & col("timestamp").rlike(r".*[;,].*"))
parts_c  = split(col("timestamp"), ",")
parts_sc = split(col("timestamp"), ";")

# split() returns at least one element for any non-null string, so a
# `size(...) >= 1` guard is always true and cannot tell which delimiter is
# present. Requiring >= 2 parts means the comma split is only used when a comma
# actually exists; otherwise the semicolon split supplies the timestamp, in
# line with how price and user_id are recovered below.
ts_fix = when(collapsed & (size(parts_c)  >= 2), parts_c.getItem(0)) \
         .when(collapsed & (size(parts_sc) >= 2), parts_sc.getItem(0)) \
         .otherwise(col("timestamp"))

pr_fix = when(collapsed & (size(parts_c)  >= 2), parts_c.getItem(1)) \
         .when(collapsed & (size(parts_sc) >= 2), parts_sc.getItem(1)) \
         .otherwise(col("price"))

us_fix = when(collapsed & (size(parts_c)  >= 3), parts_c.getItem(2)) \
         .when(collapsed & (size(parts_sc) >= 3), parts_sc.getItem(2)) \
         .otherwise(col("user_id"))

# Trimmed working copies; the original columns are kept for the quarantine output.
df0 = (src
       .withColumn("timestamp_fix", trim(ts_fix))
       .withColumn("price_fix",     trim(pr_fix))
       .withColumn("user_fix",      trim(us_fix))
      )


# Formats M/d/yyyy and d/M/yyyy with/without time, plus yyyy-MM-dd variants.
# coalesce() keeps the first non-null result, so the M/d/yyyy patterns take
# precedence: a d/M/yyyy pattern is only used when every M/d/yyyy attempt
# yielded null. The final call is try_to_timestamp without an explicit format.
ts_parsed = coalesce(
    expr("try_to_timestamp(timestamp_fix, 'M/d/yyyy H:m:s')"),
    expr("try_to_timestamp(timestamp_fix, 'M/d/yyyy H:m')"),
    expr("try_to_timestamp(timestamp_fix, 'M/d/yyyy')"),
    expr("try_to_timestamp(timestamp_fix, 'd/M/yyyy H:m:s')"),
    expr("try_to_timestamp(timestamp_fix, 'd/M/yyyy H:m')"),
    expr("try_to_timestamp(timestamp_fix, 'd/M/yyyy')"),
    expr("try_to_timestamp(timestamp_fix, 'yyyy-MM-dd HH:mm:ss')"),
    expr("try_to_timestamp(timestamp_fix, 'yyyy-MM-dd')"),
    expr("try_to_timestamp(timestamp_fix)")
)

# Price: try_cast is used so an unparseable value becomes null instead of failing.
price_d = expr("try_cast(price_fix AS DOUBLE)")

# Typed columns alongside the untouched raw values (the *_raw columns feed quarantine).
df = df0.select(
    ts_parsed.alias("ts"),
    price_d.alias("price"),
    col("user_fix").cast("string").alias("user_id"),
    "_ingest_ts","_source_file","_batch_id",
    col("timestamp").alias("timestamp_raw"),
    col("price").alias("price_raw"),
    col("user_id").alias("user_id_raw")
)

# Validity rules: parsed timestamp, strictly positive price, and a non-null user_id.
es_valido = (col("ts").isNotNull() & col("price").isNotNull() & (col("price") > 0) & col("user_id").isNotNull())

df_validos = df.where(es_valido).select("ts","price","user_id","_ingest_ts","_source_file","_batch_id")
# Invalid rows keep their raw values; "reason" records only the first failing
# rule, checked in the order ts -> price -> user_id.
df_invalidos = (df.where(~es_valido)
                  .select(
                      "timestamp_raw","price_raw","user_id_raw",
                      when(col("ts").isNull(), "ts_invalid")
                       .when(col("price").isNull() | (col("price") <= 0), "price_invalid")
                       .when(col("user_id").isNull(), "user_missing")
                       .otherwise("unknown").alias("reason"),
                      "_ingest_ts","_source_file","_batch_id"
                  ))

# Watermark on ts, applied only when WATERMARK_MIN is greater than zero.
if WATERMARK_MIN > 0:
    df_validos = df_validos.withWatermark("ts", f"{WATERMARK_MIN} minutes")


###Escrituras CE-safe

In [0]:
# [02][3] CE-safe writes: process whatever is pending, then finish.

# Translate the MODO_TRIGGER setting into trigger() keyword arguments once;
# any value other than "availableNow" uses the one-shot trigger.
_trigger_kwargs = {"availableNow": True} if MODO_TRIGGER == "availableNow" else {"once": True}

# Valid rows -> Silver table, appended with their own checkpoint.
writer_ok = (
    df_validos.writeStream
    .format("delta")
    .option("checkpointLocation", CHK_PLATA)
    .outputMode("append")
)
q_ok = writer_ok.trigger(**_trigger_kwargs).toTable(T_PLATA)
q_ok.awaitTermination()

# Invalid rows -> quarantine table, only when quarantine is enabled. It uses a
# separate checkpoint (the Silver checkpoint path with a "_quar" suffix) and
# starts after the Silver query has terminated.
if USE_QUAR:
    writer_bad = (
        df_invalidos.writeStream
        .format("delta")
        .option("checkpointLocation", f"{CHK_PLATA}_quar")
        .outputMode("append")
    )
    q_bad = writer_bad.trigger(**_trigger_kwargs).toTable(T_QUAR)
    q_bad.awaitTermination()

print("[INFO] Carga a Plata completada")
