In [0]:
# =========================================================
# ================= GOLD ML-READY (STREAMING) =============
# =========================================================

from pyspark.sql import SparkSession
from pyspark.sql.functions import year, month, dayofmonth, col

In [0]:
# ================= CONFIG =================

# Storage root shared by the Silver, Gold and checkpoint locations below.
storage_root = "abfss://datos@mastertfm001sta.dfs.core.windows.net"

# Source table read by the stream.
silver_path = f"{storage_root}/silver/activos"
# Destination table written by the stream.
gold_path = f"{storage_root}/gold/activos"
# Location where the streaming query keeps its progress.
checkpoint_path = f"{storage_root}/checkpoints/gold_activos"

In [0]:
# ================= SPARK =================

# Register the Delta Lake SQL extension and catalog on the builder, then
# obtain the session (an already-running one is reused by getOrCreate).
session_builder = SparkSession.builder
session_builder = session_builder.config(
    "spark.sql.extensions",
    "io.delta.sql.DeltaSparkSessionExtension",
)
session_builder = session_builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)
spark = session_builder.getOrCreate()

# Session time zone override, currently disabled:
#spark.conf.set("spark.sql.session.timeZone", "Europe/Madrid")

# ================= FOREACH BATCH =================

def process_batch(df, batch_id):
    """Validate one Silver micro-batch and append it to the Gold Delta table.

    Used as the ``foreachBatch`` callback of the Silver -> Gold stream.

    Steps:
      1. Drop rows where ``timestamp``, ``symbol``, ``close`` or
         ``coste_opera_h`` is null.
      2. Derive the ``year`` / ``month`` / ``day`` partition columns from
         ``timestamp``.
      3. Keep only the Gold columns and append them to ``gold_path``,
         partitioned by ``year``, ``month`` and ``day``.

    The write carries Delta's ``txnAppId`` / ``txnVersion`` options so that a
    micro-batch replayed after a failure or restart is not appended twice.
    The application id is ``checkpoint_path``. Caveat: if the checkpoint is
    deleted and recreated at the same path, batch ids restart at 0 and Delta
    would treat them as already written, so the Gold table must be reset
    together with the checkpoint (or a different application id used).

    Args:
        df: Micro-batch DataFrame handed over by ``foreachBatch``.
        batch_id: Identifier of the micro-batch, used for logging and as the
            Delta transaction version.

    Raises:
        Exception: Any error raised while processing or writing is logged
            with its traceback and re-raised to the caller.
    """
    import traceback

    try:
        print(f"[GOLD] Procesando batch_id={batch_id}")

        gold_df = (
            df
            .filter(col("timestamp").isNotNull())
            .filter(col("symbol").isNotNull())
            .filter(col("close").isNotNull())
            .filter(col("coste_opera_h").isNotNull())
            .withColumn("year", year(col("timestamp")))
            .withColumn("month", month(col("timestamp")))
            .withColumn("day", dayofmonth(col("timestamp")))
            .select(
                "timestamp",
                "symbol",
                "asset_class",
                "asset_name",
                "coste_opera_h",
                "close",
                "year",
                "month",
                "day"
            )
        )

        # Nothing survived the null filters: skip the write entirely.
        if not gold_df.take(1):
            print(f"[GOLD] batch_id={batch_id} vacío")
            return

        (
            gold_df.write
            .format("delta")
            .mode("append")
            # Idempotency key: Delta skips a write whose (txnAppId, txnVersion)
            # pair was already committed, so replayed batches do not duplicate rows.
            .option("txnAppId", checkpoint_path)
            .option("txnVersion", batch_id)
            .partitionBy("year", "month", "day")
            .save(gold_path)
        )

        print(f"[GOLD] batch_id={batch_id} escrito correctamente")

    except Exception:
        print("🔥 ERROR REAL EN GOLD FOREACH BATCH 🔥")
        traceback.print_exc()
        # Bare raise re-raises the active exception with its original traceback.
        raise

# ================= STREAM =================

# Streaming read of the Silver Delta table, with "ignoreDeletes" enabled on the source.
silver_reader = spark.readStream.format("delta").option("ignoreDeletes", "true")
silver_stream = silver_reader.load(silver_path)

# Hand every micro-batch to process_batch and track progress in checkpoint_path.
gold_writer = silver_stream.writeStream.foreachBatch(process_batch)
gold_writer = gold_writer.option("checkpointLocation", checkpoint_path)
query = gold_writer.start()

# Block this cell until the streaming query stops or fails.
query.awaitTermination()

[GOLD] Procesando batch_id=0
[GOLD] batch_id=0 escrito correctamente
[GOLD] Procesando batch_id=1
[GOLD] batch_id=1 escrito correctamente
[GOLD] Procesando batch_id=2
[GOLD] batch_id=2 escrito correctamente
[GOLD] Procesando batch_id=3
[GOLD] batch_id=3 escrito correctamente
[GOLD] Procesando batch_id=4
[GOLD] batch_id=4 escrito correctamente
[GOLD] Procesando batch_id=5
[GOLD] batch_id=5 escrito correctamente
[GOLD] Procesando batch_id=6
[GOLD] batch_id=6 escrito correctamente
[GOLD] Procesando batch_id=7
[GOLD] batch_id=7 escrito correctamente
[GOLD] Procesando batch_id=8
[GOLD] batch_id=8 escrito correctamente
[GOLD] Procesando batch_id=9
[GOLD] batch_id=9 escrito correctamente
[GOLD] Procesando batch_id=10
[GOLD] batch_id=10 escrito correctamente
[GOLD] Procesando batch_id=11
[GOLD] batch_id=11 escrito correctamente
[GOLD] Procesando batch_id=12
[GOLD] batch_id=12 escrito correctamente
[GOLD] Procesando batch_id=13
[GOLD] batch_id=13 escrito correctamente
[GOLD] Procesando batch_id

com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:132)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:132)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:190)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.can

In [0]:
%skip
# Reset helper: recursively removes the Gold table directory and the
# streaming checkpoint directory. Destructive; the cell is skipped by default.
gold_table_dir = "abfss://datos@mastertfm001sta.dfs.core.windows.net/gold/activos"
gold_checkpoint_dir = "abfss://datos@mastertfm001sta.dfs.core.windows.net/checkpoints/gold_activos"

# Second argument True enables recursive deletion.
dbutils.fs.rm(gold_table_dir, True)
dbutils.fs.rm(gold_checkpoint_dir, True)

In [0]:
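# Disabled one-off bootstrap: writes an empty (limit(0)) Delta table to
# gold_path with the Gold columns and year/month/day partitioning.
# It uses mode("overwrite"), so running it replaces any existing Gold data.
# (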
#     spark.read
#     .format("delta")
#     .load(silver_path)
#     .limit(0)
#     .filter(col("timestamp").isNotNull())
#     .filter(col("symbol").isNotNull())
#     .filter(col("close").isNotNull())
#     .filter(col("coste_opera_h").isNotNull())
#     .withColumn("year", year(col("timestamp")))
#     .withColumn("month", month(col("timestamp")))
#     .withColumn("day", dayofmonth(col("timestamp")))
#     .select(
#         "timestamp",
#         "symbol",
#         "asset_class",
#         "asset_name",
#         "coste_opera_h",
#         "close",
#         "year",
#         "month",
#         "day"
#     )
#     .write
#     .format("delta")
#     .mode("overwrite")
#     .partitionBy("year", "month", "day")
#     .save(gold_path)
# )


com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:190)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:465)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data