In [0]:
# =========================================================
# ================= GOLD ML-READY (STREAMING) =============
# =========================================================

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import year, month, dayofmonth

In [0]:
# ================= CONFIG =================

# Storage locations: SILVER is the streaming source, GOLD is the ML-ready sink.
silver_path = "abfss://datos@mastertfm001sta.dfs.core.windows.net/silver/activos"
gold_path = "abfss://datos@mastertfm001sta.dfs.core.windows.net/gold/activos"
# The streaming checkpoint is kept in a sub-folder of the GOLD location.
checkpoint_path = f"{gold_path}/_checkpoint"

# ================= SPARK =================

# Reuse the active session (or create one) with the Delta Lake SQL extension
# and the Delta catalog registered.
spark = SparkSession.builder.config(
    "spark.sql.extensions",
    "io.delta.sql.DeltaSparkSessionExtension",
).config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
).getOrCreate()

# Session-level time zone setting for Spark SQL.
spark.conf.set("spark.sql.session.timeZone", "Europe/Madrid")

# ================= FOREACH BATCH =================

def process_batch(df, batch_id):
    """Clean one SILVER micro-batch and append it to the GOLD Delta table.

    Intended to be used as a ``foreachBatch`` sink. Rows with a null
    ``timestamp``, ``symbol``, ``close`` or ``coste_opera_h`` are dropped,
    ``year`` / ``month`` / ``day`` columns are derived from ``timestamp``, and
    the selected columns are appended to ``gold_path`` partitioned by
    ``year``, ``month`` and ``day``. A batch with no remaining rows is logged
    and skipped without writing.

    Args:
        df: Micro-batch DataFrame handed over by the streaming query.
        batch_id: Identifier of the micro-batch; used only in log messages.

    Raises:
        Exception: Any failure is logged with its traceback and re-raised so
            the streaming query fails instead of silently losing the batch.
    """
    # Columns that must be present (non-null) for a row to reach GOLD.
    required_columns = ("timestamp", "symbol", "close", "coste_opera_h")

    try:
        print(f"[GOLD] Procesando batch_id={batch_id}")

        valid_df = df
        for column_name in required_columns:
            valid_df = valid_df.filter(col(column_name).isNotNull())

        gold_df = (
            valid_df
            .withColumn("year", year(col("timestamp")))
            .withColumn("month", month(col("timestamp")))
            .withColumn("day", dayofmonth(col("timestamp")))
            .select(
                "timestamp",
                "symbol",
                "asset_class",
                "asset_name",
                "coste_opera_h",
                "close",
                "year",
                "month",
                "day",
            )
        )

        # take(1) fetches at most one row; it avoids converting the DataFrame
        # to an RDD just to test for emptiness (and works where .rdd is not
        # available, e.g. Spark Connect).
        if not gold_df.take(1):
            print(f"[GOLD] batch_id={batch_id} vacío")
            return

        # NOTE(review): a plain append is not idempotent; if the same batch is
        # re-executed after a failure its rows may be written twice. Consider
        # Delta's txnAppId/txnVersion writer options — confirm with the team.
        (
            gold_df.write
            .format("delta")
            .mode("append")
            .partitionBy("year", "month", "day")
            .save(gold_path)
        )

        print(f"[GOLD] batch_id={batch_id} escrito correctamente")

    except Exception:
        print("🔥 ERROR REAL EN GOLD FOREACH BATCH 🔥")
        import traceback
        traceback.print_exc()
        # Bare raise re-raises the active exception with its original traceback.
        raise

# ================= STREAM =================

# Streaming reader over the SILVER Delta table; "ignoreDeletes" is forwarded
# to the source as a reader option.
silver_stream = spark.readStream.load(
    silver_path,
    format="delta",
    ignoreDeletes="true",
)

# Every micro-batch is handed to process_batch; streaming progress is tracked
# under checkpoint_path.
query = silver_stream.writeStream.foreachBatch(process_batch).start(
    checkpointLocation=checkpoint_path,
)

# Block this cell until the streaming query stops or fails.
query.awaitTermination()

[GOLD] Procesando batch_id=628
[GOLD] batch_id=628 escrito correctamente
[GOLD] Procesando batch_id=629
[GOLD] batch_id=629 escrito correctamente
[GOLD] Procesando batch_id=630
[GOLD] batch_id=630 escrito correctamente
[GOLD] Procesando batch_id=631
[GOLD] batch_id=631 escrito correctamente
[GOLD] Procesando batch_id=632
[GOLD] batch_id=632 escrito correctamente
[GOLD] Procesando batch_id=633
[GOLD] batch_id=633 escrito correctamente
[GOLD] Procesando batch_id=634
[GOLD] batch_id=634 escrito correctamente
[GOLD] Procesando batch_id=635
[GOLD] batch_id=635 escrito correctamente
[GOLD] Procesando batch_id=636
[GOLD] batch_id=636 escrito correctamente
[GOLD] Procesando batch_id=637
[GOLD] batch_id=637 escrito correctamente
[GOLD] Procesando batch_id=638
[GOLD] batch_id=638 escrito correctamente
[GOLD] Procesando batch_id=639
[GOLD] batch_id=639 escrito correctamente
[GOLD] Procesando batch_id=640
[GOLD] batch_id=640 escrito correctamente
[GOLD] Procesando batch_id=641
[GOLD] batch_id=641 

com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:132)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:132)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:190)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.can

In [0]:
# output_path = "abfss://datos@mastertfm001sta.dfs.core.windows.net/export/gold_parquet"

# df_gold_single.write \
#     .mode("overwrite") \
#     .parquet(output_path)


com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:190)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:465)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data

In [0]:
# from pyspark.sql import SparkSession
# from pyspark.sql.functions import col, year, month, dayofmonth

# silver_path = "abfss://datos@mastertfm001sta.dfs.core.windows.net/silver/activos"
# gold_path = "abfss://datos@mastertfm001sta.dfs.core.windows.net/gold/activos"

# spark = (
#     SparkSession.builder
#     .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
#     .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
#     .getOrCreate()
# )

# spark.conf.set("spark.sql.session.timeZone", "Europe/Madrid")

# print("📥 Cargando histórico completo desde SILVER")

# silver_df = spark.read.format("delta").load(silver_path)

# gold_df = (
#     silver_df
#     .filter(col("timestamp").isNotNull())
#     .filter(col("symbol").isNotNull())
#     .filter(col("close").isNotNull())
#     .filter(col("coste_opera_h").isNotNull())
#     .withColumn("year", year(col("timestamp")))
#     .withColumn("month", month(col("timestamp")))
#     .withColumn("day", dayofmonth(col("timestamp")))
#     .select(
#         "timestamp",
#         "symbol",
#         "asset_class",
#         "asset_name",
#         "coste_opera_h",
#         "close",
#         "year",
#         "month",
#         "day"
#     )
# )

# print("💾 Escribiendo histórico completo en GOLD particionado")

# (
#     gold_df.write
#     .format("delta")
#     .mode("overwrite")
#     .partitionBy("year", "month", "day")
#     .save(gold_path)
# )

# print("✅ Backfill GOLD completado correctamente")


com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:190)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:465)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data