In [0]:
# ====================================================================
# ================= SILVER OHLCV 1m (STATEFUL STREAMING) ==============
# ====================================================================

from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, lag, when, log
from pyspark.sql.window import Window
from pyspark.sql.types import (
    StructType, StructField,
    TimestampType, StringType,
    DoubleType, IntegerType
)
import math

In [0]:
# =========================================================
# ================= SILVER OHLCV 1m (STREAMING) ===========
# =========================================================

from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import (
    col, when, from_utc_timestamp, regexp_replace
)

# ================= CONFIG =================

# Delta table locations: bronze is the streaming input, silver the output.
bronze_path = "abfss://datos@mastertfm001sta.dfs.core.windows.net/bronze/activos"
silver_path = "abfss://datos@mastertfm001sta.dfs.core.windows.net/silver/activos"

# The streaming checkpoint is kept in a sub-directory of the silver table path.
checkpoint_path = f"{silver_path}/_checkpoint"

# ================= SPARK =================

# Delta Lake SQL extension and catalog settings passed to the session builder.
_delta_session_conf = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

_session_builder = SparkSession.builder
for _conf_key, _conf_value in _delta_session_conf.items():
    _session_builder = _session_builder.config(_conf_key, _conf_value)

# Reuses the active session when one already exists, otherwise creates it.
spark = _session_builder.getOrCreate()

# Session time zone is UTC; process_batch shifts timestamps explicitly.
spark.conf.set("spark.sql.session.timeZone", "UTC")

# ================= ASSET DIMENSION =================
# symbol | asset_class | asset_name

# Asset catalogue grouped by asset class; each entry is (symbol, asset_name).
# Insertion order is preserved, so the flattened row order below is stable.
_assets_by_class = {
    # Stocks
    "Acciones": [
        ("TSLA", "Tesla"),
        ("NVDA", "Nvidia"),
        ("AMD", "AMD"),
        ("COIN", "Coinbase"),
        ("PLTR", "Palantir"),
        ("RIVN", "Rivian"),
        ("SHOP", "Shopify"),
        ("LCID", "Lucid Motors"),
        ("ZM", "Zoom"),
        ("SPCE", "Virgin Galactic"),
        ("KO", "Coca Cola Company"),
        ("PG", "Procter & Gamble"),
        ("JNJ", "Johnson & Johnson"),
        ("PEP", "PepsiCo"),
        ("WMT", "Walmart"),
        ("MCD", "McDonalds"),
        ("VZ", "Verizon"),
        ("DUK", "Duke Energy"),
        ("UL", "Unilever"),
        ("V", "VISA"),
    ],
    # Funds / ETFs
    "Fondos": [
        ("SPY", "SPDR S&P 500 ETF Trust"),
        ("QQQ", "Invesco QQQ Trust"),
        ("EEM", "iShares MSCI Emerging Markets ETF"),
        ("VGK", "Vanguard FTSE Europe ETF"),
        ("AGG", "iShares Core U.S. Aggregate Bond ETF"),
        ("VNQ", "Vanguard Real Estate ETF"),
        ("ARKK", "ARK Innovation ETF"),
        ("VUG", "Vanguard Growth ETF"),
        ("SCHD", "Schwab US Dividend Equity ETF"),
        ("SOXX", "iShares Semiconductor ETF"),
    ],
    # Forex pairs (symbols stored without the "=X" suffix)
    "Forex": [
        ("EURUSD", "Euro / Dólar"),
        ("USDJPY", "Dólar / Yen"),
        ("GBPUSD", "Libra / Dólar"),
        ("USDCHF", "Dólar / Franco Suizo"),
        ("AUDUSD", "Dólar Australiano / Dólar"),
        ("USDCAD", "Dólar / Dólar Canadiense"),
        ("NZDUSD", "Dólar Nueva Zelanda / Dólar"),
        ("EURGBP", "Euro / Libra"),
        ("EURJPY", "Euro / Yen"),
        ("GBPJPY", "Libra / Yen"),
    ],
    # Crypto
    "Cripto": [
        ("BTC-USD", "Bitcoin"),
        ("ETH-USD", "Ethereum"),
        ("BNB-USD", "Binance Coin"),
        ("XRP-USD", "Ripple"),
        ("SOL-USD", "Solana"),
        ("TRX-USD", "Tron"),
        ("DOGE-USD", "Dogecoin"),
        ("ADA-USD", "Cardano"),
        ("AVAX-USD", "Avalanche"),
        ("LTC-USD", "Litecoin"),
    ],
    # Commodities
    "Commodities": [
        ("GLD", "Oro"),
        ("SLV", "Plata"),
        ("PPLT", "Platino"),
        ("PALL", "Paladio"),
        ("USO", "Petróleo"),
        ("UNG", "Gas Natural"),
        ("CORN", "Maiz"),
        ("SOYB", "Soja"),
        ("WEAT", "Trigo"),
        ("CANE", "Azucar"),
    ],
}

# Flatten to positional rows in (symbol, asset_class, asset_name) order.
asset_rows = [
    Row(symbol, asset_class, asset_name)
    for asset_class, members in _assets_by_class.items()
    for symbol, asset_name in members
]

# Small lookup DataFrame joined onto every micro-batch by `symbol`.
asset_dim = spark.createDataFrame(
    asset_rows,
    schema=["symbol", "asset_class", "asset_name"],
)

# ================= FOREACH BATCH =================

def process_batch(df, batch_id):
    """Clean one bronze micro-batch, enrich it and append it to the silver table.

    Used as the ``foreachBatch`` callback of the bronze -> silver stream.

    Processing steps:
      1. Drop rows whose ``timestamp``, ``close``, ``symbol`` or ``source`` is null.
      2. Strip a trailing ``=X`` from ``symbol`` (forex tickers).
      3. Keep a single row per ``(symbol, timestamp)``.
      4. Shift ``timestamp`` from UTC to Europe/Madrid wall-clock time.
      5. Left-join ``asset_dim`` on ``symbol`` and derive ``coste_opera_h`` from
         ``asset_class`` (null when the symbol is not in the dimension).
      6. Append the result to the Delta table at ``silver_path``, partitioned
         by ``year``/``month``/``day``.

    Args:
        df: Micro-batch DataFrame. Must provide the columns selected below:
            timestamp, symbol, open, high, low, close, volume, timezone,
            source, year, month, day.
        batch_id: Micro-batch identifier supplied by the streaming engine;
            only used in log messages here.

    Raises:
        Exception: Any error is logged with its traceback and re-raised to the
            caller unchanged.
    """
    try:
        print(f"[SILVER] Procesando batch_id={batch_id}")

        # Filter BEFORE de-duplicating: if a key has one valid row and one row
        # with a null close/source, de-duplicating first could keep the invalid
        # one and the later filter would then discard the key entirely.
        valid_rows = (
            df
            .filter(col("timestamp").isNotNull())
            .filter(col("close").isNotNull())
            .filter(col("symbol").isNotNull())
            .filter(col("source").isNotNull())
        )

        # take(1) fetches at most one row, avoiding the DataFrame -> RDD
        # conversion of `.rdd.isEmpty()`. Checking here is equivalent to
        # checking after the later steps, because neither the symbol cleanup
        # nor de-duplication can turn a non-empty frame into an empty one.
        if not valid_rows.take(1):
            print(f"[SILVER] batch_id={batch_id} vacío")
            return

        df_clean = (
            valid_rows
            # Normalise forex symbols first so "EURUSD=X" and "EURUSD" are
            # recognised as the same key by the de-duplication below.
            .withColumn("symbol", regexp_replace(col("symbol"), "=X$", ""))
            # De-duplicate while the timestamp is still UTC: after the shift to
            # local time, two distinct UTC instants can share one wall-clock
            # value during the autumn DST change and must not be merged.
            .dropDuplicates(["symbol", "timestamp"])
            .withColumn(
                "timestamp",
                from_utc_timestamp(col("timestamp"), "Europe/Madrid")
            )
        )

        # Enrich with the asset dimension; unknown symbols keep null metadata.
        silver_df = (
            df_clean
            .join(asset_dim, on="symbol", how="left")
            .withColumn(
                "coste_opera_h",
                when(col("asset_class") == "Acciones", 0.01)
                .when(col("asset_class") == "Fondos", 0.0001)
                .when(col("asset_class") == "Forex", 0.0002)
                .when(col("asset_class") == "Cripto", 0.01)
                .when(col("asset_class") == "Commodities", 0.01)
                .otherwise(None)
            )
            .select(
                "timestamp",
                "symbol",
                "asset_class",
                "asset_name",
                "coste_opera_h",
                "open",
                "high",
                "low",
                "close",
                "volume",
                "timezone",
                "source",
                "year",
                "month",
                "day",
            )
        )

        # NOTE(review): a plain append is not idempotent. If this batch is
        # retried after a partial failure, its rows can be written twice.
        # Delta's `txnAppId`/`txnVersion` writer options address this, but they
        # require a new app id whenever the checkpoint is reset - confirm the
        # operational procedure before enabling them.
        (
            silver_df.write
            .format("delta")
            .mode("append")
            .partitionBy("year", "month", "day")
            .save(silver_path)
        )

        print(f"[SILVER] batch_id={batch_id} escrito correctamente")

    except Exception:
        print("🔥 ERROR REAL EN FOREACH BATCH 🔥")
        import traceback
        traceback.print_exc()
        # Bare `raise` re-raises the active exception with its original traceback.
        raise

# ================= STREAM =================

# Streaming source: the bronze Delta table, read with ignoreDeletes enabled.
_bronze_reader = spark.readStream.format("delta").option("ignoreDeletes", "true")
bronze_stream = _bronze_reader.load(bronze_path)

# Each micro-batch is handed to process_batch; progress is tracked in the
# checkpoint directory.
_silver_writer = (
    bronze_stream.writeStream
    .option("checkpointLocation", checkpoint_path)
    .foreachBatch(process_batch)
)
query = _silver_writer.start()

# Block this cell until the streaming query stops or fails.
query.awaitTermination()



[SILVER] Procesando batch_id=712
[SILVER] batch_id=712 escrito correctamente
[SILVER] Procesando batch_id=713
[SILVER] batch_id=713 escrito correctamente
[SILVER] Procesando batch_id=714
[SILVER] batch_id=714 escrito correctamente
[SILVER] Procesando batch_id=715
[SILVER] batch_id=715 escrito correctamente
[SILVER] Procesando batch_id=716
[SILVER] batch_id=716 escrito correctamente
[SILVER] Procesando batch_id=717
[SILVER] batch_id=717 escrito correctamente
[SILVER] Procesando batch_id=718
[SILVER] batch_id=718 escrito correctamente
[SILVER] Procesando batch_id=719
[SILVER] batch_id=719 escrito correctamente
[SILVER] Procesando batch_id=720
[SILVER] batch_id=720 escrito correctamente
[SILVER] Procesando batch_id=721
[SILVER] batch_id=721 escrito correctamente
[SILVER] Procesando batch_id=722
[SILVER] batch_id=722 escrito correctamente
[SILVER] Procesando batch_id=723
[SILVER] batch_id=723 escrito correctamente
[SILVER] Procesando batch_id=724
[SILVER] batch_id=724 escrito correctamente

com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:132)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:132)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:190)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.can

In [0]:
# silver_path = "abfss://datos@mastertfm001sta.dfs.core.windows.net/silver/activos"

com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:190)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:465)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data

In [0]:
# df_silver = spark.read.format("delta").load(silver_path)
# #df_silver.show(20, truncate=False)
# display(df_silver)

com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:190)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:465)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data

In [0]:
# spark.sql(f"""
# ALTER TABLE delta.`{silver_path}`
# ADD COLUMNS (coste_opera_h DOUBLE)
# """)

# spark.sql(f"""
# UPDATE delta.`{silver_path}`
# SET coste_opera_h =
#   CASE
#     WHEN asset_class = 'Acciones' THEN 0.01
#     WHEN asset_class = 'Fondos' THEN 0.0001
#     WHEN asset_class = 'Forex' THEN 0.0002
#     WHEN asset_class = 'Cripto' THEN 0.01
#     WHEN asset_class = 'Commodities' THEN 0.01
#     ELSE NULL
#   END
# WHERE coste_opera_h IS NULL
# """)


com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:190)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:465)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data

In [0]:
# spark.read.format("delta") \
#     .load(silver_path) \
#     .select("asset_class", "coste_opera_h") \
#     .groupBy("asset_class", "coste_opera_h") \
#     .count() \
#     .display()


com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:190)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:465)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data

In [0]:
# display(df_silver.select("asset_class").distinct())

com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:190)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:465)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data

In [0]:
# from pyspark.sql.functions import col

# df_null_names = (
#     df_silver
#     .filter(col("asset_class").isNull())
# )

# df_null_names.show(truncate=False)


com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:190)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:465)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data

In [0]:
# dbutils.fs.rm(silver_path, recurse=True)
# dbutils.fs.rm(checkpoint_path, recurse=True)


com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:190)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:465)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data

In [0]:
# from delta.tables import DeltaTable
# from pyspark.sql.functions import col

# silver_path = "abfss://datos@mastertfm001sta.dfs.core.windows.net/silver/activos"

# silver_table = DeltaTable.forPath(spark, silver_path)

# silver_table.update(
#     condition=col("symbol") == "LTC-USD",
#     set={
#         "asset_class": "'Cripto'",
#         "asset_name": "'Litecoin'"
#     }
# )



com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:190)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:465)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data

In [0]:
# from delta.tables import DeltaTable

# delta_table = DeltaTable.forPath(
#     spark,
#     "abfss://datos@mastertfm001sta.dfs.core.windows.net/silver/activos"
# )

# delta_table.delete("year = '2026'")

com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:190)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:465)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data

In [0]:
# df_silver.columns

com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:190)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:465)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data