In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, broadcast

In [0]:
gold_path = "abfss://datos@mastertfm001sta.dfs.core.windows.net/gold/activos"

# Pon tu csv aquí. Recomendado guardarlo en ADLS también:
# por ejemplo: abfss://datos@.../gold/config/estrategias_optimas.csv
estrategias_csv_path = "abfss://datos@mastertfm001sta.dfs.core.windows.net/gold/config/estrategias_optimas_vf.csv"

alerts_path = "abfss://datos@mastertfm001sta.dfs.core.windows.net/gold/alerts"
checkpoint_path = alerts_path + "/_checkpoint_inference"


In [0]:
spark = (
    SparkSession.builder
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

spark.conf.set("spark.sql.session.timeZone", "Europe/Madrid")


In [0]:
estrategias_df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(estrategias_csv_path)
)

# Opción A, filtrar por pasa_filtro si existe
if "pasa_filtro" in estrategias_df.columns:
    estrategias_apto = estrategias_df.filter(col("pasa_filtro") == True)
else:
    # Opción B, filtrar por clasificacion
    estrategias_apto = estrategias_df.filter(col("clasificacion").isin(["APTO FUERTE 🚀", "APTO MEDIO ✅", "APTO DÉBIL ⚠️"]))

estrategias_apto = estrategias_apto.select(
    "asset_name", "best_x", "best_k", "best_n", "best_j"
)

display(estrategias_apto)


asset_name,best_x,best_k,best_n,best_j
PLTR,37,3,16,0.0004
SPCE,47,2,19,0.002
ZM,33,3,18,0.0015
SOXX,51,2,19,0.0002
LCID,53,6,16,0.0
ARKK,29,2,19,0.0002
COIN,36,3,19,0.002
ADA-USD,47,7,9,0.0015
DOGE-USD,32,2,18,0.002
AMD,46,3,16,0.002


In [0]:
gold_stream = (
    spark.readStream
    .format("delta")
    .load(gold_path)
)

# gold tiene: timestamp, symbol, asset_class, asset_name, coste_opera_h, close
stream_joined = (
    gold_stream
    .select("timestamp", "symbol", "asset_name", "close")
    .join(broadcast(estrategias_apto), on="asset_name", how="inner")
)


In [0]:
from collections import deque, defaultdict
import pandas as pd
import numpy as np

# Buffers y control de spam en DRIVER
BUFFERS = defaultdict(lambda: deque(maxlen=10_000))
LAST_ALERT_BIN = {}

# Convertir estrategias_apto (Spark DF) a dict en driver
# Asumimos pocos activos (decenas)
STRATEGIES = {
    r["asset_name"]: {
        "best_x": int(r["best_x"]),
        "best_k": int(r["best_k"]),
        "best_n": int(r["best_n"]),
        "best_j": float(r["best_j"])
    }
    for r in estrategias_apto.collect()
}

print(f"✅ Estrategias cargadas: {len(STRATEGIES)} activos")



✅ Estrategias cargadas: 20 activos


In [0]:
# ============================
# WARM-UP HISTÓRICO DESDE GOLD
# ============================

from datetime import timedelta
from pyspark.sql.functions import col, max as spark_max

print("🔥 Iniciando warm-up histórico desde GOLD")

# 1️⃣ Calcular minutos necesarios (peor estrategia)
estrategias_pdf = estrategias_apto.toPandas()
estrategias_pdf["min_needed"] = (
    (estrategias_pdf["best_k"] + estrategias_pdf["best_n"] + 5)
    * estrategias_pdf["best_x"]
)

max_minutes_needed = int(estrategias_pdf["min_needed"].max())
print(f"⏱️ Ventana histórica necesaria: {max_minutes_needed} minutos")

# 2️⃣ Leer GOLD histórico
gold_hist = (
    spark.read
    .format("delta")
    .load(gold_path)
    .filter(col("asset_name").isin(estrategias_pdf["asset_name"].tolist()))
)

# 3️⃣ Determinar ventana temporal
max_ts = gold_hist.select(spark_max("timestamp")).collect()[0][0]
min_ts = max_ts - timedelta(minutes=max_minutes_needed)

gold_hist = gold_hist.filter(col("timestamp") >= min_ts)

# 4️⃣ Pasar a pandas y cargar buffers
pdf_hist = (
    gold_hist
    .select("timestamp", "asset_name", "close")
    .orderBy("asset_name", "timestamp")
    .toPandas()
)

for asset, g in pdf_hist.groupby("asset_name"):
    row = estrategias_pdf[estrategias_pdf["asset_name"] == asset].iloc[0]

    min_raw_needed = int(
        (row["best_k"] + row["best_n"] + 5) * row["best_x"]
    )

    BUFFERS[asset] = deque(maxlen=min_raw_needed * 2)

    for _, r in g.iterrows():
        BUFFERS[asset].append((r["timestamp"], float(r["close"])))

    print(f"✅ Warm-up {asset}: {len(BUFFERS[asset])} puntos cargados")

print("🔥 Warm-up completado")


🔥 Iniciando warm-up histórico desde GOLD
⏱️ Ventana histórica necesaria: 1566 minutos
✅ Warm-up AMD: 968 puntos cargados
🔥 Warm-up completado


In [0]:
import traceback
from pyspark.sql.functions import col
from pyspark.sql import Row

def process_inference_batch(batch_df, batch_id: int):
    try:
        nrows = batch_df.count()
        print(f"[INFERENCE] batch_id={batch_id} rows={nrows}")

        if batch_df.rdd.isEmpty():
            print(f"[INFERENCE] batch_id={batch_id} vacío")
            return

        # 1) Filtrar SOLO activos aptos (presentes en STRATEGIES)
        sdf = (
            batch_df
            .select("timestamp", "asset_name", "symbol", "close")
            .filter(col("asset_name").isin(list(STRATEGIES.keys())))
        )

        if sdf.rdd.isEmpty():
            print(f"[INFERENCE] batch_id={batch_id} sin activos con estrategia")
            return

        # 2) A pandas lo mínimo, ya filtrado
        pdf = sdf.toPandas()
        pdf["timestamp"] = pd.to_datetime(pdf["timestamp"])
        pdf = pdf.sort_values(["asset_name", "timestamp"])

        alerts_out = []

        # 3) Loop por activo
        for asset, g in pdf.groupby("asset_name"):

            if asset not in STRATEGIES:
                continue

            params = STRATEGIES[asset]
            best_x = int(params["best_x"])
            best_k = int(params["best_k"])
            best_n = int(params["best_n"])
            best_j = float(params["best_j"])

            symbol = g["symbol"].iloc[0]

            # Necesidad mínima de datos brutos
            min_raw_needed = (best_k + best_n + 5) * best_x

            # Inicializar / ajustar buffer
            if asset not in BUFFERS:
                BUFFERS[asset] = deque(maxlen=min_raw_needed * 2)

            if BUFFERS[asset].maxlen < min_raw_needed * 2:
                BUFFERS[asset] = deque(BUFFERS[asset], maxlen=min_raw_needed * 2)

            # Añadir al buffer
            for _, row in g.iterrows():
                BUFFERS[asset].append((row["timestamp"], float(row["close"])))

            if len(BUFFERS[asset]) < min_raw_needed:
                continue

            # Serie y resample
            buf = list(BUFFERS[asset])
            s = pd.Series(
                data=[x[1] for x in buf],
                index=pd.DatetimeIndex([x[0] for x in buf])
            ).sort_index()

            rs = s.resample(f"{best_x}min").last().dropna()

            if len(rs) < (best_k + best_n + 2):
                continue

            prices = rs.values.astype(float)

            # Bin actual para anti spam
            bin_start = rs.index[-1].to_pydatetime()
            if LAST_ALERT_BIN.get(asset) == bin_start:
                continue

            # 4) Señal
            # IMPORTANTE: esta función debe existir en tu notebook
            signal, _ = compute_signal_from_resampled(prices, best_k, best_n, best_j)

            if signal is None:
                continue

            LAST_ALERT_BIN[asset] = bin_start

            alerts_out.append(Row(
                timestamp_alert=rs.index[-1].to_pydatetime(),
                asset_name=asset,
                symbol=symbol,
                best_x=best_x,
                best_k=best_k,
                best_n=best_n,
                best_j=best_j,
                signal=signal,
                price=float(prices[-1]),
                bin_start=bin_start
            ))

        # 5) Write alerts
        if not alerts_out:
            print(f"[INFERENCE] batch_id={batch_id} sin alertas")
            return

        alerts_sdf = spark.createDataFrame(alerts_out, schema=alert_schema)

        (
            alerts_sdf.write
            .format("delta")
            .mode("append")
            .save(alerts_path)
        )

        print(f"[INFERENCE] batch_id={batch_id} ALERTAS={len(alerts_out)}")
        alerts_sdf.show(truncate=False)

    except Exception as e:
        print("🔥 ERROR REAL EN INFERENCE FOREACH BATCH 🔥")
        traceback.print_exc()
        raise


In [0]:
query = (
    stream_joined.writeStream
    .foreachBatch(process_inference_batch)
    .option("checkpointLocation", checkpoint_path)
    .start()
)

query.awaitTermination()


[INFERENCE] batch_id=259 rows=0
[INFERENCE] batch_id=259 vacío
[INFERENCE] batch_id=260 rows=0
[INFERENCE] batch_id=260 vacío
[INFERENCE] batch_id=261 rows=0
[INFERENCE] batch_id=261 vacío
[INFERENCE] batch_id=262 rows=188
[INFERENCE] batch_id=262 sin alertas
[INFERENCE] batch_id=263 rows=0
[INFERENCE] batch_id=263 vacío
[INFERENCE] batch_id=264 rows=0
[INFERENCE] batch_id=264 vacío
[INFERENCE] batch_id=265 rows=0
[INFERENCE] batch_id=265 vacío
[INFERENCE] batch_id=266 rows=0
[INFERENCE] batch_id=266 vacío
[INFERENCE] batch_id=267 rows=0
[INFERENCE] batch_id=267 vacío
[INFERENCE] batch_id=268 rows=0
[INFERENCE] batch_id=268 vacío
[INFERENCE] batch_id=269 rows=0
[INFERENCE] batch_id=269 vacío
[INFERENCE] batch_id=270 rows=0
[INFERENCE] batch_id=270 vacío
[INFERENCE] batch_id=271 rows=0
[INFERENCE] batch_id=271 vacío
[INFERENCE] batch_id=272 rows=0
[INFERENCE] batch_id=272 vacío
[INFERENCE] batch_id=273 rows=0
[INFERENCE] batch_id=273 vacío
[INFERENCE] batch_id=274 rows=0
[INFERENCE] bat

com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:132)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:132)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:190)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.can

In [0]:
# from pyspark.sql.types import *

# alerts_schema = StructType([
#     StructField("timestamp_alert", TimestampType(), False),
#     StructField("asset_name", StringType(), False),
#     StructField("symbol", StringType(), False),
#     StructField("best_x", IntegerType(), False),
#     StructField("best_k", IntegerType(), False),
#     StructField("best_n", IntegerType(), False),
#     StructField("best_j", DoubleType(), False),
#     StructField("signal", StringType(), False),
#     StructField("price", DoubleType(), False),
#     StructField("bin_start", TimestampType(), False),
# ])

# alerts_path = "abfss://datos@mastertfm001sta.dfs.core.windows.net/gold/alerts"

# # Crear tabla Delta vacía si no existe
# (
#     spark.createDataFrame([], alerts_schema)
#     .write
#     .format("delta")
#     .mode("overwrite")
#     .save(alerts_path)
# )

# print("✅ Tabla Delta GOLD/alerts inicializada")


com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:190)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:465)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data