In [0]:
!pip install alpaca-py

Collecting alpaca-py
  Downloading alpaca_py-0.43.2-py3-none-any.whl.metadata (13 kB)
Collecting msgpack<2.0.0,>=1.0.3 (from alpaca-py)
  Downloading msgpack-1.1.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (8.1 kB)
Collecting sseclient-py<2.0.0,>=1.7.2 (from alpaca-py)
  Downloading sseclient_py-1.9.0-py3-none-any.whl.metadata (1.9 kB)
Collecting websockets>=10.4 (from alpaca-py)
  Downloading websockets-16.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl.metadata (6.8 kB)
Downloading alpaca_py-0.43.2-py3-none-any.whl (122 kB)
Downloading msgpack-1.1.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (427 kB)
Downloading sseclient_py-1.9.0-py3-none-any.whl (8.4 kB)
Downloading websockets-16.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl (184 kB)
Installing collected packages: websockets, sseclient-py, msgpack, alpaca-py
Successfully installed alp

In [0]:
# ====================================================================
# ========== INGESTA OHLCV 1m HISTÓRICO DESDE ALPACA A BRONZE (DELTA) ==
# ====================================================================

from datetime import datetime, timezone, timedelta
from typing import Dict, List

from alpaca.data.historical import StockHistoricalDataClient
from alpaca.data.requests import StockBarsRequest
from alpaca.data.timeframe import TimeFrame

from pyspark.sql import SparkSession
from pyspark.sql.functions import year, month, dayofmonth, col
from pyspark.sql.types import DoubleType

# ================= CONFIGURATION =================

# Alpaca API credentials, fetched from the "kv-scope" secret scope through
# dbutils so that no key material is stored in the notebook.
API_KEY_ID = dbutils.secrets.get(scope="kv-scope", key="alpaca-key")
SECRET_KEY = dbutils.secrets.get(scope="kv-scope", key="alpaca-secret-key")

# Constant labels copied into the "source" and "timezone" columns of every row.
SOURCE_NAME = "alpaca"
TZ_DEFAULT = "Europe/Madrid"

# Requested history window, expressed in UTC:
# 2025-01-01 00:00 through 2025-12-31 23:59.
START_TIME = datetime(year=2025, month=1, day=1, tzinfo=timezone.utc)
END_TIME = datetime(year=2025, month=12, day=31, hour=23, minute=59, tzinfo=timezone.utc)

# Root location (abfss URI) where the Bronze data is saved in Delta format.
bronze_base_path = "abfss://datos@mastertfm002sta.dfs.core.windows.net/bronze/activos"

# Spark session configured with the Delta Lake SQL extension and catalog.
# getOrCreate() returns the already-running session when one exists.
spark = SparkSession.builder.config(
    "spark.sql.extensions",
    "io.delta.sql.DeltaSparkSessionExtension",
).config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
).getOrCreate()

# ================= ASSETS =================

# Universe of tickers to ingest, grouped by asset class. Every member is a
# one-key dict of the form {"symbol": <ticker>}; group order and ticker order
# follow the tuples below.
ASSET_GROUPS: Dict[str, List[Dict[str, str]]] = {
    group_name: [{"symbol": ticker} for ticker in tickers]
    for group_name, tickers in (
        (
            "Acciones(S&P500)",
            (
                "TSLA", "NVDA", "AMD", "COIN", "PLTR",
                "RIVN", "SHOP", "LCID", "ZM", "SPCE",
                "KO", "PG", "JNJ", "PEP", "WMT",
                "MCD", "VZ", "DUK", "UL", "V",
            ),
        ),
        (
            "Fondos(ETFs)",
            (
                "SPY", "QQQ", "EEM", "VGK", "AGG",
                "VNQ", "ARKK", "VUG", "SCHD", "SOXX",
            ),
        ),
        (
            "Commodities(ETF)",
            (
                "GLD", "SLV", "PPLT", "PALL", "USO",
                "UNG", "CORN", "SOYB", "WEAT", "CANE",
            ),
        ),
    )
}

# Flat list of every ticker, in the same order as the groups above.
ALL_SYMBOLS = [
    member["symbol"]
    for members in ASSET_GROUPS.values()
    for member in members
]

print(f"[INFO] Símbolos a ingestar desde Alpaca: {len(ALL_SYMBOLS)}")

# ================= ALPACA CLIENT =================

# Historical market-data client, authenticated with API_KEY_ID / SECRET_KEY.
client = StockHistoricalDataClient(api_key=API_KEY_ID, secret_key=SECRET_KEY)

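# ================= PRE-CLEANUP (DELTA) — DISABLED =================
# NOTE: the cleanup below is commented out, while the ingestion loop writes
# with mode("append"); re-running this cell for the same date range therefore
# appends the same bars again.
# NOTE(review): the disabled helper removes partition directories directly with
# dbutils.fs.rm, which presumably bypasses the Delta transaction log — confirm
# and prefer a Delta-level delete/overwrite before re-enabling it.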

# def delete_partitions_from_range(base_path, start_time, end_time):
#     start_date = start_time.date()
#     end_date = end_time.date()

#     current = start_date
#     while current <= end_date:
#         path = (
#             f"{base_path}/year={current.year}"
#             f"/month={current.month}"
#             f"/day={current.day}"
#         )
#         try:
#             dbutils.fs.rm(path, recurse=True)
#             print(f"[CLEAN] Eliminada partición {path}")
#         except Exception:
#             print(f"[CLEAN] No existe {path}, se omite")

#         current += timedelta(days=1)

# print(
#     f"[INFO] Limpiando Bronze Delta desde {START_TIME.date()} "
#     f"hasta {END_TIME.date()}"
# )

# delete_partitions_from_range(
#     bronze_base_path,
#     START_TIME,
#     END_TIME
# )

# ================= HISTORICAL INGESTION =================
#
# For every ticker in ALL_SYMBOLS:
#   1. request 1-minute bars from Alpaca (IEX feed) for START_TIME..END_TIME,
#   2. tag the rows with the ticker plus the constant timezone/source labels,
#   3. convert to Spark, cast volume to double, derive year/month/day,
#   4. append to the Delta location at bronze_base_path, partitioned by
#      year/month/day.
#
# A failure on one ticker is reported and does not stop the remaining ones.
# Outcomes are collected so a summary can be printed once the loop finishes.
#
# NOTE: the write mode is "append", so running this cell twice for the same
# window stores the same bars twice.

written_symbols = []   # tickers whose bars were appended to the Delta location
empty_symbols = []     # tickers for which Alpaca returned no rows
failed_symbols = []    # tickers that raised during download/convert/write

for symbol in ALL_SYMBOLS:
    try:
        print(f"[INFO] Descargando {symbol} ...")

        request = StockBarsRequest(
            symbol_or_symbols=symbol,
            timeframe=TimeFrame.Minute,
            start=START_TIME,
            end=END_TIME,
            feed="iex"
        )

        bars = client.get_stock_bars(request)

        # Read `.df` a single time and reuse it: alpaca-py assembles the pandas
        # frame on access, so a second read would rebuild it for the full window.
        bars_pdf = bars.df

        if bars_pdf.empty:
            print(f"[WARN] {symbol}: sin datos")
            empty_symbols.append(symbol)
            continue

        # reset_index() turns the index levels into regular columns; the three
        # constant columns are then set (overwriting "symbol" if already present).
        # NOTE(review): "timezone" is only a label (TZ_DEFAULT = "Europe/Madrid");
        # timestamps are not converted here and the request window is UTC —
        # confirm this label is what downstream consumers expect.
        pdf = bars_pdf.reset_index().assign(
            symbol=symbol,
            timezone=TZ_DEFAULT,
            source=SOURCE_NAME,
        )

        # Column names are already the lower-case ones required by the Bronze
        # schema, so no renaming is needed on either side of the conversion.
        # NOTE(review): year/month/day are derived by Spark from "timestamp",
        # presumably in the Spark session time zone — confirm partitions line
        # up with the intended calendar day.
        sdf = (
            spark.createDataFrame(pdf)
            .withColumn("volume", col("volume").cast(DoubleType()))
            .withColumn("year", year("timestamp"))
            .withColumn("month", month("timestamp"))
            .withColumn("day", dayofmonth("timestamp"))
            .select(
                "timestamp",
                "symbol",
                "open",
                "high",
                "low",
                "close",
                "volume",
                "timezone",
                "source",
                "year",
                "month",
                "day"
            )
        )

        # Append to the Bronze Delta location, partitioned by calendar day.
        (
            sdf.write
            .format("delta")
            .mode("append")
            .partitionBy("year", "month", "day")
            .save(bronze_base_path)
        )

        print(f"[OK] {symbol} escrito en Bronze Delta")
        written_symbols.append(symbol)

    except Exception as e:
        # Best-effort per ticker: report, remember the failure, keep going.
        print(f"[ERROR] {symbol}: {e}")
        failed_symbols.append(symbol)

# End-of-run accounting so an all-empty or partially failed run is obvious.
print(
    f"[INFO] Resumen: {len(written_symbols)} escritos, "
    f"{len(empty_symbols)} sin datos, "
    f"{len(failed_symbols)} con error"
)
if failed_symbols:
    print(f"[ERROR] Símbolos con error: {failed_symbols}")


[INFO] Símbolos a ingestar desde Alpaca: 40
[INFO] Descargando TSLA ...
[WARN] TSLA: sin datos
[INFO] Descargando NVDA ...
[WARN] NVDA: sin datos
[INFO] Descargando AMD ...
[WARN] AMD: sin datos
[INFO] Descargando COIN ...
[WARN] COIN: sin datos
[INFO] Descargando PLTR ...
[WARN] PLTR: sin datos
[INFO] Descargando RIVN ...
[WARN] RIVN: sin datos
[INFO] Descargando SHOP ...
[WARN] SHOP: sin datos
[INFO] Descargando LCID ...
[WARN] LCID: sin datos
[INFO] Descargando ZM ...
[WARN] ZM: sin datos
[INFO] Descargando SPCE ...
[WARN] SPCE: sin datos
[INFO] Descargando KO ...
[WARN] KO: sin datos
[INFO] Descargando PG ...
[WARN] PG: sin datos
[INFO] Descargando JNJ ...
[WARN] JNJ: sin datos
[INFO] Descargando PEP ...
[WARN] PEP: sin datos
[INFO] Descargando WMT ...
[WARN] WMT: sin datos
[INFO] Descargando MCD ...
[WARN] MCD: sin datos
[INFO] Descargando VZ ...
[WARN] VZ: sin datos
[INFO] Descargando DUK ...
[WARN] DUK: sin datos
[INFO] Descargando UL ...
[WARN] UL: sin datos
[INFO] Descargando 