In [0]:
from pyspark.sql.functions import *

In [0]:
# Storage coordinates are looked up in the "scope-mbc" secret scope instead of
# being written into the notebook.
container = dbutils.secrets.get("scope-mbc", "secret-env-container")
storage_account = dbutils.secrets.get("scope-mbc", "secret-env-storage-account")

# Root ABFSS URI; every checkpoint location below hangs off this prefix.
path_base = f"abfss://{container}@{storage_account}.dfs.core.windows.net"

# Table-name configuration. For each logical table:
#   bronze     - table the stream reads from
#   silver     - table the stream appends to
#   checkpoint - streaming checkpoint directory for that bronze -> silver job
tables = dict(
    products=dict(
        bronze="bronze.products",
        silver="silver.products",
        checkpoint=f"{path_base}/checkpoints/bronze_to_silver/products/",
    ),
    orders=dict(
        bronze="bronze.orders",
        silver="silver.orders",
        checkpoint=f"{path_base}/checkpoints/bronze_to_silver/orders/",
    ),
    order_items=dict(
        bronze="bronze.order_items",
        silver="silver.order_items",
        checkpoint=f"{path_base}/checkpoints/bronze_to_silver/order_items/",
    ),
)

In [0]:
def transform_products(df):
    """Apply the products transformations to a DataFrame.

    The input columns ``product_name``, ``model_year`` and ``list_price`` are
    read. The result is ``df`` with these columns added or replaced:

    * ``product_name``     - replaced by its trimmed value.
    * ``model_year_clean`` - ``model_year`` with out-of-range values replaced
      (see the comment in the body); the original ``model_year`` is kept.
    * ``list_price_clean`` - ``list_price`` rounded to 2 decimals; the original
      ``list_price`` is kept.
    * ``is_active``        - constant ``True``.
    * ``processed_at``     - current timestamp.
    * ``source_system``    - constant ``"legacy_system"``.
    """
    model_year = col("model_year")
    this_year = year(current_date())

    # Years below 2000 become 2000. Years above (this_year + 1) become
    # this_year. Everything else passes through unchanged.
    # NOTE(review): the upper bound tolerates next year's models, yet the
    # replacement value is this_year rather than this_year + 1 - confirm that
    # this asymmetry is intended.
    model_year_clean = (
        when(model_year < 2000, 2000)
        .when(model_year > this_year + 1, this_year)
        .otherwise(model_year)
    )

    # `round` here is the Spark column function brought in by the notebook's
    # star import from pyspark.sql.functions, not Python's builtin.
    list_price_clean = round(col("list_price"), 2)

    derived_columns = {
        "product_name": trim(col("product_name")),
        "model_year_clean": model_year_clean,
        "list_price_clean": list_price_clean,
        "is_active": lit(True),  # new field
        "processed_at": current_timestamp(),
        "source_system": lit("legacy_system"),  # metadata
    }
    return df.withColumns(derived_columns)

def write_to_silver(table_name, transform_function):
    """Start a streaming job that moves one table from bronze to silver.

    The bronze Delta table is read as a stream. Every micro-batch is passed
    through ``transform_function`` and appended to the silver Delta table
    configured for ``table_name`` in the module-level ``tables`` dict.

    Args:
        table_name: Key into ``tables`` (its entry supplies the ``bronze``,
            ``silver`` and ``checkpoint`` settings).
        transform_function: Callable that takes the micro-batch DataFrame and
            returns the DataFrame to append to silver.

    Returns:
        The streaming query handle returned by ``start()``, so the caller can
        monitor or stop the stream.

    Raises:
        KeyError: If ``table_name`` has no entry in ``tables``.
    """
    config = tables[table_name]

    def foreach_batch_function(df, batch_id):
        """Transform one micro-batch, validate its columns and append it.

        Raises ValueError when the silver table already exists and the
        transformed batch lacks any of its columns.

        NOTE(review): ``batch_id`` is not used, so nothing here de-duplicates
        the append if the same batch is delivered more than once - confirm
        whether idempotent writes are required.
        """
        # Apply transformations
        transformed_df = transform_function(df)

        # Validate schema: every column already present in silver must be
        # produced by the transformation. When silver does not exist yet there
        # is nothing to compare against, so the check is skipped.
        if spark.catalog.tableExists(config["silver"]):
            produced_columns = set(transformed_df.columns)
            missing_columns = [
                name
                for name in spark.table(config["silver"]).columns
                if name not in produced_columns
            ]
            if missing_columns:
                raise ValueError(f"Faltan columnas en el DataFrame transformado: {missing_columns}")

        # Write to silver
        (transformed_df
         .write
         .format("delta")
         .mode("append")
         .option("mergeSchema", "true")  # allows schema evolution (new columns)
         .saveAsTable(config["silver"]))

    # Start the stream and hand the query back to the caller.
    return (
        spark.readStream
        .format("delta")
        # Batch-size control. maxFilesPerTrigger is an option of the Delta
        # streaming *source*, so it must be set on the reader; it has no
        # effect when set on the writer.
        .option("maxFilesPerTrigger", 100)
        .table(config["bronze"])
        .writeStream
        .foreachBatch(foreach_batch_function)
        .option("checkpointLocation", config["checkpoint"])
        .start()
    )

In [0]:
# Registry used to keep track of the active streams, keyed by table name.
active_streams = dict()

# Start the products stream and record whatever write_to_silver returns for it.
active_streams.update({"products": write_to_silver("products", transform_products)})