In [0]:
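# NOTE(review): this cell only holds disabled drafts of clean_order_items_data
# and clean_orders_data; transform_to_silver still references both names.
# Before re-enabling clean_order_items_data: add current_date to its import,
# remove the comment/blank lines that follow a trailing backslash (they end the
# line continuation), and provide get_valid_product_ids().
# def clean_order_items_data(df):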
#     """Transformaciones específicas para ítems de órdenes"""
#     from pyspark.sql.functions import col, when, round, current_timestamp
    
#     return df \
#         # Validación y limpieza de cantidad
#         .withColumn("quantity_clean",
#                    when(col("quantity").isNull() | (col("quantity") <= 0), 1)
#                    .otherwise(col("quantity"))) \
        
#         # Validación de precios
#         .withColumn("list_price_clean",
#                    when(col("list_price").isNull() | (col("list_price") <= 0), 0.01)
#                    .otherwise(round(col("list_price"), 2))) \
        
#         # Validación de descuentos
#         .withColumn("discount_clean",
#                    when(col("discount").isNull(), 0.0)
#                    .when(col("discount") < 0, 0.0)
#                    .when(col("discount") > 0.99, 0.99)  # Máximo 99% de descuento
#                    .otherwise(round(col("discount"), 2))) \
        
#         # Cálculo de valores derivados
#         .withColumn("net_price", 
#                    round(col("list_price_clean") * (1 - col("discount_clean")), 2)) \
#         .withColumn("line_total", 
#                    round(col("net_price") * col("quantity_clean"), 2)) \
        
#         # Validación de relaciones con otras tablas
#         .withColumn("valid_product",
#                    when(col("product_id").isin(get_valid_product_ids()), 1)
#                    .otherwise(0)) \
        
#         # Metadata
#         .withColumn("processed_at", current_timestamp()) \
#         .withColumn("processing_date", current_date()) \
        
#         # Filtrar registros inválidos
#         .filter(col("valid_product") == 1) \
#         .drop("valid_product")

# def clean_orders_data(df):
#     """Transformaciones específicas para órdenes"""
#     from pyspark.sql.functions import col, when, datediff, current_date, to_date, current_timestamp
    
#     return df \
#         .withColumn("order_date", to_date(col("order_date"))) \
#         .withColumn("required_date", to_date(col("required_date"))) \
#         .withColumn("shipped_date", 
#                    when((col("shipped_date").isNotNull()) & 
#                         (col("shipped_date") >= col("order_date")) & 
#                         (col("shipped_date") <= current_date()),
#                    to_date(col("shipped_date")))) \
#         .withColumn("order_status_clean",
#                    when(~col("order_status").isin([1, 2, 3, 4]), 1)
#                    .when((col("shipped_date").isNull()) & (col("order_status") == 4), 2)
#                    .otherwise(col("order_status"))) \
#         .withColumn("days_to_ship",
#                    when(col("shipped_date").isNotNull(),
#                         datediff(col("shipped_date"), col("order_date")))) \
#         .withColumn("processed_at", current_timestamp())

In [0]:
from pyspark.sql.functions import col, when, round, trim, initcap, year, current_date, current_timestamp

In [0]:
# Container and storage-account names are read from the "scope-mbc" secret scope.
container = dbutils.secrets.get(scope="scope-mbc", key="secret-env-container")
storage_account = dbutils.secrets.get(
    scope="scope-mbc",
    key="secret-env-storage-account",
)

# Root abfss:// URI under which the paths used by this notebook are built.
path_base = f"abfss://{container}@{storage_account}.dfs.core.windows.net"

In [0]:
# Managed table names: Bronze source and Silver destination for this run.
tabla_bronze, tabla_silver = "bronze.products", "silver.products"

# Streaming checkpoint directory; the Silver table name is part of the path.
path_checkpoint = f"{path_base}/checkpoints/bronze_to_silver/{tabla_silver}/"

# To reset the checkpoint (only when really necessary), uncomment:
# dbutils.fs.rm(path_checkpoint, recurse=True)

def transform_to_silver(batch_df, batch_id):
    """Clean one micro-batch and append it to the Silver table.

    Used as the ``foreachBatch`` callback of the streaming write. The cleaning
    function is selected from the suffix of the module-level ``tabla_silver``
    name; when no suffix matches, the batch is written unchanged.

    Args:
        batch_df: DataFrame with the rows of the current micro-batch.
        batch_id: Micro-batch identifier supplied by ``foreachBatch`` (unused).
    """
    # The cleaner name is only looked up inside the branch that is taken.
    if tabla_silver.endswith("orders"):
        cleaner = clean_orders_data
    elif tabla_silver.endswith("products"):
        cleaner = clean_products_data
    elif tabla_silver.endswith("order_items"):
        cleaner = clean_order_items_data
    else:
        cleaner = None  # no table-specific cleaning: pass the batch through

    silver_batch = batch_df if cleaner is None else batch_df.transform(cleaner)

    # Append to the Delta table addressed by name (saveAsTable), not by path.
    delta_writer = silver_batch.write.format("delta").mode("append")
    delta_writer.saveAsTable(tabla_silver)

def clean_products_data(df):
    """Return ``df`` with product text fields normalized and cleaned columns added.

    Columns produced:
        product_name: trimmed in place.
        brand_name, category_name: trimmed, then passed through ``initcap``.
        model_year_clean: 2000 when ``model_year`` < 2000; the current year
            when ``model_year`` is more than one year ahead of it; otherwise
            ``model_year`` unchanged.
        list_price_clean: 1.00 when ``list_price`` <= 0; otherwise
            ``list_price`` rounded to 2 decimals.
        processed_at: ``current_timestamp()``.
    """
    this_year = year(current_date())

    # Column expressions are built first and attached to the frame below.
    model_year = col("model_year")
    model_year_expr = (
        when(model_year < 2000, 2000)
        .when(model_year > this_year + 1, this_year)
        .otherwise(model_year)
    )

    list_price = col("list_price")
    list_price_expr = when(list_price <= 0, 1.00).otherwise(round(list_price, 2))

    return (
        df.withColumn("product_name", trim(col("product_name")))
        .withColumn("brand_name", initcap(trim(col("brand_name"))))
        .withColumn("category_name", initcap(trim(col("category_name"))))
        .withColumn("model_year_clean", model_year_expr)
        .withColumn("list_price_clean", list_price_expr)
        .withColumn("processed_at", current_timestamp())
    )

In [0]:
# Open a streaming read on the Bronze table, addressed by table name (not path).
bronze_stream = spark.readStream.format("delta").table(tabla_bronze)

# Start the streaming write: every micro-batch goes through transform_to_silver.
# Note: df_stream holds whatever .start() returns (the running query handle).
df_stream = (
    bronze_stream.writeStream
    .foreachBatch(transform_to_silver)
    .option("checkpointLocation", path_checkpoint)
    # .trigger(availableNow=True)  # for batch-style processing
    # .trigger(processingTime='1 minute')  # for continuous processing
    .start()
)

# Block this cell until the query terminates.
df_stream.awaitTermination()