In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Gold layer: enrich every Silver event row with neighborhood-level metrics.
# Window functions (instead of groupBy) are used so the output keeps one row
# per input event, with the aggregates repeated on each row.
SILVER_TABLE = "poctesting.silver_events"
GOLD_TABLE = "poctesting.gold_events"

# Start from the Silver table.
df_silver = spark.table(SILVER_TABLE)

# Window specifications, from coarsest to finest grain.
window_neigh = Window.partitionBy("neighborhood")
window_neigh_month = Window.partitionBy("neighborhood", "event_year", "event_month")
window_day = Window.partitionBy("neighborhood", "event_year", "event_month", "event_day")

# Ranking window: compares neighborhoods inside one calendar month.
# It partitions by year AND month so that, e.g., January of two different years
# is not ranked together (monthly_total is itself computed per year + month).
# It orders by the already-materialized "monthly_total" column, so that column
# must be added to the DataFrame before "rank_in_month" is computed.
window_rank = (
    Window.partitionBy("event_year", "event_month")
    .orderBy(F.col("monthly_total").desc())
)

# Build the metrics.
df_gold = (
    df_silver
    # All-time metrics per neighborhood.
    .withColumn("total_by_neighborhood", F.sum("quantity_products").over(window_neigh))
    .withColumn("avg_by_neighborhood", F.avg("quantity_products").over(window_neigh))
    .withColumn("count_orders_by_neigh", F.count("order_id").over(window_neigh))
    # NOTE: approx_count_distinct returns an approximate distinct count, so this
    # value may differ slightly from the exact number of unique customers.
    .withColumn("unique_customers_by_neigh", F.approx_count_distinct("customer_id").over(window_neigh))
    # Metrics per neighborhood and calendar month.
    .withColumn("monthly_total", F.sum("quantity_products").over(window_neigh_month))
    .withColumn("monthly_avg", F.avg("quantity_products").over(window_neigh_month))
    # Dense rank of the row's neighborhood by monthly_total (1 = highest total).
    .withColumn("rank_in_month", F.dense_rank().over(window_rank))
    # Number of non-null order_id values per neighborhood and day.
    .withColumn("orders_per_day", F.count("order_id").over(window_day))
)

# Persist to the Gold table (replaces any existing contents).
df_gold.write.mode("overwrite").saveAsTable(GOLD_TABLE)

# Count the persisted table rather than calling df_gold.count(), which would
# re-execute the whole window pipeline a second time.
gold_row_count = spark.table(GOLD_TABLE).count()

print(f"✅ Tabla '{GOLD_TABLE}' creada con {gold_row_count} registros enriquecidos.")

In [0]:
# Optional preview of the first 10 rows of df_gold (currently disabled).
# display(df_gold.limit(10))

In [0]:
# NOTE(review): this whole cell is commented out. If re-enabled it would
# reassign df_gold (built from df_silver) with a different set of columns.
# # Windows per neighborhood and per employee
# window_neigh = Window.partitionBy("neighborhood")
# window_emp_total = Window.partitionBy("employee_id")
# NOTE(review): window_rank_global has no partitionBy; Spark presumably moves
# all rows to a single partition for such a window — confirm before enabling.
# window_rank_global = Window.orderBy(F.sum("quantity_products").over(window_emp_total).desc())

# # Apply the window functions
# df_gold = df_silver \
#     .withColumn("total_by_neighborhood", F.sum("quantity_products").over(window_neigh)) \
#     .withColumn("total_by_employee", F.sum("quantity_products").over(window_emp_total)) \
#     .withColumn("rank_employee_quantity", F.dense_rank().over(window_rank_global)) #\
#     #.withColumn("cume_dist_employee", F.cume_dist().over(window_rank_global))