In [2]:
# ================================================================
#  gold_aggy.py — Capa Gold (Agregaciones y Métricas Analíticas)
# ================================================================
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, countDistinct, count, avg, max, min, sum,
    year, month, dayofmonth, when, desc
)
from datetime import datetime

# ================================================================
# Crear sesión Spark (usa la ya existente del Silver)
# ================================================================
# Reuse the active Spark session if there is one; otherwise create it.
spark = SparkSession.builder.getOrCreate()

# Catalog prefix interpolated into every silver/gold table name in this file.
# Change it when running without Nessie.
CATALOG = "nessie"   # or "spark_catalog"

# ================================================================
# Función auxiliar — Crear namespace gold
# ================================================================
def setup_gold_namespace():
    """Make sure the ``gold`` namespace exists in the configured catalog.

    Runs ``CREATE NAMESPACE IF NOT EXISTS`` through the module-level
    ``spark`` session. This is best-effort: any exception raised while
    issuing the statement is printed as a warning and swallowed, so the
    caller is never interrupted by this step.
    """
    print("=== CONFIGURANDO NAMESPACE GOLD ===")
    try:
        ddl = f"CREATE NAMESPACE IF NOT EXISTS {CATALOG}.gold"
        spark.sql(ddl)
        print(f"✅ Namespace {CATALOG}.gold disponible")
    except Exception as err:
        # Deliberately non-fatal: report the problem and let the caller go on.
        print(f"⚠️ Error creando namespace gold: {err}")

# ================================================================
# Función: Métricas de actividad de comentarios
# ================================================================
def build_comment_activity():
    """Build the gold ``comment_activity`` table from ``silver.comments``.

    Groups comments by ``comment_year`` / ``comment_month`` and computes,
    per period: the number of comments, the number of distinct ``user_id``
    values, the average ``text_length`` and how many comments have
    ``score_category == "high"``. The result is ordered by period, written
    with ``createOrReplace`` as an Iceberg table, and its first 10 rows are
    printed.
    """
    print("\n=== Construyendo tabla GOLD: comment_activity ===")

    comments = spark.sql(f"SELECT * FROM {CATALOG}.silver.comments")

    period_cols = ("comment_year", "comment_month")
    # 1 for a high-score comment, 0 otherwise, so summing it yields a count.
    is_high_score = when(col("score_category") == "high", 1).otherwise(0)
    metrics = (
        count("*").alias("total_comments"),
        countDistinct("user_id").alias("unique_users"),
        avg("text_length").alias("avg_length"),
        sum(is_high_score).alias("high_score_comments"),
    )

    activity = (
        comments.groupBy(*period_cols)
        .agg(*metrics)
        .orderBy(*period_cols)
    )

    writer = activity.writeTo(f"{CATALOG}.gold.comment_activity")
    writer = writer.using("iceberg").option("merge-schema", "true")
    writer.createOrReplace()

    print("✅ Tabla GOLD comment_activity creada exitosamente")
    activity.show(10, truncate=False)

# ================================================================
# Función: Resumen de usuarios activos
# ================================================================
def build_user_engagement():
    """Build the gold ``user_engagement`` table (per-user comment metrics).

    Left-joins ``silver.comments`` to ``silver.users`` on ``user_id`` and,
    for each ``(user_id, display_name, is_active)`` group, computes the
    number of comments, the average ``text_length`` and the latest
    ``creation_date``. Because the join starts from comments, users without
    comments do not appear, and comments whose ``user_id`` has no match in
    ``silver.users`` are grouped with null ``display_name`` / ``is_active``.

    The result is ordered by ``total_comments`` descending, written with
    ``createOrReplace`` as an Iceberg table, and its first 10 rows are
    printed.
    """
    print("\n=== Construyendo tabla GOLD: user_engagement ===")

    # Keep only the user attributes that are actually used. Joining the full
    # users table would carry all of its columns into the aggregation; if it
    # also had a `creation_date` or `text_length` column, the aggregates
    # below would fail as ambiguous references. The projection also shrinks
    # the data moved by the join.
    users = spark.sql(f"SELECT * FROM {CATALOG}.silver.users").select(
        "user_id", "display_name", "is_active"
    )
    comments = spark.sql(f"SELECT * FROM {CATALOG}.silver.comments")

    joined = (
        comments.join(users, "user_id", "left")
        .groupBy("user_id", "display_name", "is_active")
        .agg(
            count("*").alias("total_comments"),
            avg("text_length").alias("avg_comment_length"),
            max("creation_date").alias("last_comment_date"),
        )
        .orderBy(desc("total_comments"))
    )

    (
        joined.writeTo(f"{CATALOG}.gold.user_engagement")
        .using("iceberg")
        .option("merge-schema", "true")
        .createOrReplace()
    )

    print("✅ Tabla GOLD user_engagement creada exitosamente")
    joined.show(10, truncate=False)

# ================================================================
# Función: Métricas de posts
# ================================================================
def build_post_summary():
    """Build the gold ``post_summary`` table from ``silver.posts``.

    Groups posts by ``post_year``, ``post_month`` and ``post_type_id`` and
    computes the number of posts and the number of distinct
    ``owner_user_id`` values per group. The result is ordered by year and
    month, written with ``createOrReplace`` as an Iceberg table, and its
    first 10 rows are printed.
    """
    print("\n=== Construyendo tabla GOLD: post_summary ===")

    posts = spark.sql(f"SELECT * FROM {CATALOG}.silver.posts")

    grouped = posts.groupBy("post_year", "post_month", "post_type_id")
    summary = grouped.agg(
        count("*").alias("total_posts"),
        countDistinct("owner_user_id").alias("unique_authors"),
    )
    summary = summary.orderBy("post_year", "post_month")

    (
        summary.writeTo(f"{CATALOG}.gold.post_summary")
        .using("iceberg")
        .option("merge-schema", "true")
        .createOrReplace()
    )

    print("✅ Tabla GOLD post_summary creada exitosamente")
    summary.show(10, truncate=False)

# ================================================================
# Función: Distribución de badges
# ================================================================
def build_badge_distribution():
    """Build the gold ``badge_distribution`` table from ``silver.badges``.

    Groups badges by ``badge_name`` and ``badge_year`` and computes the
    number of awards and the number of distinct ``user_id`` values per
    group. The result is ordered by ``total_awards`` descending, written
    with ``createOrReplace`` as an Iceberg table, and its first 10 rows are
    printed.
    """
    print("\n=== Construyendo tabla GOLD: badge_distribution ===")

    badges = spark.sql(f"SELECT * FROM {CATALOG}.silver.badges")

    award_metrics = [
        count("*").alias("total_awards"),
        countDistinct("user_id").alias("unique_users"),
    ]
    distribution = (
        badges.groupBy("badge_name", "badge_year")
        .agg(*award_metrics)
        .orderBy(desc("total_awards"))
    )

    table_writer = distribution.writeTo(f"{CATALOG}.gold.badge_distribution")
    table_writer.using("iceberg").option("merge-schema", "true").createOrReplace()

    print("✅ Tabla GOLD badge_distribution creada exitosamente")
    distribution.show(10, truncate=False)

# ================================================================
# Orquestador completo Gold Layer
# ================================================================
def process_gold_layer_complete():
    """Run the whole Gold layer: namespace setup, then every table builder.

    The builders run in a fixed order (comment activity, user engagement,
    post summary, badge distribution). If any of them raises, the error is
    printed and re-raised, so later builders do not run. On success, two
    example queries against the new tables are printed.

    Raises:
        Exception: whatever a builder raised, re-raised unchanged.
    """
    print("\n=== INICIANDO PROCESO GOLD LAYER ===")
    setup_gold_namespace()

    builders = (
        build_comment_activity,
        build_user_engagement,
        build_post_summary,
        build_badge_distribution,
    )
    try:
        for build_table in builders:
            build_table()
    except Exception as err:
        # Report for the notebook reader, then propagate to the caller.
        print(f"❌ Error en proceso Gold: {err}")
        raise

    print("\n=== PROCESO GOLD COMPLETADO === ✅")
    print("Puedes consultar las tablas con:")
    print(f"  spark.sql('SELECT * FROM {CATALOG}.gold.comment_activity').show()")
    print(f"  spark.sql('SELECT * FROM {CATALOG}.gold.user_engagement').show()")

# ================================================================
# Ejecutar proceso
# ================================================================
# Run the full Gold pipeline only when this module is the entry point,
# not when it is imported.
if __name__ == "__main__":
    process_gold_layer_complete()



=== INICIANDO PROCESO GOLD LAYER ===
=== CONFIGURANDO NAMESPACE GOLD ===
✅ Namespace spark_catalog.gold disponible

=== Construyendo tabla GOLD: comment_activity ===
❌ Error en proceso Gold: [TABLE_OR_VIEW_NOT_FOUND] The table or view `spark_catalog`.`silver`.`comments` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [spark_catalog, silver, comments], [], false



AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `spark_catalog`.`silver`.`comments` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [spark_catalog, silver, comments], [], false
