In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime

def create_optimized_gold_spark_session():
    """
    Build (or reuse, via ``getOrCreate``) the SparkSession for the Gold layer.

    The session is named "GoldLayer-Optimized" and is configured with an
    Iceberg catalog called ``nessie`` backed by a Nessie server and an
    S3-compatible endpoint, plus a set of SQL/memory tuning options.
    After creation the SparkContext log level is set to WARN.

    Returns:
        The SparkSession produced by ``getOrCreate()``.

    Raises:
        Exception: any error raised while building the session is printed
            and then re-raised unchanged.
    """
    # Maven coordinates resolved through spark.jars.packages.
    jar_packages = ",".join([
        "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.2",
        "org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.96.1",
        "org.apache.hadoop:hadoop-aws:3.3.4",
        "software.amazon.awssdk:bundle:2.20.18",
    ])

    # SQL extensions registered for Iceberg and Nessie.
    sql_extensions = ",".join([
        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
        "org.projectnessie.spark.extensions.NessieSparkSessionExtensions",
    ])

    # Ordered (key, value) pairs; applied to the builder in this order.
    settings = [
        ("spark.jars.packages", jar_packages),

        # Nessie / Iceberg catalog named "nessie"
        ("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog"),
        ("spark.sql.catalog.nessie.uri", "http://nessie:19120/api/v1"),
        ("spark.sql.catalog.nessie.ref", "main"),
        ("spark.sql.catalog.nessie.authentication.type", "NONE"),
        ("spark.sql.catalog.nessie.warehouse", "s3a://lakehouse/"),
        ("spark.sql.catalog.nessie.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog"),
        ("spark.sql.catalog.nessie.io-impl", "org.apache.iceberg.aws.s3.S3FileIO"),

        # S3 endpoint used by the catalog's FileIO.
        # NOTE(review): the credentials below are hard-coded literals.
        ("spark.sql.catalog.nessie.s3.endpoint", "http://minio:9000"),
        ("spark.sql.catalog.nessie.s3.access-key-id", "minioadmin"),
        ("spark.sql.catalog.nessie.s3.secret-access-key", "minioadmin"),
        ("spark.sql.catalog.nessie.s3.path-style-access", "true"),

        # Hadoop S3A filesystem settings
        ("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
        ("spark.hadoop.fs.s3a.endpoint", "http://minio:9000"),
        ("spark.hadoop.fs.s3a.path.style.access", "true"),
        ("spark.hadoop.fs.s3a.connection.ssl.enabled", "false"),
        ("spark.hadoop.fs.s3a.connection.maximum", "100"),
        ("spark.hadoop.fs.s3a.attempts.maximum", "10"),
        ("spark.hadoop.fs.s3a.retry.limit", "5"),

        # Extensions
        ("spark.sql.extensions", sql_extensions),

        # Driver/executor memory, adaptive query execution and shuffle sizing
        ("spark.driver.memory", "4g"),
        ("spark.executor.memory", "4g"),
        ("spark.sql.adaptive.enabled", "true"),
        ("spark.sql.adaptive.coalescePartitions.enabled", "true"),
        ("spark.sql.adaptive.skewJoin.enabled", "true"),
        ("spark.sql.adaptive.advisoryPartitionSizeInBytes", "128MB"),
        ("spark.sql.shuffle.partitions", "200"),

        # Join / read-partition / overwrite behaviour
        ("spark.sql.autoBroadcastJoinThreshold", "50MB"),
        ("spark.sql.files.maxPartitionBytes", "128MB"),
        ("spark.sql.sources.partitionOverwriteMode", "dynamic"),

        # Iceberg-related options.
        # NOTE(review): confirm that the "vectorization-enabled" and
        # "optimized-scan-enabled" keys are recognised by the Iceberg
        # runtime version in use.
        ("spark.sql.iceberg.handle-timestamp-without-timezone", "true"),
        ("spark.sql.legacy.timeParserPolicy", "LEGACY"),
        ("spark.sql.catalog.nessie.vectorization-enabled", "true"),
        ("spark.sql.iceberg.optimized-scan-enabled", "true"),

        # In-memory columnar cache and file compression codecs
        ("spark.sql.inMemoryColumnarStorage.compressed", "true"),
        ("spark.sql.inMemoryColumnarStorage.batchSize", "20000"),
        ("spark.sql.parquet.compression.codec", "zstd"),
        ("spark.sql.orc.compression.codec", "zstd"),

        # Rebalance-partition tuning
        ("spark.sql.adaptive.optimizeSkewsInRebalancePartitions.enabled", "true"),
        ("spark.sql.adaptive.rebalancePartitionsSmallPartitionFactor", "0.2"),

        # Memory fractions and window-function buffer thresholds
        ("spark.memory.fraction", "0.8"),
        ("spark.memory.storageFraction", "0.3"),
        ("spark.sql.windowExec.buffer.in.memory.threshold", "100000"),
        ("spark.sql.windowExec.buffer.spill.threshold", "10000"),
    ]

    try:
        builder = SparkSession.builder.appName("GoldLayer-Optimized")
        for key, value in settings:
            builder = builder.config(key, value)
        session = builder.getOrCreate()

        # Additional runtime configuration
        session.sparkContext.setLogLevel("WARN")

        print("✅ Sesión Spark Gold optimizada creada exitosamente")
        print("🔧 Configuraciones específicas para agregaciones y métricas aplicadas")

        return session

    except Exception as err:
        # Report the failure, then let the caller see the original exception.
        print(f"❌ Error creando sesión Spark Gold: {err}")
        raise

# Create the Gold-layer Spark session; the functions defined further down
# read it through this module-level `spark` name.
spark = create_optimized_gold_spark_session()

In [None]:
from pyspark.sql import functions as F

def setup_gold_namespace():
    """
    Ensure the ``gold`` namespace exists in the ``nessie`` catalog.

    Steps, each of which is best-effort (errors are printed, not raised):
      1. list the current namespaces,
      2. run ``CREATE NAMESPACE IF NOT EXISTS nessie.gold``; if that raises,
         retry once with ``CREATE SCHEMA IF NOT EXISTS nessie.gold``,
      3. list the namespaces again.

    Uses the module-level ``spark`` session. Returns None.
    """
    def _list_namespaces():
        # Print the namespaces currently visible in the nessie catalog.
        spark.sql("SHOW NAMESPACES IN nessie").show()

    print("=== CONFIGURANDO NAMESPACE GOLD ===")

    # Show what exists before creating anything
    try:
        print("Namespaces existentes:")
        _list_namespaces()
    except Exception as listing_error:
        print(f"Error mostrando namespaces: {listing_error}")

    # First attempt: CREATE NAMESPACE
    namespace_ready = False
    try:
        print("Creando namespace 'gold'...")
        spark.sql("CREATE NAMESPACE IF NOT EXISTS nessie.gold")
        print("✅ Namespace 'gold' creado exitosamente")
        namespace_ready = True
    except Exception as namespace_error:
        print(f"Error creando namespace gold: {namespace_error}")

    # Second attempt, only when the first one raised: CREATE SCHEMA
    if not namespace_ready:
        try:
            spark.sql("CREATE SCHEMA IF NOT EXISTS nessie.gold")
            print("✅ Schema 'gold' creado exitosamente")
        except Exception as schema_error:
            print(f"Error creando schema: {schema_error}")

    # Show the namespaces again so the result is visible in the output
    try:
        print("Namespaces después de la creación:")
        _list_namespaces()
    except Exception as verify_error:
        print(f"Error verificando namespaces: {verify_error}")


# =====================================================
#  MÉTRICA 1: USER_POST_METRICS
# =====================================================

def create_simple_user_post_metrics():
    """
    Build per-user post counts from ``nessie.silver.posts``.

    Rows with a null ``owner_user_id`` are excluded. For each remaining
    owner the result holds:
      - ``user_id``        (renamed from ``owner_user_id``)
      - ``question_count`` rows with ``post_type_id == 1``
      - ``answer_count``   rows with ``post_type_id == 2``
      - ``total_posts``    all rows for that owner
      - ``fecha_cargue``   ``current_timestamp()`` at evaluation time

    Prints the silver row count, the number of aggregated users and a
    5-row preview, then returns the aggregated DataFrame.
    """
    print("=== CREANDO USER POST METRICS ===")

    silver_posts = spark.table("nessie.silver.posts")
    print(f"📊 Total posts en silver: {silver_posts.count()}")

    def _count_posts_of_type(type_id, alias):
        # `when` without `otherwise` yields null for non-matching rows,
        # and `count` skips nulls, so only matching rows are counted.
        return F.count(F.when(F.col("post_type_id") == type_id, True)).alias(alias)

    owned_posts = silver_posts.filter(F.col("owner_user_id").isNotNull())

    per_owner = owned_posts.groupBy("owner_user_id").agg(
        _count_posts_of_type(1, "question_count"),
        _count_posts_of_type(2, "answer_count"),
        F.count("*").alias("total_posts"),
    )

    metrics_df = per_owner.withColumnRenamed("owner_user_id", "user_id")
    metrics_df = metrics_df.withColumn("fecha_cargue", F.current_timestamp())

    print(f"📈 Usuarios únicos con posts: {metrics_df.count()}")
    metrics_df.show(5)

    return metrics_df


# =====================================================
#  MÉTRICA 2: BADGES_SUMMARY
# =====================================================

def create_badges_summary():
    """
    Build a per-user badge summary from ``nessie.silver.badges``.

    Rows with a null ``user_id`` are excluded. For each user the result
    holds:
      - ``total_badges``          number of badge rows
      - ``distinct_badge_types``  distinct ``badge_name`` values
      - ``gold_badges`` / ``silver_badges`` / ``bronze_badges``
            rows whose ``badge_class`` is 1 / 2 / 3 respectively
      - ``fecha_cargue``          the maximum ``load_date`` for that user

    Prints the silver row count, the number of aggregated users and a
    5-row preview, then returns the aggregated DataFrame.
    """
    print("=== CREANDO BADGES SUMMARY ===")

    silver_badges = spark.table("nessie.silver.badges")
    print(f"🏅 Total badges en silver: {silver_badges.count()}")

    # badge_class value -> output column name
    class_columns = (
        (1, "gold_badges"),
        (2, "silver_badges"),
        (3, "bronze_badges"),
    )
    per_class_sums = [
        F.sum(F.when(F.col("badge_class") == class_id, 1).otherwise(0)).alias(column_name)
        for class_id, column_name in class_columns
    ]

    badges_with_user = silver_badges.filter(F.col("user_id").isNotNull())
    summary_df = badges_with_user.groupBy("user_id").agg(
        F.count("*").alias("total_badges"),
        F.countDistinct("badge_name").alias("distinct_badge_types"),
        *per_class_sums,
        F.max("load_date").alias("fecha_cargue"),
    )

    print(f"👤 Usuarios únicos con insignias: {summary_df.count()}")
    summary_df.show(5)

    return summary_df

def create_user_engagement_metrics():
    """
    Build the per-user engagement DataFrame for the Gold layer.

    Starts from ``nessie.silver.users`` (``user_id``, ``display_name``,
    ``load_date``) and left-joins per-user row counts from the silver
    ``posts``, ``comments`` and ``badges`` tables (rows with a null user
    key are excluded before counting). Missing counts become 0, and

        engagement_score = total_posts + total_comments + total_badges

    A ``fecha_cargue`` column is added with ``current_timestamp()``.
    Prints a 10-row preview and returns the resulting DataFrame.
    """
    print("=== CREANDO USER ENGAGEMENT METRICS ===")

    def _rows_with_user(table_name, user_column):
        # Read a silver table and keep only rows whose user key is set.
        return spark.table(table_name).filter(F.col(user_column).isNotNull())

    # ---- Base tables ----
    users = spark.table("nessie.silver.users").select("user_id", "display_name", "load_date")
    posts = _rows_with_user("nessie.silver.posts", "owner_user_id")
    comments = _rows_with_user("nessie.silver.comments", "user_id")
    badges = _rows_with_user("nessie.silver.badges", "user_id")

    # ---- One row count per user for each source ----
    posts_per_owner = posts.groupBy("owner_user_id").agg(F.count("*").alias("total_posts"))
    comments_per_user = comments.groupBy("user_id").agg(F.count("*").alias("total_comments"))
    badges_per_user = badges.groupBy("user_id").agg(F.count("*").alias("total_badges"))

    # ---- Left joins keep every user, even those with no activity ----
    users_with_posts = users.join(
        posts_per_owner,
        users["user_id"] == posts_per_owner["owner_user_id"],
        "left",
    )
    users_with_comments = users_with_posts.join(comments_per_user, "user_id", "left")
    users_with_all = users_with_comments.join(badges_per_user, "user_id", "left")
    # The posts join was on an expression, so its key column is still present.
    combined = users_with_all.drop("owner_user_id")

    # ---- Replace nulls from unmatched joins with 0, then score ----
    engagement_df = combined
    for counter in ("total_posts", "total_comments", "total_badges"):
        engagement_df = engagement_df.withColumn(
            counter, F.coalesce(F.col(counter), F.lit(0))
        )

    score = F.col("total_posts") + F.col("total_comments") + F.col("total_badges")
    engagement_df = engagement_df.withColumn("engagement_score", score)
    engagement_df = engagement_df.withColumn("fecha_cargue", F.current_timestamp())

    print("✅ Métricas de engagement creadas correctamente")
    engagement_df.show(10)
    return engagement_df


# =====================================================
#  GUARDAR EN GOLD
# =====================================================

def save_simple_gold_table(df, table_name):
    """
    Write ``df`` to ``nessie.gold.<table_name>`` as an Iceberg table.

    The table is created or replaced (``createOrReplace``) with
    ``format-version`` 2 and the Parquet codec table property set to
    ``snappy``. Afterwards the table is counted via SQL and the row count
    is printed.

    Args:
        df: DataFrame to persist.
        table_name: table name inside the ``nessie.gold`` namespace.

    Returns:
        None.
    """
    target = f"nessie.gold.{table_name}"
    print(f"💾 Guardando en: {target}")

    # NOTE(review): the table-level codec here is snappy while the session
    # sets spark.sql.parquet.compression.codec to zstd — confirm intended.
    table_properties = {
        "format-version": "2",
        "write.parquet.compression-codec": "snappy",
    }

    writer = df.writeTo(target).using("iceberg")
    for prop_name, prop_value in table_properties.items():
        writer = writer.tableProperty(prop_name, prop_value)
    writer.createOrReplace()

    count_row = spark.sql(f"SELECT COUNT(*) as count FROM {target}").collect()[0]
    # Index by key: attribute access (`.count`) would hit the tuple method.
    final_count = count_row['count']
    print(f"✅ Tabla {table_name} creada con {final_count} registros")


# =====================================================
#  PROCESO PRINCIPAL
# =====================================================

def process_gold_simple():
    """
    Run the Gold pipeline end to end.

    Sets up the ``gold`` namespace, then builds and saves three tables in
    order: ``user_post_metrics``, ``badges_summary`` and
    ``user_engagement``. Finally lists the tables in ``nessie.gold`` and
    prints up to 10 rows of ``badges_summary``.
    """
    print("=== 🚀 INICIANDO PROCESO GOLD ===")

    setup_gold_namespace()

    # (banner to print, function building the DataFrame, target table name)
    metric_steps = (
        ("\n--- [1] USER POST METRICS ---", create_simple_user_post_metrics, "user_post_metrics"),
        ("\n--- [2] BADGES SUMMARY ---", create_badges_summary, "badges_summary"),
        ("\n--- [3] USER ---", create_user_engagement_metrics, "user_engagement"),
    )
    for banner, build_metric, target_table in metric_steps:
        print(banner)
        metric_df = build_metric()
        save_simple_gold_table(metric_df, target_table)

    # ---- Final verification
    print("\n--- 🔍 VERIFICACIÓN FINAL ---")
    spark.sql("SHOW TABLES IN nessie.gold").show()

    print("Datos de badges_summary:")
    spark.sql("SELECT * FROM nessie.gold.badges_summary LIMIT 10").show()

    print("=== ✅ PROCESO GOLD COMPLETADO ===")


# Run the complete Gold pipeline when this cell executes.
process_gold_simple()