In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime

def create_silver_spark_session():
    spark = (
        SparkSession.builder
        .appName("SilverLayer")
        .config('spark.jars.packages', 
                'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.2,'
                'org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.96.1,'
                'org.apache.hadoop:hadoop-aws:3.3.4')
        .config("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog")
        .config("spark.sql.catalog.nessie.uri", "http://nessie:19120/api/v1")
        .config("spark.sql.catalog.nessie.ref", "main")
        .config("spark.sql.catalog.nessie.authentication.type", "NONE")
        .config("spark.sql.catalog.nessie.warehouse", "s3a://lakehouse/")
        .config("spark.sql.catalog.nessie.s3.endpoint", "http://minio:9000")
        .config("spark.sql.catalog.nessie.s3.access-key-id", "minioadmin")
        .config("spark.sql.catalog.nessie.s3.secret-access-key", "minioadmin")
        .config("spark.sql.catalog.nessie.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog")
        .config("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog")
        .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions")
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
        .config("spark.hadoop.fs.s3a.path.style.access", "true")
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
        .config("spark.sql.adaptive.enabled", "true")
        # Configuraciones para mejorar estabilidad
        .config("spark.sql.iceberg.handle-timestamp-without-timezone", "true")
        .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
        .getOrCreate()
    )
    return spark
    
spark = create_silver_spark_session()

:: loading settings :: url = jar:file:/opt/conda/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-bb9fd9bb-ba8b-41d5-8a69-b4c658c875c8;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.5.2 in central
	found org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12;0.96.1 in central
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 393ms :: artifacts dl 15ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central

In [2]:
def read_bronze_data(table_name, limit=None):
    """
    Lee datos de la capa Bronze desde MinIO
    """
    bronze_path = f"s3a://lakehouse/comments_2021/comments_2021/{table_name}"
    
    df = spark.read.parquet(bronze_path)
    
    if limit:
        df = df.limit(limit)
        
    print(f"Leídos {df.count()} registros de {table_name}")
    return df

In [3]:
def transform_comments_to_silver():
    """
    Transforma datos de comments - los textos están en formato binary/UTF-8, no Base64
    """
    # Leer datos de bronze
    comments_df = read_bronze_data("1760381569.7765708.45804a8704.parquet", limit=100000)
    
    # Agregar columna de fecha de cargue
    current_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    
    # Función UDF para convertir binary a string UTF-8
    from pyspark.sql.functions import udf
    from pyspark.sql.types import StringType
    
    def binary_to_utf8(binary_data):
        try:
            if binary_data is None:
                return None
            return binary_data.decode('utf-8')
        except Exception as e:
            return f"[DECODE_ERROR: {str(e)}]"
    
    binary_to_utf8_udf = udf(binary_to_utf8, StringType())
    
    silver_comments = (
        comments_df
        .withColumn("load_date", lit(current_timestamp).cast("timestamp"))
        .withColumn("comment_year", year(to_timestamp(col("creation_date"))))
        .withColumn("comment_month", month(to_timestamp(col("creation_date"))))
        .withColumn("comment_day", dayofmonth(to_timestamp(col("creation_date"))))
        .withColumnRenamed("id", "comment_id")
        .withColumnRenamed("text", "comment_text_binary")
        # Convertir binary a string UTF-8
        .withColumn("comment_text", binary_to_utf8_udf(col("comment_text_binary")))
        .withColumn("user_display_name_decoded", 
                   binary_to_utf8_udf(col("user_display_name")))
        .withColumn("text_length", 
                   when(col("comment_text").isNotNull(), 
                        length(col("comment_text")))
                   .otherwise(0))
        .withColumn("has_user_display_name", 
                   when(col("user_display_name_decoded").isNull() | 
                        (col("user_display_name_decoded") == ""), 
                        False).otherwise(True))
        .withColumn("score_category",
                   when(col("score") >= 5, "high")
                   .when(col("score") >= 1, "medium")
                   .otherwise("low"))
        .withColumn("is_text_decoded", 
                   ~col("comment_text").contains("[DECODE_ERROR]"))
        .select(
            "comment_id",
            "post_id",
            "score",
            "score_category",
            "comment_text",  # Texto decodificado
            "text_length",
            "is_text_decoded",
            "creation_date",
            "comment_year",
            "comment_month", 
            "comment_day",
            "user_id",
            "user_display_name_decoded",  # Display name decodificado
            "has_user_display_name",
            "load_date"
        )
    )
    
    return silver_comments

In [4]:
def setup_nessie_namespaces():
    """
    Crear los namespaces necesarios en Nessie
    """
    print("=== CONFIGURANDO NAMESPACES EN NESSIE ===")
    
    try:
        # Verificar namespaces existentes
        print("Namespaces existentes:")
        spark.sql("SHOW NAMESPACES IN nessie").show()
    except Exception as e:
        print(f"Error mostrando namespaces: {e}")
    
    # Crear namespace silver si no existe
    try:
        print("Creando namespace 'silver'...")
        spark.sql("CREATE NAMESPACE IF NOT EXISTS nessie.silver")
        print("✅ Namespace 'silver' creado exitosamente")
    except Exception as e:
        print(f"Error creando namespace silver: {e}")
        # Intentar con comando alternativo
        try:
            spark.sql("CREATE SCHEMA IF NOT EXISTS nessie.silver")
            print("✅ Schema 'silver' creado exitosamente")
        except Exception as e2:
            print(f"Error creando schema: {e2}")
    
    # Verificar que se creó
    try:
        print("Namespaces después de la creación:")
        spark.sql("SHOW NAMESPACES IN nessie").show()
    except Exception as e:
        print(f"Error verificando namespaces: {e}")

# Ejecutar la configuración primero
setup_nessie_namespaces()

=== CONFIGURANDO NAMESPACES EN NESSIE ===
Namespaces existentes:
+---------+
|namespace|
+---------+
|   silver|
+---------+

Creando namespace 'silver'...
✅ Namespace 'silver' creado exitosamente
Namespaces después de la creación:
+---------+
|namespace|
+---------+
|   silver|
+---------+



In [5]:
def merge_into_silver_table(silver_df, table_name, key_columns):
    """
    Realiza MERGE en tabla Iceberg con mejor manejo de errores
    """
    silver_table_path = f"nessie.silver.{table_name}"
    
    print(f"Intentando MERGE en: {silver_table_path}")
    
    try:
        # Verificar si la tabla existe
        spark.sql(f"DESCRIBE {silver_table_path}").show()
        table_exists = True
        print(f"Tabla {silver_table_path} existe")
    except Exception as e:
        table_exists = False
        print(f"Tabla {silver_table_path} no existe, creándola: {e}")
    
    if not table_exists:
        print(f"Creando nueva tabla Iceberg: {silver_table_path}")
        try:
            # Crear la tabla si no existe
            (silver_df
             .writeTo(silver_table_path)
             .using("iceberg")
             .createOrReplace())
            print("Tabla creada exitosamente")
        except Exception as e:
            print(f"Error creando tabla: {e}")
            # Intentar con CREATE TABLE SQL
            try:
                # Crear tabla manualmente
                create_table_sql = f"""
                CREATE TABLE {silver_table_path} (
                    comment_id LONG,
                    post_id LONG,
                    score LONG,
                    score_category STRING,
                    comment_text STRING,
                    text_length INT,
                    is_text_decoded BOOLEAN,
                    creation_date TIMESTAMP,
                    comment_year INT,
                    comment_month INT,
                    comment_day INT,
                    user_id LONG,
                    user_display_name_decoded STRING,
                    has_user_display_name BOOLEAN,
                    load_date TIMESTAMP
                ) USING iceberg
                """
                spark.sql(create_table_sql)
                print("Tabla creada con SQL exitosamente")
            except Exception as e2:
                print(f"Error creando tabla con SQL: {e2}")
                return
    else:
        print(f"Realizando MERGE en tabla existente: {silver_table_path}")
        
        try:
            # Crear temp view para el MERGE
            silver_df.createOrReplaceTempView("updates")
            
            # Construir condición ON dinámica
            on_condition = " AND ".join([f"target.{col} = updates.{col}" for col in key_columns])
            
            # Construir columnas para UPDATE (excluyendo keys)
            update_columns = [col for col in silver_df.columns if col not in key_columns]
            update_set = ", ".join([f"{col} = updates.{col}" for col in update_columns])
            
            # Construir columnas para INSERT
            insert_columns = ", ".join(silver_df.columns)
            insert_values = ", ".join([f"updates.{col}" for col in silver_df.columns])
            
            # Ejecutar MERGE
            merge_sql = f"""
            MERGE INTO {silver_table_path} AS target
            USING updates
            ON {on_condition}
            WHEN MATCHED THEN
                UPDATE SET {update_set}
            WHEN NOT MATCHED THEN
                INSERT ({insert_columns})
                VALUES ({insert_values})
            """
            
            print("Ejecutando MERGE...")
            spark.sql(merge_sql)
            print("MERGE completado exitosamente")
            
        except Exception as e:
            print(f"Error en MERGE: {e}")
            print("Intentando INSERT OVERWRITE como fallback...")
            try:
                # Fallback: usar overwrite
                (silver_df
                 .writeTo(silver_table_path)
                 .using("iceberg")
                 .overwrite())
                print("OVERWRITE completado exitosamente")
            except Exception as e2:
                print(f"Error en OVERWRITE: {e2}")
    
    # Mostrar estadísticas
    try:
        final_count = spark.sql(f"SELECT COUNT(*) as total FROM {silver_table_path}").collect()[0]['total']
        print(f"Total registros en {table_name}: {final_count}")
    except Exception as e:
        print(f"Error contando registros: {e}")

In [6]:
def process_silver_layer():
    """
    Proceso completo para transformar Bronze → Silver
    """
    print("=== INICIANDO PROCESO SILVER LAYER ===")
    
    # 1. Configurar namespace primero
    setup_nessie_namespaces()
    
    # 2. Transformar Comments
    print("2. Transformando Comments...")
    silver_comments = transform_comments_to_silver()
    
    # Forzar evaluación
    silver_comments = silver_comments.cache()
    record_count = silver_comments.count()
    print(f"Transformados {record_count} registros")
    
    print("Muestra de datos transformados:")
    silver_comments.select(
        "comment_id", "post_id", "score", "score_category",
        "text_length", "creation_date", "user_id", "is_text_decoded"
    ).show(10, truncate=False)
    
    # 3. Estadísticas de decodificación
    print("3. Estadísticas de decodificación:")
    silver_comments.groupBy("is_text_decoded").count().show()
    
    # 4. Hacer MERGE a tabla Iceberg
    valid_comments = silver_comments.filter(col("is_text_decoded") == True)
    
    if valid_comments.count() > 0:
        print(f"4. Realizando MERGE con {valid_comments.count()} comentarios válidos...")
        
        # Usar método más simple para la primera creación
        try:
            # Primero crear la tabla si no existe
            (valid_comments.limit(1)  # Solo schema
             .writeTo("nessie.silver.comments")
             .using("iceberg")
             .createOrReplace())
            print("✅ Tabla 'comments' creada exitosamente")
            
            # Luego hacer append de los datos
            valid_comments.writeTo("nessie.silver.comments").append()
            print("✅ Datos insertados exitosamente")
            
        except Exception as e:
            print(f"❌ Error en proceso principal: {e}")
            print("Intentando método alternativo...")
            
            # Método alternativo: crear con SQL
            try:
                create_sql = """
                CREATE TABLE IF NOT EXISTS nessie.silver.comments (
                    comment_id LONG,
                    post_id LONG,
                    score LONG,
                    score_category STRING,
                    comment_text STRING,
                    text_length INT,
                    is_text_decoded BOOLEAN,
                    creation_date TIMESTAMP,
                    comment_year INT,
                    comment_month INT,
                    comment_day INT,
                    user_id LONG,
                    user_display_name_decoded STRING,
                    has_user_display_name BOOLEAN,
                    load_date TIMESTAMP
                ) USING iceberg
                """
                spark.sql(create_sql)
                print("✅ Tabla creada con SQL")
                
                # Insertar datos
                valid_comments.writeTo("nessie.silver.comments").append()
                print("✅ Datos insertados")
                
            except Exception as e2:
                print(f"❌ Error método alternativo: {e2}")
    else:
        print("4. No hay comentarios válidos para procesar")
    
    # 5. Verificar datos en Silver
    print("5. Verificando datos en Silver...")
    try:
        spark.sql("SHOW TABLES IN nessie.silver").show()
        spark.sql("SELECT COUNT(*) as total FROM nessie.silver.comments").show()
    except Exception as e:
        print(f"Error verificando datos: {e}")
    
    print("=== PROCESO SILVER COMPLETADO ===")
    
    return silver_comments

In [None]:
process_silver_layer()

=== INICIANDO PROCESO SILVER LAYER ===
=== CONFIGURANDO NAMESPACES EN NESSIE ===
Namespaces existentes:
+---------+
|namespace|
+---------+
|   silver|
+---------+

Creando namespace 'silver'...
✅ Namespace 'silver' creado exitosamente
Namespaces después de la creación:
+---------+
|namespace|
+---------+
|   silver|
+---------+

2. Transformando Comments...


25/10/13 19:30:18 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

Leídos 100000 registros de 1760381569.7765708.45804a8704.parquet


                                                                                

Transformados 100000 registros
Muestra de datos transformados:
+----------+-------+-----+--------------+-----------+-----------------------+--------+---------------+
|comment_id|post_id|score|score_category|text_length|creation_date          |user_id |is_text_decoded|
+----------+-------+-----+--------------+-----------+-----------------------+--------+---------------+
|116230672 |5249797|0    |low           |160        |2021-01-15 15:32:10.917|1999993 |true           |
|120551781 |5250531|0    |low           |105        |2021-07-01 12:06:14.767|144408  |true           |
|123758678 |5255237|0    |low           |504        |2021-11-17 19:04:42.303|1028230 |true           |
|117245192 |5256426|1    |medium        |355        |2021-02-22 15:16:04.207|1250772 |true           |
|121632250 |5256470|0    |low           |134        |2021-08-17 21:00:45.4  |13738662|true           |
|123328265 |5259967|0    |low           |116        |2021-10-29 14:55:05.95 |40899   |true           |
|120487510

                                                                                

✅ Tabla 'comments' creada exitosamente


                                                                                

✅ Datos insertados exitosamente
5. Verificando datos en Silver...
+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|   silver| comments|      false|
+---------+---------+-----------+

