In [1]:
import pandas as pd
import re
import time

from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, explode, from_json, col, current_date,current_timestamp, lit
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StringType, StructType, StructField, ArrayType


from pyspark.sql import SparkSession
import pyspark

In [6]:
spark.createDataFrame([(1, "test")], ["id", "dummy"]).writeTo("nessie.silver.test").createOrReplace()


In [2]:
CATALOG_URI = "http://nessie:19120/api/v1"
WAREHOUSE = "s3a://silver/"              
STORAGE_URI = "http://minio:9000"
AWS_ACCESS_KEY = "admin"
AWS_SECRET_KEY = "password"

In [3]:
conf = (
    pyspark.SparkConf()
        .setAppName('silver_transform')

        # 📦 Dependencias necesarias para Spark + Iceberg + Nessie + AWS SDK + Hadoop AWS
        .set("spark.jars.packages", ",".join([
            "org.postgresql:postgresql:42.7.3",
            "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0",
            "org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.77.1",
            "software.amazon.awssdk:bundle:2.24.8",
            "software.amazon.awssdk:url-connection-client:2.24.8",
            "org.apache.hadoop:hadoop-aws:3.3.4"
        ]))

        # 🧩 Extensiones necesarias para Iceberg + Nessie
        .set("spark.sql.extensions", ",".join([
            "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
            "org.projectnessie.spark.extensions.NessieSparkSessionExtensions"
        ]))

        # 🗂️ Configuración del catálogo Nessie
        .set("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog")
        .set("spark.sql.catalog.nessie.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog")
        .set("spark.sql.catalog.nessie.uri", CATALOG_URI)
        .set("spark.sql.catalog.nessie.ref", "main")
        .set("spark.sql.catalog.nessie.authentication.type", "NONE")
        .set("spark.sql.catalog.nessie.io-impl", "org.apache.iceberg.hadoop.HadoopFileIO")
        .set("spark.sql.catalog.nessie.warehouse", WAREHOUSE)

        # ☁️ Configuración de Hadoop S3A para MinIO
        .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .set("spark.hadoop.fs.s3a.endpoint", STORAGE_URI)
        .set("spark.hadoop.fs.s3a.path.style.access", "true")
        .set("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY)
        .set("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_KEY)
        .set("spark.hadoop.fs.s3a.aws.credentials.provider",
             "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
        .set("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")

        # ⚡ Optimizaciones de ejecución y lectura
        .set("spark.sql.execution.arrow.pyspark.enabled", "true")  # ✅ habilita PyArrow
        .set("spark.sql.parquet.filterPushdown", "true")           # ✅ lee solo columnas necesarias
        .set("spark.sql.parquet.mergeSchema", "false")
        .set("spark.sql.shuffle.partitions", "32")                # 🔧 más particiones para evitar sobrecarga
        .set("spark.sql.files.maxPartitionBytes", "128MB")        # ⚖️ reduce tamaño de tarea

        # 🔧 Recursos
        .set('spark.driver.memory', '8g')             # más memoria para driver
        .set('spark.executor.memory', '8g')           # más memoria por executor
        .set('spark.executor.cores', '2')             # cores por executor
        .set('spark.driver.maxResultSize', '4g')      # evita overflow al traer datos grandes

        # ⚙️ Optimizaciones de escritura
        .set('spark.sql.parquet.compression.codec', 'snappy')
)



In [4]:
spark = SparkSession.builder \
    .config(conf=conf) \
    .getOrCreate()
print("Spark Session Started")

Spark Session Started


In [5]:
spark.conf.get("spark.sql.catalog.nessie.uri")


'http://nessie:19120/api/v1'

In [7]:
try:
    from py4j.java_gateway import java_import

    # Import Hadoop classes
    java_import(spark._jvm, 'org.apache.hadoop.fs.FileSystem')
    java_import(spark._jvm, 'org.apache.hadoop.fs.Path')
    java_import(spark._jvm, 'java.net.URI')

    # ✅ Create S3A-aware FileSystem
    fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(
        spark._jvm.java.net.URI("s3a://bronze/"),
        spark._jsc.hadoopConfiguration()
    )

    def list_parquet_files(base_path):
        results = []
        path = spark._jvm.org.apache.hadoop.fs.Path(base_path)

        if not fs.exists(path):
            print(f"⚠️ Path does not exist: {base_path}")
            return results

        def recurse(p):
            try:
                status_list = fs.listStatus(p)
                for f in status_list:
                    if f.isDirectory():
                        recurse(f.getPath())
                    elif f.getPath().getName().endswith(".parquet"):
                        results.append(f.getPath().toString())
            except Exception as e:
                print(f"⚠️ Error listing {p}: {e}")

        recurse(path)
        return results

    # ===============================
    # List posts and votes folders
    # ===============================
    for subdir in ["posts", "votes"]:
        base = f"s3a://bronze/{subdir}/"
        print(f"\n🔍 Revisando {base}")

        try:
            path = spark._jvm.org.apache.hadoop.fs.Path(base)
            status = fs.listStatus(path)

            if len(status) == 0:
                print("⚠️ The bucket exists but is empty.")
            else:
                parquet_files = list_parquet_files(base)
                if parquet_files:
                    print(f"📦 {len(parquet_files)} Parquet files found in{subdir}:")
                    for f in parquet_files:
                        print("   📄", f)
                else:
                    print("⚠️ Not .parquet files found", base)

        except Exception as e:
            print("❌ Error listing bucket:", e)

except Exception as e:
    print("❌ Error:", e)





🔍 Revisando s3a://bronze/posts/
📦 2 Parquet files found inposts:
   📄 s3a://bronze/posts/2022/2022.parquet
   📄 s3a://bronze/posts/2023/2023.parquet

🔍 Revisando s3a://bronze/votes/
📦 2 Parquet files found invotes:
   📄 s3a://bronze/votes/_2022/1760206920.323453.f8f40d786f.parquet
   📄 s3a://bronze/votes/_2023/1760206962.9017577.6d111cba5f.parquet


In [8]:
posts_paths = list_parquet_files("s3a://bronze/posts/")
votes_paths = list_parquet_files("s3a://bronze/votes/")

In [9]:
def filter_by_year(paths, year):
    pattern = re.compile(rf"[_/\.]{year}([_/\.]|$)")
    return [p for p in paths if pattern.search(p)]

votes_2022_paths = filter_by_year(votes_paths, 2022)
votes_2023_paths = filter_by_year(votes_paths, 2023)
posts_2022_paths = filter_by_year(posts_paths, 2022)
posts_2023_paths = filter_by_year(posts_paths, 2023)


In [10]:
votes_2022 = spark.read.parquet(*votes_2022_paths)
votes_2023 = spark.read.parquet(*votes_2023_paths)
posts_2022 = spark.read.parquet(*posts_2022_paths)
posts_2023 = spark.read.parquet(*posts_2023_paths)

In [11]:
def clean_posts(df):
    """
    Limpieza y enriquecimiento temporal de POSTS (Silver Layer).
    - Convierte CreationDate a timestamp y genera columnas auxiliares.
    - Añade columnas: creation_date_str, year, load_date.
    - Rellena valores nulos, normaliza tipos y elimina duplicados.
    """
    try:
        print("🧹 Cleaning & enriching POSTS dataset...")

        
        # Eliminar duplicados
        df = df.dropDuplicates(["Id"])

        # Asegurar tipos numéricos coherentes
        df = df.withColumn("Score", F.coalesce(F.col("Score").cast("long"), F.lit(0))) \
               .withColumn("ViewCount", F.coalesce(F.col("ViewCount").cast("long"), F.lit(0))) \
               .withColumn("AnswerCount", F.coalesce(F.col("AnswerCount").cast("long"), F.lit(0))) \
               .withColumn("CommentCount", F.coalesce(F.col("CommentCount").cast("long"), F.lit(0))) \
               .withColumn("FavoriteCount", F.coalesce(F.col("FavoriteCount").cast("long"), F.lit(0)))

        # Convertir 'Body' a string y reemplazar nulos por vacío
        df = df.withColumn("Body", F.col("Body").cast("string"))
        df = df.withColumn("Body", F.when(F.col("Body").isNull(), F.lit("")).otherwise(F.col("Body")))

        # Convertir fechas
        date_columns = [
            "CreationDate", "LastEditDate", "LastActivityDate",
            "CommunityOwnedDate", "ClosedDate"
        ]
        for col in date_columns:
            if col in df.columns:
                df = df.withColumn(col, F.to_timestamp(F.col(col)))

        # Agregar columnas de fecha derivadas
        df = df.withColumn("creation_date", F.col("CreationDate")) \
               .withColumn("creation_date_str", F.date_format(F.col("creation_date"), "yyyy-MM-dd")) \
               .withColumn("year", F.year(F.col("creation_date")))

        print("✅ POSTS cleaning & enrichment complete.\n")
        return df

    except Exception as e:
        print(f"❌ Error cleaning POSTS: {e}")
        return df


In [12]:
def clean_votes(df):
    """
    Cleans and enriches the VOTES dataset for the Silver layer.
    Includes timestamp normalization, temporal enrichment, and null handling.
    """
    try:
        print("🧹 Cleaning & enriching VOTES dataset...")

        # Remove duplicates based on 'id'
        df = df.dropDuplicates(["id"])

        # Handle missing values
        # Replace null bounty_amount with median or 0
        if df.filter(F.col("bounty_amount").isNotNull()).count() > 0:
            median_bounty = df.approxQuantile("bounty_amount", [0.5], 0.1)[0]
        else:
            median_bounty = 0
        df = df.fillna({"bounty_amount": median_bounty})

        # Replace missing user_id or vote_type_id with -1 (unknown)
        df = df.fillna({"user_id": -1, "vote_type_id": -1, "post_id": -1})

        # Normalize timestamp and enrich with temporal columns
        df = (
            df.withColumn("creation_date", F.to_timestamp(F.col("creation_date")))
              .withColumn("creation_date_str", F.date_format(F.col("creation_date"), "yyyy-MM-dd"))
              .withColumn("year", F.year(F.col("creation_date")))
              .withColumn("load_date", F.current_timestamp())
        )

        # 4Filter invalid records
        df = df.filter(F.col("id").isNotNull() & F.col("creation_date").isNotNull())

        print("✅ VOTES cleaning & enrichment complete.\n")
        return df

    except Exception as e:
        print(f"❌ Error cleaning VOTES: {e}")
        return df


In [13]:
votes_2022_clean = clean_votes(votes_2022)

🧹 Cleaning & enriching VOTES dataset...
✅ VOTES cleaning & enrichment complete.



In [14]:
votes_2023_clean = clean_votes(votes_2023)

🧹 Cleaning & enriching VOTES dataset...
✅ VOTES cleaning & enrichment complete.



In [15]:
posts_2022_clean = clean_posts(posts_2022)

🧹 Cleaning & enriching POSTS dataset...
✅ POSTS cleaning & enrichment complete.



In [16]:
posts_2023_clean = clean_posts(posts_2023)

🧹 Cleaning & enriching POSTS dataset...
✅ POSTS cleaning & enrichment complete.



In [17]:
spark.sql("CREATE NAMESPACE IF NOT EXISTS nessie.silver").show()

++
||
++
++



In [18]:
def safe_write_batches(df, table_name, catalog="nessie.silver", repartitions=32, batch_size=100000, max_retries=3):
    """
    Escribe un DataFrame en Nessie en batches para evitar saturar la conexión.
    """
    df = df.repartition(repartitions, "id" if "id" in df.columns else "Id")
    total_rows = df.count()
    print(f"Total rows to write: {total_rows}")

    start = 0
    while start < total_rows:
        end = start + batch_size
        batch_df = df.limit(end).subtract(df.limit(start))  # toma el batch
        for attempt in range(1, max_retries + 1):
            try:
                batch_df.writeTo(f"{catalog}.{table_name}").createOrReplace()
                print(f"✅ Batch {start}-{end} written successfully")
                break
            except Exception as e:
                print(f"⚠️ Attempt {attempt} for batch {start}-{end} failed: {e}")
                if attempt < max_retries:
                    print("⏳ Retrying in 5 seconds...")
                    time.sleep(5)
                else:
                    raise RuntimeError(f"Failed to write batch {start}-{end} after {max_retries} attempts") from e
        start = end

def merge_votes(votes_dfs, table_name="votes", catalog="nessie.silver", repartitions=32, batch_size=100000):
    """
    Merge de varios DataFrames de votes, eliminando duplicados y manteniendo último registro por Id.
    """
    if isinstance(votes_dfs, dict):
        votes_dfs = list(votes_dfs.values())

    combined = votes_dfs[0]
    for df in votes_dfs[1:]:
        combined = combined.unionByName(df, allowMissingColumns=True)

    key_col = "id" if "id" in combined.columns else "Id"
    window = Window.partitionBy(key_col).orderBy(F.desc("creation_date"))
    combined = combined.withColumn("rank", F.row_number().over(window))\
                       .filter(F.col("rank") == 1).drop("rank")

    safe_write_batches(combined, table_name, catalog=catalog, repartitions=repartitions, batch_size=batch_size)

def merge_posts(posts_dfs, table_name="posts", catalog="nessie.silver", repartitions=32, batch_size=100000):
    """
    Merge de varios DataFrames de posts, eliminando duplicados y manteniendo último registro por Id.
    """
    if isinstance(posts_dfs, dict):
        posts_dfs = list(posts_dfs.values())

    combined = posts_dfs[0]
    for df in posts_dfs[1:]:
        combined = combined.unionByName(df, allowMissingColumns=True)

    key_col = "id" if "id" in combined.columns else "Id"
    window = Window.partitionBy(key_col).orderBy(F.desc("LastActivityDate"))
    combined = combined.withColumn("rank", F.row_number().over(window))\
                       .filter(F.col("rank") == 1).drop("rank")

    safe_write_batches(combined, table_name, catalog=catalog, repartitions=repartitions, batch_size=batch_size)


In [16]:
def merge_posts(posts_dfs, table_name="posts", catalog="nessie.silver", repartitions=32):
    """
    Merge de varios DataFrames de posts, eliminando duplicados y manteniendo último registro por Id.
    - posts_dfs: lista o diccionario de DataFrames de posts (por año o periodo)
    - repartitions: número de particiones para escribir en Nessie
    """
    # Convertir diccionario a lista si es necesario
    if isinstance(posts_dfs, dict):
        posts_dfs = list(posts_dfs.values())
    
    # Combinar todos los DataFrames
    combined = posts_dfs[0]
    for df in posts_dfs[1:]:
        combined = combined.unionByName(df, allowMissingColumns=True)
    
    # Mantener solo último registro por Id usando last_activity_date
    window = Window.partitionBy("Id").orderBy(F.desc("LastActivityDate"))
    combined = combined.withColumn("rank", F.row_number().over(window))\
                       .filter(F.col("rank") == 1).drop("rank")
    
    # Reparticiona para balancear la carga
    combined = combined.repartition(repartitions, "id")
    
    # Escritura tipo MERGE en Nessie
    combined.writeTo(f"{catalog}.{table_name}").createOrReplace()
    print(f"✅ Posts merged and written to {catalog}.{table_name}")

In [17]:
posts_dfs = {
    "posts_2022": posts_2022_clean,
    "posts_2023": posts_2023_clean,
}

In [18]:
merge_posts(posts_dfs, batch_size=1000)

Total rows to write: 5082178


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


⚠️ Attempt 1 for batch 0-1000 failed: An error occurred while calling o475.createOrReplace
⏳ Retrying in 5 seconds...
⚠️ Attempt 2 for batch 0-1000 failed: [Errno 111] Connection refused
⏳ Retrying in 5 seconds...
⚠️ Attempt 3 for batch 0-1000 failed: [Errno 111] Connection refused


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


RuntimeError: Failed to write batch 0-1000 after 3 attempts

In [17]:
def merge_votes(votes_dfs, table_name="votes", catalog="nessie.silver", repartitions=32):
    """
    Merge de varios DataFrames de votes, eliminando duplicados y manteniendo último registro por Id.
    - votes_dfs: lista o diccionario de DataFrames de votes (por año o periodo)
    - repartitions: número de particiones para escribir en Nessie
    """
    # Convertir diccionario a lista si es necesario
    if isinstance(votes_dfs, dict):
        votes_dfs = list(votes_dfs.values())
    
    # Combinar todos los DataFrames
    combined = votes_dfs[0]
    for df in votes_dfs[1:]:
        combined = combined.unionByName(df, allowMissingColumns=True)
    
    # Mantener solo último registro por Id usando creation_date
    if "id" in combined.columns:
        window = Window.partitionBy("id").orderBy(F.desc("creation_date"))
        combined = combined.withColumn("rank", F.row_number().over(window))\
                           .filter(F.col("rank") == 1).drop("rank")
    elif "Id" in combined.columns:
        window = Window.partitionBy("Id").orderBy(F.desc("creation_date"))
        combined = combined.withColumn("rank", F.row_number().over(window))\
                           .filter(F.col("rank") == 1).drop("rank")
    
    # Reparticiona para balancear la carga
    combined = combined.repartition(repartitions, "id" if "id" in combined.columns else "Id")
    
    # Escritura tipo MERGE en Nessie
    combined.writeTo(f"{catalog}.{table_name}").createOrReplace()
    print(f"✅ Votes merged and written to {catalog}.{table_name}")


In [19]:
votes_dfs = {
    "votes_2022": votes_2022_clean,
    "votes_2023": votes_2023_clean
}

In [21]:
merge_votes(votes_dfs, batch_size=100)

Total rows to write: 27061227


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 55090)
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/socketserver.py", line 317,

⚠️ Attempt 1 for batch 0-100 failed: An error occurred while calling o495.createOrReplace
⏳ Retrying in 5 seconds...
⚠️ Attempt 2 for batch 0-100 failed: [Errno 111] Connection refused
⏳ Retrying in 5 seconds...
⚠️ Attempt 3 for batch 0-100 failed: [Errno 111] Connection refused


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


RuntimeError: Failed to write batch 0-100 after 3 attempts

In [None]:
spark.stop()

In [None]:
for df_name, df in votes_dfs.items():
    print(df_name)
    df.printSchema()
