In [14]:
import pandas as pd
import re
import time

from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, explode, from_json, col, current_date,current_timestamp, lit, to_date, month

from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StringType, StructType, StructField, ArrayType


from pyspark.sql import SparkSession
import pyspark

In [15]:
import os  # imported here so this configuration cell works on its own

# Connection settings for the catalog, the warehouse and the object store.
# Each value can be overridden through an environment variable so that
# credentials do not have to be committed with the notebook; the fallbacks are
# the original literals, so behavior is unchanged when no variable is set
# (an empty variable also falls back to the default).
CATALOG_URI = os.environ.get("NESSIE_URI") or "http://nessie:19120/api/v1"
WAREHOUSE = os.environ.get("ICEBERG_WAREHOUSE") or "s3a://gold/"
STORAGE_URI = os.environ.get("S3_ENDPOINT") or "http://minio:9000"
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID") or "admin"
AWS_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY") or "password"

In [16]:
# Maven coordinates resolved at session start-up (spark.jars.packages).
spark_packages = [
    "org.postgresql:postgresql:42.7.3",
    "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0",
    "org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.77.1",
    "software.amazon.awssdk:bundle:2.24.8",
    "software.amazon.awssdk:url-connection-client:2.24.8",
    "org.apache.hadoop:hadoop-aws:3.3.4",
]

# Iceberg + Nessie SQL extensions.
sql_extensions = [
    "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    "org.projectnessie.spark.extensions.NessieSparkSessionExtensions",
]

# Every Spark property as a (key, value) pair, grouped by concern.
spark_settings = [
    # Required dependencies
    ("spark.jars.packages", ",".join(spark_packages)),

    # Iceberg + Nessie extensions
    ("spark.sql.extensions", ",".join(sql_extensions)),

    # Nessie catalog (registered under the name "nessie")
    ("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog"),
    ("spark.sql.catalog.nessie.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog"),
    ("spark.sql.catalog.nessie.uri", CATALOG_URI),
    ("spark.sql.catalog.nessie.ref", "main"),
    ("spark.sql.catalog.nessie.authentication.type", "NONE"),
    ("spark.sql.catalog.nessie.io-impl", "org.apache.iceberg.hadoop.HadoopFileIO"),
    ("spark.sql.catalog.nessie.warehouse", WAREHOUSE),

    # S3A configuration for MinIO
    ("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("spark.hadoop.fs.s3a.endpoint", STORAGE_URI),
    ("spark.hadoop.fs.s3a.path.style.access", "true"),
    ("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY),
    ("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_KEY),
    ("spark.hadoop.fs.s3a.aws.credentials.provider",
     "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"),
    ("spark.hadoop.fs.s3a.connection.ssl.enabled", "false"),

    # Execution tuning
    ("spark.sql.execution.arrow.pyspark.enabled", "true"),
    ("spark.sql.parquet.filterPushdown", "true"),
    ("spark.sql.parquet.mergeSchema", "false"),
    ("spark.sql.shuffle.partitions", "64"),          # more partitions to spread the load
    ("spark.sql.files.maxPartitionBytes", "64MB"),   # smaller tasks to avoid saturation

    # Resources and timeouts (sizing notes carried over from the original author)
    ("spark.driver.memory", "5g"),                   # driver: up to 5 GB (of the 6g available)
    ("spark.executor.memory", "6g"),                 # each executor: up to 6 GB (of the 8g available)
    ("spark.executor.cores", "4"),                   # more cores per executor for parallelism
    ("spark.driver.maxResultSize", "2g"),            # raises the driver result-size limit
    ("spark.network.timeout", "600s"),               # more generous timeout for large loads
    ("spark.executor.heartbeatInterval", "60s"),     # heartbeat kept consistent with the timeout

    # Write settings
    ("spark.sql.parquet.compression.codec", "snappy"),
]

# NOTE(review): the app name says "silver" although WAREHOUSE points at the
# gold bucket; presumably carried over from another notebook. Confirm before renaming.
conf = pyspark.SparkConf().setAppName('silver_transform').setAll(spark_settings)

In [17]:
# Start a session with the settings held in `conf`, or reuse one that is
# already active in this kernel.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)
print("Spark Session Started")

Spark Session Started


In [18]:
# Load the two input tables through the "nessie" catalog.
posts_df = spark.table("nessie.silver.posts")
votes_df = spark.table("nessie.silver.votes")

In [19]:
# Expose both inputs to Spark SQL under the view names the query refers to.
for view_name, source_frame in (("posts_view", posts_df), ("votes_view", votes_df)):
    source_frame.createOrReplaceTempView(view_name)

In [20]:
# Calendar year of the votes included in this load. It was previously a literal
# buried in the SQL; the `:d` format spec below guarantees that only an integer
# can be interpolated into the statement.
TARGET_YEAR = 2023

# Monthly vote statistics per post: one row per (post_id, year, month) that
# received at least one valid vote in TARGET_YEAR, enriched with post attributes.
#
# Changes relative to the earlier version (result rows are the same):
#   * The year filter runs before aggregation instead of after the join, so
#     votes from other years are never grouped.
#   * The join is an explicit INNER JOIN. It used to be a LEFT JOIN whose
#     unmatched rows were then discarded by `WHERE vs.year = ...`, which hid
#     the real semantics.
#   * Unused projections (creation_year, year_month) are gone.
# The COALESCE wrappers are kept on purpose: they keep upvotes/downvotes/
# total_votes non-nullable in the resulting schema.
vote_stats_per_post = spark.sql(f"""
    WITH valid_votes AS (
        -- Votes with a usable post id and vote type, restricted to the target year.
        SELECT
            v.post_id_clean AS post_id,
            v.vote_type_id_clean,
            v.creation_ts
        FROM votes_view v
        WHERE v.post_id_clean IS NOT NULL AND v.post_id_clean != -1
              AND v.vote_type_id_clean != -1
              AND YEAR(v.creation_ts) = {TARGET_YEAR:d}
    ),
    vote_summary AS (
        -- One row per post and calendar month.
        SELECT
            post_id,
            YEAR(creation_ts) AS year,
            MONTH(creation_ts) AS month,
            SUM(CASE WHEN vote_type_id_clean IN (2, 18, 20, 21, 32) THEN 1 ELSE 0 END) AS upvotes,
            SUM(CASE WHEN vote_type_id_clean IN (3, 33) THEN 1 ELSE 0 END) AS downvotes,
            COUNT(*) AS total_votes
        FROM valid_votes
        GROUP BY post_id, YEAR(creation_ts), MONTH(creation_ts)
    ),
    posts_enriched AS (
        -- Post attributes plus a readable label for PostTypeId.
        SELECT
            p.Id AS post_id,
            p.Score AS base_score,
            p.ViewCount,
            p.AnswerCount,
            p.CommentCount,
            p.FavoriteCount,
            p.PostTypeId,
            p.CreationDate,
            p.LastActivityDate,
            CASE p.PostTypeId
                WHEN 1 THEN 'Question'
                WHEN 2 THEN 'Answer'
                WHEN 3 THEN 'Orphaned Tag Wiki'
                WHEN 4 THEN 'Tag Wiki Excerpt'
                WHEN 5 THEN 'Tag Wiki'
                WHEN 6 THEN 'Moderator Nomination'
                WHEN 7 THEN 'Wiki Placeholder'
                WHEN 8 THEN 'Privilege Wiki'
                WHEN 9 THEN 'Article'
                WHEN 10 THEN 'Help Article'
                WHEN 12 THEN 'Collection'
                WHEN 13 THEN 'Moderator Questionnaire Response'
                WHEN 14 THEN 'Announcement'
                WHEN 15 THEN 'Collective Discussion'
                WHEN 17 THEN 'Collective Collection'
                ELSE 'Other'
            END AS post_type
        FROM posts_view p
    )
    SELECT
        ps.post_id,
        ps.post_type,
        vs.year,
        vs.month,
        ps.ViewCount,
        ps.AnswerCount,
        ps.CommentCount,
        ps.FavoriteCount,
        COALESCE(vs.upvotes, 0) AS upvotes,
        COALESCE(vs.downvotes, 0) AS downvotes,
        COALESCE(vs.total_votes, 0) AS total_votes,
        -- NOTE(review): if p.Score already reflects the post's votes, adding the
        -- monthly up/down votes double-counts them; confirm the intended meaning.
        (ps.base_score + COALESCE(vs.upvotes, 0) - COALESCE(vs.downvotes, 0)) AS score,
        ROUND(
            CASE WHEN vs.total_votes > 0 THEN vs.upvotes / vs.total_votes ELSE 0 END, 3
        ) AS upvote_pct,
        ROUND(
            CASE WHEN vs.total_votes > 0 THEN vs.downvotes / vs.total_votes ELSE 0 END, 3
        ) AS downvote_pct,
        CURRENT_TIMESTAMP() AS load_date
    FROM posts_enriched ps
    INNER JOIN vote_summary vs
        ON ps.post_id = vs.post_id
""")

In [21]:
# Make sure the target namespace exists before any table is written into it.
namespace_ddl = "CREATE NAMESPACE IF NOT EXISTS nessie.gold"
spark.sql(namespace_ddl)
print("✅ Namespace nessie.gold creado si no existía.")

✅ Namespace nessie.gold creado si no existía.


In [26]:
# Fully qualified name of the target table. It was not defined in any cell of
# this notebook, so a fresh "Restart & Run All" failed with NameError; the
# value matches the table name printed by the saved MERGE output.
table_path = "nessie.gold.vote_stats_per_post"

# Register the result as a temporary view so the MERGE statement can use it
# as its source, then check whether the target table already exists.
vote_stats_per_post.createOrReplaceTempView("vote_stats_updates")
table_exists = spark.catalog.tableExists(table_path)

In [29]:
# Write the result: upsert into the table when it already exists, keyed on
# (post_id, year, month); otherwise create the table from the DataFrame.
if table_exists:
    print(f"Fusionando datos en la tabla existente {table_path}...")
    merge_sql = f"""
        MERGE INTO {table_path} AS target
        USING vote_stats_updates AS source
        ON target.post_id = source.post_id 
           AND target.year = source.year 
           AND target.month = source.month
        WHEN MATCHED THEN UPDATE SET *
        WHEN NOT MATCHED THEN INSERT *
    """
    spark.sql(merge_sql)
    print(f"✅ Datos fusionados exitosamente en {table_path} usando MERGE de Iceberg con granularidad por fecha.")
else:
    print(f"⚙️ La tabla {table_path} no existe. Creándola...")
    vote_stats_per_post.writeTo(table_path).create()
    print(f"✅ Tabla {table_path} creada exitosamente.")

Fusionando datos en la tabla existente nessie.gold.vote_stats_per_post...
✅ Datos fusionados exitosamente en nessie.gold.vote_stats_per_post usando MERGE de Iceberg con granularidad por fecha.


In [30]:
# Opcional: Verificar el esquema o conteo
vote_stats_per_post.printSchema()
vote_stats_per_post.count()

root
 |-- post_id: long (nullable = true)
 |-- post_type: string (nullable = false)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- ViewCount: long (nullable = true)
 |-- AnswerCount: long (nullable = true)
 |-- CommentCount: long (nullable = true)
 |-- FavoriteCount: long (nullable = true)
 |-- upvotes: long (nullable = false)
 |-- downvotes: long (nullable = false)
 |-- total_votes: long (nullable = false)
 |-- score: long (nullable = true)
 |-- upvote_pct: double (nullable = true)
 |-- downvote_pct: double (nullable = true)
 |-- load_date: timestamp (nullable = false)



493812

In [31]:
# Preview the first five rows of the result.
vote_stats_per_post.show(n=5)

+--------+---------+----+-----+---------+-----------+------------+-------------+-------+---------+-----------+-----+----------+------------+--------------------+
| post_id|post_type|year|month|ViewCount|AnswerCount|CommentCount|FavoriteCount|upvotes|downvotes|total_votes|score|upvote_pct|downvote_pct|           load_date|
+--------+---------+----+-----+---------+-----------+------------+-------------+-------+---------+-----------+-----+----------+------------+--------------------+
|70547323| Question|2023|    3|      399|          0|           1|            0|      2|        0|          2|    5|       1.0|         0.0|2025-10-15 04:29:...|
|70547323| Question|2023|    1|      399|          0|           1|            0|      1|        0|          1|    4|       1.0|         0.0|2025-10-15 04:29:...|
|70547373| Question|2023|    3|     4694|          2|           0|            0|      1|        0|          1|    3|       1.0|         0.0|2025-10-15 04:29:...|
|70547437|   Answer|2023|   

In [32]:
# Stop the Spark session now that the notebook's work is finished.
spark.stop()