# Desarrollo de la Capa Bronze - Lakehouse StackOverflow

### Criterios Bronze
- Datos crudos en Parquet.
- Sobrescritura permitida.
- Datos de 2020 y 2021.
- Namespace: `bronze`.
- Warehouse: `s3a://warehouse` en MinIO.

**Fuentes de Datos** (Parquet crudos):
- Posts: 2020/2021
- Votes: 2020/2021
- Comments: 2020/2021
- PostHistory: 2020/2021
- Users, Badges, PostLinks: Completos.

In [None]:
from pyspark.sql import SparkSession

# Connection settings for the MinIO endpoint used through the Hadoop S3A connector.
MINIO_ENDPOINT = "http://minio:9000"
AWS_ACCESS_KEY = "admin"
AWS_SECRET_KEY = "password"

# S3A options, applied to the session builder in insertion order.
_s3a_options = {
    "spark.hadoop.fs.s3a.access.key": AWS_ACCESS_KEY,
    "spark.hadoop.fs.s3a.secret.key": AWS_SECRET_KEY,
    "spark.hadoop.fs.s3a.endpoint": MINIO_ENDPOINT,
    "spark.hadoop.fs.s3a.path.style.access": "true",
}

_builder = SparkSession.builder.appName("BronzeIngestion-StackOverflow-Parquet")
for _option, _setting in _s3a_options.items():
    _builder = _builder.config(_option, _setting)

# Reuse an already running session if there is one, otherwise start a new one.
spark = _builder.getOrCreate()

print("✅ SparkSession creada exitosamente para Parquet en MinIO")
print(f"Versión de Spark: {spark.version}")

In [None]:
import os

def ingest_to_bronze_wget(table_name, base_url):
    """
    Download a Parquet file over HTTP into /tmp/stackoverflow/<table_name>.

    The file is stored as <table_name>.parquet inside that directory. Any
    file already present at that path is overwritten by ``wget -O``.

    Args:
        table_name: Name used for both the local directory and the file.
        base_url: Full URL of the Parquet file to download.

    Returns:
        The local path of the downloaded file.

    Raises:
        subprocess.CalledProcessError: If wget exits with a non-zero status
            (for example on an HTTP 404).
        FileNotFoundError: If the ``wget`` executable is not available.
    """
    # Imported here so the function does not depend on another cell's imports.
    import subprocess

    local_dir = f"/tmp/stackoverflow/{table_name}"
    os.makedirs(local_dir, exist_ok=True)

    # The downloaded file is named after the table.
    local_file = f"{local_dir}/{table_name}.parquet"

    try:
        # Argument list (no shell): special characters in the URL or path
        # cannot be interpreted as shell syntax. check=True surfaces failures
        # instead of reporting success for a download that did not happen.
        subprocess.run(["wget", "-O", local_file, base_url], check=True)
    except subprocess.CalledProcessError:
        # wget -O creates/truncates the output file before transferring data,
        # so a failed download leaves an unusable file behind; remove it so a
        # later read does not pick it up.
        if os.path.exists(local_file):
            os.remove(local_file)
        raise

    print(f"✅ Archivo descargado: {local_file}")
    return local_file


def ingest_to_bronze(table_name):
    """
    Copy a previously downloaded Parquet file into the bronze bucket.

    Reads /tmp/stackoverflow/<table_name>/<table_name>.parquet with the
    module-level ``spark`` session and writes it, in overwrite mode and
    Parquet format, to s3a://bronze/<table_name>.

    Returns the DataFrame that was read from the local file.
    """
    source_uri = f"file:///tmp/stackoverflow/{table_name}/{table_name}.parquet"
    target_uri = f"s3a://bronze/{table_name}"

    # Load the local file.
    raw_df = spark.read.parquet(source_uri)

    # Replace whatever is already stored for this table in the bucket.
    writer = raw_df.write.mode("overwrite")
    writer.parquet(target_uri)

    print(f"✅ Ingesta de {table_name} completada en {target_uri}")
    return raw_df


In [None]:
# Download the tables that are ingested in full (no per-year split).
# Each variable holds the local path returned by the download helper.
_PARQUET_BASE_URL = (
    "https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet"
)

postlinks = ingest_to_bronze_wget("postlinks", f"{_PARQUET_BASE_URL}/postlinks.parquet")
badges = ingest_to_bronze_wget("badges", f"{_PARQUET_BASE_URL}/badges.parquet")
users = ingest_to_bronze_wget("users", f"{_PARQUET_BASE_URL}/users.parquet")

In [None]:
# Write the fully downloaded tables to the bronze bucket.
# Each variable is rebound to the DataFrame returned by ingest_to_bronze.
postlinks = ingest_to_bronze("postlinks")
badges = ingest_to_bronze("badges")
users = ingest_to_bronze("users")

In [None]:
# Download the per-year tables (2020 and 2021) for posthistory, comments,
# votes and posts. Each variable holds the local path returned by the
# download helper; the local table name is "<table>_<year>".
_PARQUET_BASE_URL = (
    "https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet"
)

posthistory_2020 = ingest_to_bronze_wget(
    "posthistory_2020",
    f"{_PARQUET_BASE_URL}/posthistory/2020.parquet",
)

posthistory_2021 = ingest_to_bronze_wget(
    "posthistory_2021",
    f"{_PARQUET_BASE_URL}/posthistory/2021.parquet",
)

comments_2020 = ingest_to_bronze_wget(
    "comments_2020",
    f"{_PARQUET_BASE_URL}/comments/2020.parquet",
)

comments_2021 = ingest_to_bronze_wget(
    "comments_2021",
    f"{_PARQUET_BASE_URL}/comments/2021.parquet",
)

votes_2020 = ingest_to_bronze_wget(
    "votes_2020",
    f"{_PARQUET_BASE_URL}/votes/2020.parquet",
)

votes_2021 = ingest_to_bronze_wget(
    "votes_2021",
    f"{_PARQUET_BASE_URL}/votes/2021.parquet",
)

posts_2020 = ingest_to_bronze_wget(
    "posts_2020",
    f"{_PARQUET_BASE_URL}/posts/2020.parquet",
)

# The year in this URL was previously mistyped as "20201", which does not
# match the <year>.parquet naming used by every other file.
posts_2021 = ingest_to_bronze_wget(
    "posts_2021",
    f"{_PARQUET_BASE_URL}/posts/2021.parquet",
)

In [None]:
# Write the per-year tables to the bronze bucket. Each variable is rebound to
# the DataFrame returned by ingest_to_bronze, and every call must use the
# table name that matches its variable: the posthistory calls previously
# passed "posthistory_2021" and "postlinks", so posthistory_2020 was never
# written and posthistory_2021 ended up holding the postlinks data.
posthistory_2020 = ingest_to_bronze(
    "posthistory_2020"
)

posthistory_2021 = ingest_to_bronze(
    "posthistory_2021"
)

comments_2020 = ingest_to_bronze(
    "comments_2020"
)

comments_2021 = ingest_to_bronze(
    "comments_2021"
)

votes_2020 = ingest_to_bronze(
    "votes_2020"
)

votes_2021 = ingest_to_bronze(
    "votes_2021"
)

posts_2020 = ingest_to_bronze(
    "posts_2020"
)

posts_2021 = ingest_to_bronze(
    "posts_2021"
)