In [5]:
!pip install --upgrade s3fs



In [None]:
!pip install dlt
!pip install minio
!pip install --upgrade pyarrow
!pip install --upgrade dlt
!pip install --user "dlt[s3]"

In [None]:
import boto3
from botocore.exceptions import EndpointConnectionError, ClientError

# Aquí define tus credenciales para revisar si conecta Jupyter con Minio. ¡Importante hacer esto para estar seguro de tus rutas!
ENDPOINT = "http://minio:9000"
ACCESS   = "minioadmin"
SECRET   = "minioadmin"

try:
    s3 = boto3.client("s3",
                      endpoint_url=ENDPOINT,
                      aws_access_key_id=ACCESS,
                      aws_secret_access_key=SECRET)
    response = s3.list_buckets()
    response = s3.list_buckets()
    print("Conectado correctamente a MinIO")
    print("Buckets encontrados:", [b['Name'] for b in response.get('Buckets', [])])
except EndpointConnectionError as e:
    print("¡ERROR! No se pudo conectar al endpoint:", e)
except ClientError as e:
    print("¡OJO! Error de cliente S3:", e)
except Exception as e:
    print("Otro error:", e)


In [None]:
import os
import boto3
from botocore.exceptions import ClientError

s3 = boto3.client(
    "s3",
    endpoint_url="http://minio:9000", 
    aws_access_key_id="minioadmin",
    aws_secret_access_key="minioadmin"
)

bucket = "lakehouse"
base_path = "/data/StackOverflowData"  # carpeta dentro DEL CONTENEDOR JUPYTER

# Verificación por si la ruta no exite.
if not os.path.exists(base_path):
    raise FileNotFoundError(f"No se encuentra la ruta {base_path}. Verifica el volumen en docker-compose.")

# Crear bronze por si no existe.
prefix = "bronze/"
try:
    s3.list_objects_v2(Bucket=bucket, Prefix=prefix, MaxKeys=1)
except ClientError as e:
    print("Error al acceder al bucket:", e)

# Subir todo lo que sea .parquet! AVISO: Tarda su tiempo todo depende de los parquets seleccionados.
EXCLUIR = {"Comments2021.parquet"} 

files = [f for f in os.listdir(base_path) if f.endswith(".parquet")]
print(f"Encontrados {len(files)} archivos parquet en {base_path}")

for i, file in enumerate(files, 1):
    if file in EXCLUIR:
        print(f"[{i}/{len(files)}] Saltando {file} (se cargará con DLT)")
        continue  # pasa al siguiente archivo

    src = os.path.join(base_path, file)
    dest = f"{prefix}{file}"
    print(f"[{i}/{len(files)}] Subiendo {file} a MinIO ...", end=" ")
    s3.upload_file(src, bucket, dest)
    print("LISTO!")

print("Ingesta Bronze completada.")

In [3]:
import os
import requests
import tempfile
import pyarrow.parquet as pq
import dlt
import boto3
from botocore.exceptions import ClientError

URL = "https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/comments/2021.parquet"

# MinIO (ajusta si corres fuera del contenedor)
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
MINIO_ACCESS   = os.environ.get("MINIO_ACCESS_KEY_ID", "minioadmin")
MINIO_SECRET   = os.environ.get("MINIO_SECRET_ACCESS_KEY", "minioadmin")
BUCKET         = os.environ.get("MINIO_BUCKET", "lakehouse")
PREFIX         = "bronze"   # destino lógico en Bronze

# DLT: rutas y opciones
PIPELINE_NAME  = "bronze_ingest_dlt_comments2021"
DATASET_NAME   = "comments_2021"          # nombre lógico del dataset en filesystem (no afecta MinIO)
PIPELINES_DIR  = "/workspace/.dlt"      # estado de dlt en disco


@dlt.resource(name="comments_2021", write_disposition="replace")
def comments_2021_arrow():
    """Descarga el Parquet remoto y lo entrega como Arrow Table (streaming seguro)."""
    print(f"Descargando: {URL}")
    with requests.get(URL, stream=True) as r:
        r.raise_for_status()
        with tempfile.NamedTemporaryFile(suffix=".parquet") as tmp:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    tmp.write(chunk)
            tmp.flush()
            table = pq.read_table(tmp.name)  # pyarrow.Table
            print(f"Arrow OK: {table.num_rows} filas, {table.num_columns} columnas")
            # DLT acepta pyarrow.Table; si tu versión no, descomenta el yield de dicts:
            # for rec in table.to_pylist(): yield rec
            yield table


def ensure_bucket(s3, bucket: str):
    try:
        s3.head_bucket(Bucket=bucket)
        print(f"Bucket '{bucket}' existe.")
    except ClientError:
        s3.create_bucket(Bucket=bucket)
        print(f"Bucket '{bucket}' creado.")

def upload_folder_to_minio(local_folder: str, bucket: str, prefix: str):
    """Sube todo el contenido generado por DLT (parquet/metadata) bajo bronze/posts/2021/"""
    s3 = boto3.client(
        "s3",
        endpoint_url=MINIO_ENDPOINT,
        aws_access_key_id=MINIO_ACCESS,
        aws_secret_access_key=MINIO_SECRET,
    )
    ensure_bucket(s3, bucket)


    if not prefix.endswith("/"):
        prefix += "/"
    s3.put_object(Bucket=bucket, Key=prefix)

    uploaded = 0
    for root, _, files in os.walk(local_folder):
        for fname in files:
            local_path = os.path.join(root, fname)
            # clave relativa (conserva estructura de salida de DLT)
            rel = os.path.relpath(local_path, local_folder).replace("\\", "/")
            s3_key = f"{prefix}{rel}"
            print(f"{local_path} → s3://{bucket}/{s3_key}")
            s3.upload_file(local_path, bucket, s3_key)
            uploaded += 1
    print(f"Subidos {uploaded} archivos a s3://{bucket}/{prefix}")


def main():
    pipeline = dlt.pipeline(
        pipeline_name=PIPELINE_NAME,
        destination="filesystem",           # primero a disco
        dataset_name=DATASET_NAME,
        pipelines_dir=PIPELINES_DIR,
        dev_mode=True,
    )

    print("Iniciando ingesta DLT (posts 2021)...")
    load_info = pipeline.run(
        comments_2021_arrow(),
        loader_file_format="parquet",       # fuerza parquet en filesystem
    )
    print("DLT finalizado.")
    print(load_info)

    # 2) Subir a MinIO → lakehouse/bronze/posts/2021/
    output_dir = pipeline.dataset_path()    # carpeta donde DLT escribió parquet
    print(f"Output local DLT: {output_dir}")
    upload_folder_to_minio(output_dir, BUCKET, PREFIX)
    print("Proceso completado.")


if __name__ == "__main__":
    main()


Iniciando ingesta DLT (posts 2021)...
Descargando: https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/comments/2021.parquet
Arrow OK: 5536558 filas, 7 columnas


PipelineStepFailed: Pipeline execution failed at `step=load` when processing package with `load_id=1760031133.0536427` with exception:

<class 'PermissionError'>
Forbidden