In [21]:
import os, posixpath
import boto3
from botocore.config import Config
from botocore.exceptions import ClientError
from azure.storage.blob import BlobServiceClient

# Profile name as declared in secrets.toml.
PROFILE = "s3_to_adls"

# Source-side settings and credentials, both looked up in dlt.secrets under
# the profile's "sources.filesystem" section.
_SOURCE_SECTION = f"{PROFILE}.sources.filesystem"
src_cfg = dlt.secrets[_SOURCE_SECTION]
src_creds = dlt.secrets[f"{_SOURCE_SECTION}.credentials"]

# --- MinIO (already set up by compose) ---
MINIO_ENDPOINT, MINIO_ACCESS_KEY, MINIO_SECRET_KEY = (
    src_creds[field]
    for field in ("endpoint_url", "aws_access_key_id", "aws_secret_access_key")
)
# Bucket name: drop the "s3://" scheme and keep everything before the first "/".
MINIO_BUCKET = src_cfg["bucket_url"].replace("s3://", "").partition("/")[0]

# Key prefix of the Iceberg table inside the bucket.
SOURCE_PREFIX = "taxis/taxis_iceberg_2ad9c723-20f8-4b50-9fbb-08941abc60a2"


In [22]:
# Paginated listing helper. It yields every object under the prefix; callers
# apply their own filtering (e.g. keeping only ".parquet" keys).
def iter_s3_objects(bucket: str, prefix: str, page_size: int = 1000, client=None):
    """Yield ``(key, size)`` for every object under ``prefix`` in ``bucket``.

    Walks all result pages of ``list_objects_v2`` by following continuation
    tokens, so listings larger than one page are returned in full.

    Args:
        bucket: Name of the bucket to list.
        prefix: Key prefix the listing is restricted to.
        page_size: Value sent as ``MaxKeys`` on every request.
        client: Object exposing ``list_objects_v2(**kwargs)``. When omitted,
            the notebook-level ``s3`` client is used.

    Yields:
        ``(key, size)`` tuples; ``size`` is 0 when an entry has no ``Size``.

    Raises:
        RuntimeError: If a page is flagged as truncated but carries no
            ``NextContinuationToken`` (re-requesting would loop forever).
    """
    if client is None:
        # Looked up at call time so the helper can be defined before ``s3``.
        client = s3

    request = {"Bucket": bucket, "Prefix": prefix, "MaxKeys": page_size}
    while True:
        resp = client.list_objects_v2(**request)
        for obj in resp.get("Contents", []):
            yield obj["Key"], obj.get("Size", 0)

        if not resp.get("IsTruncated"):
            return

        token = resp.get("NextContinuationToken")
        if not token:
            # Without a token the next request would return the first page again.
            raise RuntimeError(
                f"Truncated listing for s3://{bucket}/{prefix} without NextContinuationToken"
            )
        request["ContinuationToken"] = token

# Collect the (key, size) pairs of the Parquet files found in MinIO under the
# table prefix.
keys = [
    entry
    for entry in iter_s3_objects(MINIO_BUCKET, SOURCE_PREFIX)
    if entry[0].endswith(".parquet")
]
print(f"Archivos Parquet encontrados en s3://{MINIO_BUCKET}/{SOURCE_PREFIX}: {len(keys)}")

# Preview at most the first ten entries, with sizes shown in KiB.
for name, n_bytes in keys[:10]:
    print(f"- {name} ({n_bytes/1024:.1f} KiB)")


Archivos Parquet encontrados en s3://my-bucket/taxis/taxis_iceberg_2ad9c723-20f8-4b50-9fbb-08941abc60a2: 1
- taxis/taxis_iceberg_2ad9c723-20f8-4b50-9fbb-08941abc60a2/data/00000-0-0374a941-931f-45e7-a8e6-c6546b330357.parquet (60161.3 KiB)


In [23]:
def list_minio_objects(bucket=None, prefix=None):
    """Print the key of every object stored under a prefix in MinIO.

    Uses ``iter_s3_objects`` so that listings spanning several pages are
    printed in full instead of stopping after the first response.

    Args:
        bucket: Bucket to list; defaults to ``MINIO_BUCKET``.
        prefix: Key prefix to list; defaults to ``SOURCE_PREFIX``.
    """
    bucket = MINIO_BUCKET if bucket is None else bucket
    prefix = SOURCE_PREFIX if prefix is None else prefix
    for key, _size in iter_s3_objects(bucket, prefix):
        print(f"Objeto encontrado: {key}")

list_minio_objects()


Objeto encontrado: taxis/taxis_iceberg_2ad9c723-20f8-4b50-9fbb-08941abc60a2/data/00000-0-0374a941-931f-45e7-a8e6-c6546b330357.parquet
Objeto encontrado: taxis/taxis_iceberg_2ad9c723-20f8-4b50-9fbb-08941abc60a2/metadata/00000-3c8addf3-5402-4848-8ea9-9b404e7f34a2.metadata.json
Objeto encontrado: taxis/taxis_iceberg_2ad9c723-20f8-4b50-9fbb-08941abc60a2/metadata/00000-9a4538b6-78d4-4165-b2fe-e9188b57dfbe.metadata.json
Objeto encontrado: taxis/taxis_iceberg_2ad9c723-20f8-4b50-9fbb-08941abc60a2/metadata/0374a941-931f-45e7-a8e6-c6546b330357-m0.avro
Objeto encontrado: taxis/taxis_iceberg_2ad9c723-20f8-4b50-9fbb-08941abc60a2/metadata/snap-4205361071682720890-0-0374a941-931f-45e7-a8e6-c6546b330357.avro


In [25]:
# --- Azure client ---
# Destination credentials live under the same profile as the source settings.
creds = dlt.secrets[f"{PROFILE}.destination.filesystem.credentials"]
account_name = creds["azure_storage_account_name"]
account_key  = creds["azure_storage_account_key"]

blob_service = BlobServiceClient(
    account_url=f"https://{account_name}.blob.core.windows.net",
    credential=account_key
)

# --- Azure container used for the class ---
dest_info = dlt.secrets[f"{PROFILE}.destination.filesystem"]
# Container name: the part of the bucket URL before "@", without the "abfss://" scheme.
container_name = dest_info["bucket_url"].split("@")[0].replace("abfss://", "")
container = blob_service.get_container_client(container_name)

# Folder inside the container that receives the copy. Kept in one constant so
# the upload path and the summary message cannot drift apart. Files go under
# this folder so it can later be swapped for the instructor's one.
DEST_PREFIX = "Grupo1"

# Copy the files listed in `keys` from MinIO to Azure.
# NOTE(review): `keys` only holds the ".parquet" entries, so the Iceberg
# metadata files under the table prefix are not copied - confirm this is intended.
copiados = 0
for key, _ in tqdm(keys, desc="Copiando tabla Iceberg a Azure"):
    dest_key = f"{DEST_PREFIX}/{key}"
    obj = s3.get_object(Bucket=MINIO_BUCKET, Key=key)
    try:
        # The response body is handed over as a stream to upload_blob.
        container.upload_blob(name=dest_key, data=obj["Body"], overwrite=True)
    finally:
        # Release the underlying connection even when the upload fails.
        obj["Body"].close()
    copiados += 1

print(f"Copiados!!! {copiados} archivos de MinIO a Azure en {container_name}/{DEST_PREFIX}/")


Copiando tabla Iceberg a Azure: 100%|██████████| 1/1 [00:14<00:00, 14.92s/it]

✅ Copiados 1 archivos de MinIO a Azure en clase-4-dlt/Equipo1/



