# 3. Minio-to-Azure

This notebook outlines the process of moving a Parquet file, managed as an Iceberg table using the Nessie catalog, from the `grupo-2` bucket in MinIO to a group-specific folder in Azure storage. The workflow employs `dlt` and Iceberg libraries to facilitate the transfer, ensuring data integrity and compatibility with Azure’s storage system. This requires MinIO access, Azure credentials, and the Iceberg library installation.

In [9]:
%pip install pandas pyarrow fsspec s3fs adlfs dlt[filesystem] toml

Note: you may need to restart the kernel to use updated packages.


In [1]:
# General utilities
import os
import toml
import logging


# dlt: Reading from filesystem
import dlt
from dlt.sources.filesystem import filesystem

# PyArrow: Reading and Convertion
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pyarrow.fs as fs

#PyIceberg: Table format
from pyiceberg.catalog import load_catalog

# Azure Blolb Storage Verification
from adlfs import AzureBlobFileSystem

In [2]:
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger("minio_to_azure")

In [3]:
# Load config file
config = toml.load("/home/jovyan/work/.dlt/secrets.toml")

# Extract MinIO credentials
creds = config["sources"]["credentials"]

# Export env var
os.environ["AWS_ACCESS_KEY_ID"] = creds["aws_access_key_id"]
os.environ["AWS_SECRET_ACCESS_KEY"] = creds["aws_secret_access_key"]
os.environ["AWS_ENDPOINT_URL"] = creds.get("endpoint_url", "") 

In [4]:
pipeline_config = {
    "pipeline_name": "s3_to_adls",
    "destination": "filesystem",
    "dataset_name": "grupo_2_parquet",
}

try:
    pipeline = dlt.pipeline(**pipeline_config)
    logger.info("Pipeline configured successfully.")
except Exception as e:
    logger.error(f"Error configuring the pipeline: {e}")
    raise

2025-09-11 02:36:28,608 - INFO - Pipeline configured successfully.


In [5]:
@dlt.resource(name="parquet_files", write_disposition="append")
def parquet_files(namespace: str = "proyecto", table_name: str = "grupo2"):
    catalog = load_catalog(
        "nessie",
        uri="http://nessie:19120/iceberg/",
        type="rest"
    )
    table = catalog.load_table(f"{namespace}.{table_name}")
    scan = table.scan()
    file_tasks = scan.plan_files()

    for task in file_tasks:
        file_path = task.file.file_path
        try:
            table = pq.read_table(file_path)
            for record in table.to_pylist():
                yield record
        except Exception as e:
            logger.error(f"Error reading file {file_path}: {e}")

In [6]:
data = parquet_files(namespace="proyecto", table_name="grupo2")

In [7]:
try:
    load_info = pipeline.run(data)
    logger.info(f"Pipeline sucessfully executed. Load info: {load_info}")
except Exception as e:
    logger.error(f"Error while executing pipeline: {e}")
    raise

2025-09-11 02:36:36,152 - INFO - Request URL: 'https://fhbd.blob.core.windows.net/clase-4-dlt/GRUPO_2/grupo_2_parquet/_dlt_pipeline_state'
Request method: 'HEAD'
Request headers:
    'x-ms-version': 'REDACTED'
    'Accept': 'application/xml'
    'User-Agent': 'adlfs/2025.8.0 azsdk-python-storage-blob/12.26.0 Python/3.11.6 (Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.35)'
    'x-ms-date': 'REDACTED'
    'x-ms-client-request-id': '230ad870-8eb8-11f0-97ee-527ab615ffb2'
    'Authorization': 'REDACTED'
No body was attached to the request
2025-09-11 02:36:37,652 - INFO - Response status: 404
Response headers:
    'Transfer-Encoding': 'chunked'
    'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'
    'x-ms-request-id': '8b51f156-201e-005e-76c4-22d2da000000'
    'x-ms-client-request-id': '230ad870-8eb8-11f0-97ee-527ab615ffb2'
    'x-ms-version': 'REDACTED'
    'x-ms-error-code': 'BlobNotFound'
    'Date': 'Thu, 11 Sep 2025 02:36:37 GMT'
2025-09-11 02:36:37,656 - INFO - Re

In [8]:
# Extract Azure credentials 
creds_2 = config["s3_to_adls"]["destination"]["filesystem"]["credentials"]

# Export env var
account_name = os.environ["AZURE_STORAGE_ACCOUNT_NAME"] = creds_2["azure_storage_account_name"]
account_key = os.environ["AZURE_STORAGE_ACCOUNT_KEY"] = creds_2["azure_storage_account_key"]

In [10]:
# Initialize the Azure file system
fs = AzureBlobFileSystem(account_name=account_name, account_key=account_key)

# Logical path within the container
path = "clase-4-dlt/GRUPO_2/grupo_2_parquet"

# List files
files = fs.ls(path)

# Show results
for f in files:
    print(f)

2025-09-11 02:53:58,448 - INFO - Request URL: 'https://fhbd.blob.core.windows.net/clase-4-dlt?restype=REDACTED&comp=REDACTED&prefix=REDACTED&delimiter=REDACTED&include=REDACTED'
Request method: 'GET'
Request headers:
    'x-ms-version': 'REDACTED'
    'Accept': 'application/xml'
    'User-Agent': 'adlfs/2025.8.0 azsdk-python-storage-blob/12.26.0 Python/3.11.6 (Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.35)'
    'x-ms-date': 'REDACTED'
    'x-ms-client-request-id': '904c7982-8eba-11f0-97ee-527ab615ffb2'
    'Authorization': 'REDACTED'
No body was attached to the request
2025-09-11 02:53:58,827 - INFO - Response status: 200
Response headers:
    'Transfer-Encoding': 'chunked'
    'Content-Type': 'application/xml'
    'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'
    'x-ms-request-id': '7722b260-e01e-0041-1cc7-2261de000000'
    'x-ms-client-request-id': '904c7982-8eba-11f0-97ee-527ab615ffb2'
    'x-ms-version': 'REDACTED'
    'Date': 'Thu, 11 Sep 2025 02:53:58 GM

clase-4-dlt/GRUPO_2/grupo_2_parquet/_dlt_loads
clase-4-dlt/GRUPO_2/grupo_2_parquet/_dlt_pipeline_state
clase-4-dlt/GRUPO_2/grupo_2_parquet/_dlt_version
clase-4-dlt/GRUPO_2/grupo_2_parquet/init
clase-4-dlt/GRUPO_2/grupo_2_parquet/parquet_files
