# 4. MinIO-to-ClickHouse

This notebook reads the Parquet data files of an Iceberg table that is tracked in the Nessie catalog and stored in MinIO object storage. The records are then ingested into ClickHouse for analytical querying.

In [2]:
%pip install pandas pyarrow fsspec dlt[clickhouse] s3fs adlfs pyiceberg[s3fs,sql-sqlite] toml

Note: you may need to restart the kernel to use updated packages.


In [3]:
# General utilities
import os
import toml
import logging

# Data manipulation
import pandas as pd

# dlt: Reading from filesystem
import dlt
from dlt.sources.filesystem import filesystem, read_parquet

# PyArrow: Reading and Conversion
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pyarrow.fs as fs
from pyiceberg.catalog import load_catalog

In [4]:
# Configure the root logger once: INFO and above, with every record prefixed
# by its timestamp and severity.
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)

# Named logger used for this notebook's own messages.
logger = logging.getLogger("minio_to_clickhouse")

In [6]:
# Load the dlt secrets file that holds the object-storage credentials
config = toml.load("/home/jovyan/work/.dlt/secrets.toml")

# Extract credentials (a missing section or key raises KeyError right here,
# instead of surfacing later as an opaque authentication failure)
creds = config["sources"]["credentials"]

# Export the credentials as environment variables for the current process
os.environ["AWS_ACCESS_KEY_ID"] = creds["aws_access_key_id"]
os.environ["AWS_SECRET_ACCESS_KEY"] = creds["aws_secret_access_key"]

# Only override the endpoint when one is actually configured. Exporting an
# empty string would clobber any endpoint already present in the environment
# and hand the S3 clients an empty override rather than "not set".
if creds.get("endpoint_url"):
    os.environ["AWS_ENDPOINT_URL"] = creds["endpoint_url"]

In [7]:
@dlt.resource(name="parquet_files", write_disposition="append")
def parquet_files(
    namespace: str = "proyecto",
    table_name: str = "grupo2",
    catalog_uri: str = "http://nessie:19120/iceberg/",
):
    """Yield the rows stored in the Parquet data files of an Iceberg table.

    The table ``{namespace}.{table_name}`` is loaded from the REST catalog at
    ``catalog_uri``, a full table scan is planned, and each planned data file
    is read with PyArrow and emitted one record (dict) at a time.

    Args:
        namespace: Namespace of the Iceberg table in the catalog.
        table_name: Name of the Iceberg table inside ``namespace``.
        catalog_uri: URI of the Iceberg REST catalog endpoint. Defaults to the
            address that was previously hard-coded.

    Yields:
        dict: One record per row, as produced by ``to_pylist()``.

    Notes:
        * Reading is best-effort: a file that cannot be read is logged and
          skipped, and a summary warning is logged once the scan is finished
          so that a partial load is not mistaken for a complete one.
        * Data files are read directly, so Iceberg delete files are NOT
          applied; a warning is logged for every scan task that carries them.
    """
    catalog = load_catalog("nessie", uri=catalog_uri, type="rest")
    # Distinct names for the Iceberg table and the per-file Arrow table; the
    # original rebound the same ``table`` variable inside the loop.
    iceberg_table = catalog.load_table(f"{namespace}.{table_name}")

    failed_files = []
    for task in iceberg_table.scan().plan_files():
        file_path = task.file.file_path

        # NOTE(review): assumes scan tasks expose ``delete_files`` as in
        # PyIceberg's FileScanTask; getattr keeps this safe if they do not.
        if getattr(task, "delete_files", None):
            logger.warning(
                "File %s has associated Iceberg delete files that are not applied; "
                "deleted rows may be loaded",
                file_path,
            )

        try:
            arrow_table = pq.read_table(file_path)
            # Convert batch by batch so only one batch of Python dicts is
            # alive at a time, instead of the whole file.
            for batch in arrow_table.to_batches():
                yield from batch.to_pylist()
        except Exception as e:
            failed_files.append(file_path)
            logger.error("Error reading file %s: %s", file_path, e)

    if failed_files:
        logger.warning(
            "%d data file(s) could not be read and were skipped: %s",
            len(failed_files),
            failed_files,
        )


# Pipeline "iceberg_to_clickhouse": targets the ClickHouse destination and
# writes into the "grupo2_proyecto" dataset.
pipeline = dlt.pipeline(
    dataset_name="grupo2_proyecto",
    destination="clickhouse",
    pipeline_name="iceberg_to_clickhouse",
)

In [8]:
# Instantiate the resource for the Iceberg table "proyecto.grupo2".
data = parquet_files(
    table_name="grupo2",
    namespace="proyecto",
)

In [9]:
# Run the load; on failure, log the error and re-raise so the cell still fails.
try:
    load_info = pipeline.run(data)
    logger.info(
        f"Pipeline sucessfully executed. Load info: {load_info}"
    )
except Exception as run_error:
    logger.error(
        f"Error while executing pipeline: {run_error}"
    )
    raise

2025-09-12 16:03:36,537 - INFO - Pipeline sucessfully executed. Load info: Pipeline iceberg_to_clickhouse load step completed in 5.18 seconds
1 load package(s) were loaded to destination clickhouse and into dataset grupo2_proyecto
The clickhouse destination used clickhouse://admin_2:***@clickhouse:9000/default location to store data
Load package 1757692075.2661552 is LOADED and contains no failed jobs
