# 3. MinIO-to-ClickHouse

This notebook reads the Parquet data files of an Iceberg table that is tracked in the Nessie catalog and stored in MinIO object storage. The rows are then ingested into ClickHouse with dlt for analytical querying.

In [1]:
# Install the notebook's dependencies into the running kernel's environment.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases — consider pinning.
%pip install pandas pyarrow fsspec dlt[clickhouse] s3fs adlfs pyiceberg[s3fs,sql-sqlite] toml clickhouse-connect

Collecting clickhouse-connect
  Downloading clickhouse_connect-0.8.18-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting clickhouse-driver>=0.2.7 (from dlt[clickhouse])
  Downloading clickhouse_driver-0.2.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.1 kB)
Collecting gcsfs>=2022.4.0 (from dlt[clickhouse])
  Downloading gcsfs-2025.9.0-py2.py3-none-any.whl.metadata (2.1 kB)
Collecting lz4 (from clickhouse-connect)
  Downloading lz4-4.4.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting tzlocal (from clickhouse-driver>=0.2.7->dlt[clickhouse])
  Downloading tzlocal-5.3.1-py3-none-any.whl.metadata (7.6 kB)
Collecting google-auth>=1.2 (from gcsfs>=2022.4.0->dlt[clickhouse])
  Downloading google_auth-2.40.3-py2.py3-none-any.whl.metadata (6.2 kB)
Collecting google-auth-oauthlib (from gcsfs>=2022.4.0->dlt[clickhouse])
  Downloading google_auth_oauthlib-1.2.2-py3-none-any.whl.metadata (2.7 kB)


In [7]:
# General utilities
import os
import toml
import logging
from typing import Optional

# Data manipulation
import pandas as pd

# dlt: Reading from filesystem
import dlt
from dlt.sources.filesystem import filesystem, read_parquet

# PyArrow: Reading and Conversion
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pyarrow.fs as fs

# PyIceberg
from pyiceberg.catalog import load_catalog
from pyiceberg.table import Table
from pyiceberg.schema import Schema, NestedField
from pyiceberg.types import (
    BooleanType, IntegerType, LongType, FloatType, DoubleType,
    StringType, TimestampType, DateType
)

In [3]:
# Root logging setup: INFO and above, each line stamped with time and level.
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)

# Named logger shared by the cells of this notebook.
logger = logging.getLogger("minio_to_clickhouse")

In [35]:
# Read the object-storage credentials from the dlt secrets file and export them
# as the standard AWS_* environment variables for the rest of the notebook.
# NOTE(review): the path is absolute and tied to the Jupyter container layout.
config = toml.load("/home/jovyan/work/.dlt/secrets.toml")

# The credentials live under the filesystem destination of the "parquet_to_minio" section.
creds = config["parquet_to_minio"]["destination"]["filesystem"]["credentials"]

# Both keys are mandatory: a missing entry raises KeyError here instead of
# failing later with an opaque authentication error.
os.environ["AWS_ACCESS_KEY_ID"] = creds["aws_access_key_id"]
os.environ["AWS_SECRET_ACCESS_KEY"] = creds["aws_secret_access_key"]

# Export the endpoint only when one is configured. Writing an empty string
# would leave AWS_ENDPOINT_URL set-but-blank, which a client may treat as an
# explicitly configured (and invalid) endpoint rather than "not set".
if creds.get("endpoint_url"):
    os.environ["AWS_ENDPOINT_URL"] = creds["endpoint_url"]

In [36]:
@dlt.resource(name="parquet_files", write_disposition="append")
def parquet_files(namespace: str = "proyecto", table_name: str = "grupo2"):
    """Yield the rows of every Parquet data file planned for an Iceberg table.

    The table ``{namespace}.{table_name}`` is loaded from the "nessie" REST
    catalog, a full table scan is planned, and each planned data file is read
    with PyArrow from its ``file_path``.

    Reading is best-effort: a file that cannot be read is logged and skipped,
    and a summary warning is logged at the end if any file was skipped.

    NOTE(review): data files are read directly, so Iceberg delete files
    attached to a scan task are NOT applied; a warning is logged when a task
    carries any.

    Args:
        namespace: Iceberg namespace that contains the table.
        table_name: Name of the Iceberg table inside ``namespace``.

    Yields:
        One list of row dictionaries per successfully read, non-empty data
        file (a page of records, which dlt consumes item by item).
    """
    catalog = load_catalog(
        "nessie",
        uri="http://nessie:19120/iceberg/",
        type="rest"
    )
    # Distinct names for the Iceberg table and the per-file Arrow table, so the
    # former is never shadowed inside the loop.
    iceberg_table = catalog.load_table(f"{namespace}.{table_name}")

    failed_files = []
    for task in iceberg_table.scan().plan_files():
        file_path = task.file.file_path

        # Row-level deletes are only honoured when reading through the Iceberg
        # scan; flag them here because this loop bypasses that path.
        delete_files = getattr(task, "delete_files", None)
        if delete_files:
            logger.warning(
                f"File {file_path} has {len(delete_files)} delete file(s) that are "
                "not applied when reading the data file directly"
            )

        # Only reading/conversion is guarded; the yield stays outside the try so
        # that errors raised by the consumer are not reported as read errors.
        try:
            records = pq.read_table(file_path).to_pylist()
        except Exception as e:
            logger.error(f"Error reading file {file_path}: {e}")
            failed_files.append(file_path)
            continue

        if records:
            # Yield the whole file as one page instead of row by row.
            yield records

    if failed_files:
        logger.warning(
            f"Skipped {len(failed_files)} unreadable file(s) from "
            f"{namespace}.{table_name}: {failed_files}"
        )


# dlt pipeline that loads into the ClickHouse destination, dataset "grupo2_proyecto".
# NOTE(review): no destination credentials are passed here; presumably dlt
# resolves them from its own configuration/secrets — confirm.
pipeline = dlt.pipeline(
    destination="clickhouse",
    dataset_name="grupo2_proyecto",
    pipeline_name="iceberg_to_clickhouse",
)

In [38]:
# Bind the resource to the Iceberg table that will be loaded.
data = parquet_files(
    table_name="grupo2",
    namespace="proyecto",
)

In [40]:
# Run the load; on failure log the error and re-raise so the cell still fails visibly.
try:
    load_info = pipeline.run(data)
except Exception as exc:
    logger.error(f"Error while executing pipeline: {exc}")
    raise
else:
    # Reached only when the run completed without raising.
    logger.info(f"Pipeline sucessfully executed. Load info: {load_info}")

2025-09-09 16:02:02,605 - INFO - Pipeline sucessfully executed. Load info: Pipeline iceberg_to_clickhouse load step completed in 4.05 seconds
1 load package(s) were loaded to destination clickhouse and into dataset grupo2_proyecto
The clickhouse destination used clickhouse://admin_2:***@clickhouse:9000/default location to store data
Load package 1757433212.0314717 is LOADED and contains no failed jobs
