# 3. MinIO-to-ClickHouse

In [1]:
%pip install pandas pyarrow fsspec dlt[clickhouse] s3fs adlfs pyiceberg[s3fs,sql-sqlite] toml clickhouse-connect

Note: you may need to restart the kernel to use updated packages.


In [2]:
# General utilities
import os
import toml
import logging
from typing import Optional

# Data manipulation
import pandas as pd

# dlt: Reading from filesystem
import dlt
from dlt.sources.filesystem import filesystem, read_parquet

# PyArrow: Reading and Conversion
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pyarrow.fs as fs

# PyIceberg
from pyiceberg.catalog import load_catalog
from pyiceberg.table import Table
from pyiceberg.schema import Schema, NestedField
from pyiceberg.types import (
    BooleanType, IntegerType, LongType, FloatType, DoubleType,
    StringType, TimestampType, DateType
)

import pyiceberg

In [3]:
# Root logging setup: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)

# Notebook-wide logger used by the cells below.
logger = logging.getLogger("minio_to_clickhouse")

In [4]:
# Path of the dlt secrets file that holds the object-store credentials.
SECRETS_PATH = "/home/jovyan/work/.dlt/secrets.toml"

config = toml.load(SECRETS_PATH)

# The credentials live under the `parquet_to_minio` section of the secrets file.
creds = config["parquet_to_minio"]["destination"]["filesystem"]["credentials"]

# Export the key pair through the standard AWS environment variables.
os.environ["AWS_ACCESS_KEY_ID"] = creds["aws_access_key_id"]
os.environ["AWS_SECRET_ACCESS_KEY"] = creds["aws_secret_access_key"]

# Export the endpoint override only when one is actually configured: writing an
# empty string would clobber a value already present in the environment and
# hand clients an empty endpoint instead of leaving the variable unset.
endpoint_url = creds.get("endpoint_url")
if endpoint_url:
    os.environ["AWS_ENDPOINT_URL"] = endpoint_url

In [5]:
@dlt.resource(name="parquet_files", write_disposition="append")
def parquet_files(namespace: str = "proyecto", table_name: str = "grupo2"):
    """Yield the rows of an Iceberg table, one dict per row.

    The table is looked up in the "nessie" REST catalog, its scan is planned to
    obtain the data files, and each data file is read with PyArrow.

    Args:
        namespace: Iceberg namespace that holds the table.
        table_name: Name of the table inside ``namespace``.

    Yields:
        dict: One record per row, keyed by column name.

    Notes:
        Reading is best-effort. A data file that cannot be read is logged with
        its traceback and skipped, and a summary warning is emitted once all
        files were visited so that a partial load does not go unnoticed.
        Only the file read is guarded; errors raised while converting rows
        propagate to the caller.
    """
    catalog = pyiceberg.catalog.load_catalog(
        "nessie",
        uri="http://nessie:19120/iceberg/",
        type="rest",
    )
    # Kept under its own name so the Arrow table read below does not shadow it.
    iceberg_table = catalog.load_table(f"{namespace}.{table_name}")

    # NOTE(review): the data files are read directly with PyArrow rather than
    # through the Iceberg scan, so row-level deletes (if the table has any)
    # would presumably not be applied -- confirm this is acceptable.
    failed_files = []
    for task in iceberg_table.scan().plan_files():
        file_path = task.file.file_path
        try:
            arrow_table = pq.read_table(file_path)
        except Exception as e:
            # logger.exception keeps the traceback alongside the message.
            logger.exception(f"Error reading file {file_path}: {e}")
            failed_files.append(file_path)
            continue

        # Convert batch by batch so only one batch of Python dicts is alive at
        # a time instead of the whole file.
        for batch in arrow_table.to_batches():
            yield from batch.to_pylist()

    if failed_files:
        logger.warning(
            "Skipped %d unreadable data file(s): %s",
            len(failed_files),
            failed_files,
        )


# dlt pipeline that loads into the ClickHouse destination under the
# `grupo2_proyecto` dataset.
# NOTE(review): the recorded run of this notebook failed at the load step with
# "connection refused" for https://clickhouse:8443 -- the ClickHouse
# host/port/TLS settings resolved by dlt likely need checking.
pipeline = dlt.pipeline(
    dataset_name="grupo2_proyecto",
    destination="clickhouse",
    pipeline_name="iceberg_to_clickhouse",
)

In [6]:
# Instantiate the resource for the `proyecto.grupo2` table and run the pipeline on it.
data = parquet_files(table_name="grupo2", namespace="proyecto")
pipeline.run(data=data)

2025-09-09 08:28:16,807|[ERROR]|149|139651750794816|dlt|load.py|w_run_job:248|Terminal exception in job parquet_files.fbd6b40306.jsonl.gz in file /home/jovyan/.dlt/pipelines/iceberg_to_clickhouse/load/normalized/1757406007.7663105/started_jobs/parquet_files.fbd6b40306.0.jsonl.gz
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/urllib3/connection.py", line 203, in _new_conn
    sock = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/urllib3/util/connection.py", line 85, in create_connection
    raise err
  File "/opt/conda/lib/python3.11/site-packages/urllib3/util/connection.py", line 73, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/urllib3/connectionpool.py", line 791, in urlopen
 

PipelineStepFailed: Pipeline execution failed at `step=load` with exception:

<class 'dlt.load.exceptions.LoadClientJobFailed'>
Job with `job_id=parquet_files.fbd6b40306.jsonl.gz` and `load_id=1757406007.7663105` failed terminally with message: Job with `file_path=/home/jovyan/.dlt/pipelines/iceberg_to_clickhouse/load/normalized/1757406007.7663105/started_jobs/parquet_files.fbd6b40306.0.jsonl.gz` encountered unrecoverable problem: ClickHouse connection failed due to `Error HTTPSConnectionPool(host='clickhouse', port=8443): Max retries exceeded with url: /? (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f036ebac250>: Failed to establish a new connection: [Errno 111] Connection refused')) executing HTTP request attempt 1 (https://clickhouse:8443)`.. The package is aborted and cannot be retried.

In [None]:
# Connectivity smoke test: load a single literal row.
# A bare list carries no resource name, so the target table is named explicitly.
pipeline.run([{"hello": "world"}], table_name="hello_world")