# 3. Minio-to-Clickhouse

In [1]:
%pip install pandas pyarrow fsspec dlt[clickhouse] s3fs adlfs pyiceberg[s3fs,sql-sqlite] toml clickhouse-connect

Collecting clickhouse-connect
  Downloading clickhouse_connect-0.8.18-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting clickhouse-driver>=0.2.7 (from dlt[clickhouse])
  Downloading clickhouse_driver-0.2.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.1 kB)
Collecting gcsfs>=2022.4.0 (from dlt[clickhouse])
  Downloading gcsfs-2025.9.0-py2.py3-none-any.whl.metadata (2.1 kB)
Collecting lz4 (from clickhouse-connect)
  Downloading lz4-4.4.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting tzlocal (from clickhouse-driver>=0.2.7->dlt[clickhouse])
  Downloading tzlocal-5.3.1-py3-none-any.whl.metadata (7.6 kB)
Collecting google-auth>=1.2 (from gcsfs>=2022.4.0->dlt[clickhouse])
  Downloading google_auth-2.40.3-py2.py3-none-any.whl.metadata (6.2 kB)
Collecting google-auth-oauthlib (from gcsfs>=2022.4.0->dlt[clickhouse])
  Downloading google_auth_oauthlib-1.2.2-py3-none-any.whl.metadata (2.7 kB)


In [2]:
# General utilities
import os
import toml
import logging
from typing import Optional

# Data manipulation
import pandas as pd

# dlt: Reading from filesystem
import dlt
from dlt.sources.filesystem import filesystem, read_parquet

# PyArrow: Reading and Convertion
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pyarrow.fs as fs

# PyIceberg
from pyiceberg.catalog import load_catalog
from pyiceberg.table import Table
from pyiceberg.schema import Schema, NestedField
from pyiceberg.types import (
    BooleanType, IntegerType, LongType, FloatType, DoubleType,
    StringType, TimestampType, DateType
)

import pyiceberg

In [3]:
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger("minio_to_clickhouse")

In [4]:
config = toml.load("/home/jovyan/work/.dlt/secrets.toml")

creds = config["parquet_to_minio"]["destination"]["filesystem"]["credentials"]

os.environ["AWS_ACCESS_KEY_ID"] = creds["aws_access_key_id"]
os.environ["AWS_SECRET_ACCESS_KEY"] = creds["aws_secret_access_key"]
os.environ["AWS_ENDPOINT_URL"] = creds.get("endpoint_url", "")  

In [5]:
@dlt.resource(name="parquet_files", write_disposition="append")
def parquet_files(namespace: str = "proyecto", table_name: str = "grupo2"):
    catalog = pyiceberg.catalog.load_catalog(
        "nessie",
        uri="http://nessie:19120/iceberg/",
        type="rest"
    )
    table = catalog.load_table(f"{namespace}.{table_name}")
    scan = table.scan()
    file_tasks = scan.plan_files()

    for task in file_tasks:
        file_path = task.file.file_path
        try:
            table = pq.read_table(file_path)
            for record in table.to_pylist():
                yield record
        except Exception as e:
            logger.error(f"Error reading file {file_path}: {e}")


pipeline = dlt.pipeline(
    pipeline_name="iceberg_to_clickhouse",
    destination="clickhouse",
    dataset_name="grupo2_proyecto"
)

In [6]:
data = parquet_files(namespace="proyecto", table_name="grupo2")
pipeline.run(data)

LoadInfo(pipeline=<dlt.pipeline(pipeline_name='iceberg_to_clickhouse', destination='clickhouse', dataset_name='grupo2_proyecto', default_schema_name='iceberg_to_clickhouse', schema_names=['iceberg_to_clickhouse'], pipelines_dir='/home/jovyan/.dlt/pipelines', working_dir='/home/jovyan/.dlt/pipelines/iceberg_to_clickhouse')>, metrics={'1757430093.6676202': [{'started_at': DateTime(2025, 9, 9, 15, 9, 35, 908610, tzinfo=Timezone('UTC')), 'finished_at': DateTime(2025, 9, 9, 15, 9, 42, 540992, tzinfo=Timezone('UTC')), 'job_metrics': {'_dlt_pipeline_state.d00161fda4.jsonl.gz': LoadJobMetrics(job_id='_dlt_pipeline_state.d00161fda4.jsonl.gz', file_path='/home/jovyan/.dlt/pipelines/iceberg_to_clickhouse/load/normalized/1757430093.6676202/started_jobs/_dlt_pipeline_state.d00161fda4.0.jsonl.gz', table_name='_dlt_pipeline_state', started_at=DateTime(2025, 9, 9, 15, 9, 36, 57610, tzinfo=Timezone('UTC')), finished_at=DateTime(2025, 9, 9, 15, 9, 36, 106939, tzinfo=Timezone('UTC')), state='completed', 

In [8]:
pipeline.run([{"hello": "world"}], table_name="parquet_files")

LoadInfo(pipeline=<dlt.pipeline(pipeline_name='iceberg_to_clickhouse', destination='clickhouse', dataset_name='grupo2_proyecto', default_schema_name='iceberg_to_clickhouse', schema_names=['iceberg_to_clickhouse'], pipelines_dir='/home/jovyan/.dlt/pipelines', working_dir='/home/jovyan/.dlt/pipelines/iceberg_to_clickhouse')>, metrics={'1757432001.9275835': [{'started_at': DateTime(2025, 9, 9, 15, 33, 22, 9151, tzinfo=Timezone('UTC')), 'finished_at': DateTime(2025, 9, 9, 15, 33, 22, 90279, tzinfo=Timezone('UTC')), 'job_metrics': {'parquet_files.d219e1d045.jsonl.gz': LoadJobMetrics(job_id='parquet_files.d219e1d045.jsonl.gz', file_path='/home/jovyan/.dlt/pipelines/iceberg_to_clickhouse/load/normalized/1757432001.9275835/started_jobs/parquet_files.d219e1d045.0.jsonl.gz', table_name='parquet_files', started_at=DateTime(2025, 9, 9, 15, 33, 22, 57449, tzinfo=Timezone('UTC')), finished_at=DateTime(2025, 9, 9, 15, 33, 22, 82126, tzinfo=Timezone('UTC')), state='completed', remote_url=None)}}]}, de