# 3. MinIO-to-ClickHouse

This notebook reads the Parquet data files of an Iceberg table that is tracked via the Nessie catalog and stored in MinIO object storage. The rows are then ingested into ClickHouse with dlt for analytical querying.

In [1]:
# Install the libraries this notebook imports into the running kernel's environment.
%pip install pandas pyarrow fsspec dlt[clickhouse] s3fs adlfs pyiceberg[s3fs,sql-sqlite] toml clickhouse-connect

Note: you may need to restart the kernel to use updated packages.


In [2]:
# General utilities
import os
import toml
import logging
from typing import Optional

# Data manipulation
import pandas as pd

# dlt: Reading from filesystem
import dlt
from dlt.sources.filesystem import filesystem, read_parquet

# PyArrow: Reading and Conversion
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pyarrow.fs as fs

# PyIceberg
from pyiceberg.catalog import load_catalog
from pyiceberg.table import Table
from pyiceberg.schema import Schema, NestedField
from pyiceberg.types import (
    BooleanType, IntegerType, LongType, FloatType, DoubleType,
    StringType, TimestampType, DateType
)

In [3]:
# Notebook-wide logger; the root logger is configured to emit INFO and above
# with a "timestamp - level - message" layout.
logger = logging.getLogger("minio_to_clickhouse")
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

In [4]:
@dlt.resource(name="parquet_files", write_disposition="append")
def parquet_files(namespace: str = "proyecto", table_name: str = "grupo2"):
    """Yield every row of an Iceberg table, one dict per record.

    The table is looked up through the "nessie" REST catalog, its scan is
    planned into per-file tasks, and each data file is read with PyArrow and
    emitted row by row.

    Args:
        namespace: Iceberg namespace that contains the table.
        table_name: Name of the table inside ``namespace``.

    Yields:
        dict: One record per row, as produced by ``to_pylist()``.

    Note:
        A file that cannot be read is logged and skipped rather than aborting
        the run, so a partial load is possible; check the log after running.
    """
    # Call the imported ``load_catalog`` directly: the notebook only imports
    # that function, so the bare name ``pyiceberg`` is not bound here.
    catalog = load_catalog(
        "nessie",
        uri="http://nessie:19120/iceberg/",
        type="rest",
    )
    # Kept under its own name so the per-file Arrow table below does not
    # overwrite the Iceberg table handle.
    iceberg_table = catalog.load_table(f"{namespace}.{table_name}")

    for task in iceberg_table.scan().plan_files():
        file_path = task.file.file_path
        try:
            # NOTE(review): assumes PyArrow can open this path with its own
            # filesystem settings (endpoint/credentials) — confirm for MinIO.
            arrow_table = pq.read_table(file_path)
        except Exception as e:
            # Best effort: report the unreadable file and move on to the next.
            logger.error(f"Error reading file {file_path}: {e}")
            continue

        # Yield outside the try block so errors raised by the consumer are not
        # misreported as read failures; convert batch by batch instead of
        # materialising the whole file as Python objects at once.
        for batch in arrow_table.to_batches():
            yield from batch.to_pylist()


# dlt pipeline targeting the ClickHouse destination; loaded tables are placed
# in the "grupo2_proyecto" dataset.
pipeline = dlt.pipeline(
    destination="clickhouse",
    dataset_name="grupo2_proyecto",
    pipeline_name="iceberg_to_clickhouse",
)

In [None]:
# Load the Iceberg table "proyecto.grupo2" into ClickHouse. The namespace has
# to match the catalog's namespace name character for character.
data = parquet_files(namespace="proyecto", table_name="grupo2")
pipeline.run(data)

In [5]:
# Connectivity smoke test: load one throwaway row to check that the ClickHouse
# destination is reachable. A plain list carries no name of its own, so the
# target table is named explicitly.
# NOTE(review): the recorded output shows "Connection refused (clickhouse:9440)"
# — verify the ClickHouse host/port and credentials in the dlt configuration.
pipeline.run([{"hello": "world"}], table_name="smoke_test")


Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/clickhouse_driver/connection.py", line 399, in connect
    return self._init_connection(host, port)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/clickhouse_driver/connection.py", line 329, in _init_connection
    self.socket = self._create_socket(host, port)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/clickhouse_driver/connection.py", line 301, in _create_socket
    raise err
  File "/opt/conda/lib/python3.11/site-packages/clickhouse_driver/connection.py", line 292, in _create_socket
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused


PipelineStepFailed: Pipeline execution failed at `step=sync` with exception:

<class 'dlt.destinations.exceptions.DatabaseTransientException'>
Code: 210. Connection refused (clickhouse:9440)