# 2. Parquet to Iceberg

This notebook demonstrates how to read a Parquet dataset from the `grupo-2` bucket in MinIO and write it to another bucket in the Apache Iceberg table format. The workflow uses the `dlt` and Iceberg libraries, and the data is managed as a table registered in the Nessie catalog for efficient querying and versioning. Running it requires access to MinIO and an installed Iceberg library. The notebook:

* Uses the MinIO API on port 9000, with credentials read from `.dlt/secrets.toml`.
* Reads Parquet files from a specified bucket path (e.g., `s3://grupo-2/grupo_2_parquet/df_data`).

In [2]:
%pip install pandas pyarrow fsspec dlt[filesystem] s3fs adlfs pyiceberg[s3fs,sql-sqlite] toml

Collecting pandas
  Downloading pandas-2.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting pyarrow
  Downloading pyarrow-21.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting fsspec
  Downloading fsspec-2025.9.0-py3-none-any.whl.metadata (10 kB)
Collecting s3fs
  Downloading s3fs-2025.9.0-py3-none-any.whl.metadata (1.4 kB)
Collecting adlfs
  Downloading adlfs-2025.8.0-py3-none-any.whl.metadata (7.7 kB)
Collecting toml
  Downloading toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting dlt[filesystem]
  Downloading dlt-1.16.0-py3-none-any.whl.metadata (12 kB)
Collecting pyiceberg[s3fs,sql-sqlite]
  Downloading pyiceberg-0.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-2.3.3-cp311-cp311-

In [9]:
# General utilities
import os
import toml
import logging
from typing import Optional

# Data manipulation
import pandas as pd

# dlt: Reading from filesystem
import dlt
from dlt.sources.filesystem import filesystem, read_parquet

# PyArrow: Reading and Conversion
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pyarrow.fs as fs

# PyIceberg
from pyiceberg.catalog import load_catalog
from pyiceberg.table import Table
from pyiceberg.schema import Schema, NestedField
from pyiceberg.types import (
    BooleanType, IntegerType, LongType, FloatType, DoubleType,
    StringType, TimestampType, DateType
)

In [10]:
# Route INFO-and-above records to the root handler with a timestamped layout,
# then grab the named logger that the remaining cells write to.
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)
logger = logging.getLogger("parquet_to_iceberg")

In [5]:
# Build the dlt pipeline named "sources" that writes to the filesystem destination.
try:
    pipeline = dlt.pipeline(pipeline_name="sources", destination="filesystem")
    logger.info(f"Pipeline configured successfully with name: {pipeline.pipeline_name}")
except Exception as e:
    # Log the failure and re-raise so later cells never run without a pipeline.
    logger.error(f"Error configuring pipeline: {str(e)}")
    raise


2025-09-09 03:24:38,691 - INFO - Pipeline configured successfully with name: sources


In [None]:
# List the files first, then pipe them through the Parquet reader.
# NOTE(review): filesystem() is called without arguments, so the bucket URL and
# file glob presumably come from the dlt configuration - confirm.
listed_files = filesystem()
filesystem_source = listed_files | read_parquet()

In [None]:
# Run the pipeline on the Parquet-reading source (`filesystem_source`) and
# print the load summary returned by dlt.
info = pipeline.run(filesystem_source)
print(info)

In [None]:
# Show what the normalize step of the most recent pipeline run reported.
normalize_info = pipeline.last_trace.last_normalize_info
print(normalize_info)

In [11]:
# Load the dlt secrets file that holds the S3 (MinIO) credentials.
SECRETS_PATH = "/home/jovyan/work/.dlt/secrets.toml"
config = toml.load(SECRETS_PATH)

# Extract credentials
creds = config["sources"]["credentials"]

# Export the keys as the standard AWS environment variables.
# NOTE(review): presumably these are what the S3 clients in later cells read,
# since no credentials are passed to them explicitly - confirm.
os.environ["AWS_ACCESS_KEY_ID"] = creds["aws_access_key_id"]
os.environ["AWS_SECRET_ACCESS_KEY"] = creds["aws_secret_access_key"]

# Export the endpoint only when the secrets file defines one: writing an empty
# string would publish a blank endpoint override instead of leaving it unset.
endpoint_url = creds.get("endpoint_url")
if endpoint_url:
    os.environ["AWS_ENDPOINT_URL"] = endpoint_url

In [12]:
# Open the Parquet files under the source prefix as one PyArrow dataset.
dataset = ds.dataset("s3://grupo-2/grupo_2_parquet/df_data", format="parquet")

# Convert to an Arrow Table
table = dataset.to_table()

In [13]:
# Print the Arrow schema (column names and types) of the loaded table.
arrow_schema = table.schema
print(arrow_schema)

vendor_id: int32
tpep_pickup_datetime: timestamp[us]
tpep_dropoff_datetime: timestamp[us]
passenger_count: double
trip_distance: double
ratecode_id: double
store_and_fwd_flag: string
pu_location_id: int32
do_location_id: int32
payment_type: int64
fare_amount: double
extra: double
mta_tax: double
tip_amount: double
tolls_amount: double
improvement_surcharge: double
total_amount: double
congestion_surcharge: double
airport_fee: double
cbd_congestion_fee: double


In [15]:
# Connection properties for the catalog registered under the name "nessie",
# reached through its REST endpoint.
nessie_properties = {
    "uri": "http://nessie:19120/iceberg/",
    "type": "rest",
}
catalog = load_catalog("nessie", **nessie_properties)

# List the namespaces currently known to the catalog.
namespaces = catalog.list_namespaces()
print("Namespaces:", namespaces)

Namespaces: []


In [16]:
# Create the target namespace only if it is missing, so re-running the notebook
# against a catalog that already contains 'proyecto' does not raise.
catalog.create_namespace_if_not_exists("proyecto")

In [17]:
# Query the catalog's namespaces again and print them, refreshing `namespaces`.
namespaces = catalog.list_namespaces()
print("Namespaces:", namespaces)

Namespaces: [('proyecto',)]


In [18]:
# Take the schema from the in-memory Arrow table rather than from `dataset`:
# `dataset` is rebound to the Iceberg table in a later cell, so re-running this
# cell afterwards would no longer read the PyArrow dataset's schema.
table_schema = table.schema

In [19]:
# Storage location handed to the catalog for the Iceberg table.
table_location = "s3://bucket-2/"

try:
    # Create the table only if it is not registered yet, so the cell can be
    # re-run against a catalog that already contains it.
    catalog.create_table_if_not_exists(
        "proyecto.grupo2",
        schema=table_schema,
        location=table_location,
    )
    logger.info(f"Table 'proyecto.grupo2' is available at location '{table_location}'.")
except Exception:
    # Log with traceback and re-raise: the following cells need the table to
    # exist, so continuing after a failure would only hide the root cause.
    logger.exception("Unexpected error during table creation.")
    raise

2025-09-11 01:10:37,903 - INFO - Table 'proyecto.grupo2' successfully created at location 's3://bucket-2/'.


In [20]:
try:
    # NOTE: this rebinds `dataset`, which earlier held the PyArrow dataset, to
    # the table object returned by the catalog.
    dataset = catalog.load_table("proyecto.grupo2")
    logger.info("Table 'proyecto.grupo2' loaded successfully.")
except Exception:
    # Log with traceback and re-raise: if the load fails, `dataset` still
    # refers to the previous object and the append cell would fail confusingly.
    logger.exception("Unexpected error while loading 'proyecto.grupo2'.")
    raise

2025-09-11 01:12:24,077 - INFO - Table 'proyecto.grupo2' loaded successfully.


In [21]:
try:
    # Append the Arrow table read from the Parquet source to the loaded table.
    dataset.append(table)
    logger.info("Data successfully appended to 'proyecto.grupo2'.")
except Exception:
    # Log with traceback and re-raise so a failed write is not mistaken for a
    # completed run.
    logger.exception("Unexpected error during append operation.")
    raise

2025-09-11 01:12:28,310 - INFO - Data successfully appended to 'proyecto.grupo2'.
