# Experimenting with PyIceberg

This notebook collects experimental snippets that exercise PyIceberg's capabilities. It will include:

- Read operations
- Write operations (MERGE, write Hive partitioned, partition overwrite etc.)

For demo purposes, we'll follow the example provided in the documentation [here](https://py.iceberg.apache.org/api/), so we'll create a local catalog, some tables and perform operations.

Every time the notebook is executed, we'll recreate the warehouse folder so that we don't get errors in the `load_catalog` function (CREATE IF NOT EXISTS does not seem to be supported).

## Connect to Nessie Catalog

In [1]:
from pyiceberg.catalog import load_catalog
from poor_man_lakehouse.config import settings

# REST-specific settings layered on top of the shared storage options.
# With dict-merge semantics, the right-hand operand of `|` wins on a key clash.
rest_overrides = {
    "type": "rest",
    "uri": settings.NESSIE_PYICEBERG_SERVER_URI,
}
catalog_config = settings.ICEBERG_STORAGE_OPTIONS | rest_overrides

# Load the catalog registered under the name "nessie" with the merged configuration.
catalog = load_catalog("nessie", **catalog_config)


In [2]:
# Create the namespace only when it is missing (safe to re-run), then fetch the
# namespace list once and print it *after* the creation step, so the output
# reflects the catalog's actual state instead of the state before creation.
catalog.create_namespace_if_not_exists("default")

ns = catalog.list_namespaces()
print(ns)

[('default',)]


In [3]:
# Show the tables currently registered in the "default" namespace.
catalog.list_tables("default")

[('default', 'people'), ('default', 'prova')]

## Create a table

In [4]:
from pyiceberg.schema import Schema
import pyarrow as pa
import datetime
import polars as pl
from pyiceberg.types import (
    TimestampType,
    FloatType,
    DoubleType,
    StringType,
    NestedField,
    StructType,
)
from pyiceberg.partitioning import PartitionSpec, PartitionField
from pyiceberg.transforms import DayTransform

from pyiceberg.table.sorting import SortOrder, SortField
from pyiceberg.transforms import IdentityTransform


# Iceberg schema of the demo table: a required timestamp/symbol pair, two
# optional quote columns (`bid` is 32-bit float, `ask` is 64-bit double) and an
# optional nested struct.
schema = Schema(
    NestedField(field_id=1, name="datetime", field_type=TimestampType(), required=True),
    NestedField(field_id=2, name="symbol", field_type=StringType(), required=True),
    NestedField(field_id=3, name="bid", field_type=FloatType(), required=False),
    NestedField(field_id=4, name="ask", field_type=DoubleType(), required=False),
    NestedField(
        field_id=5,
        name="details",
        field_type=StructType(
            # Field ids must be unique across the whole schema, nested fields
            # included. This field previously reused id 4, which belongs to `ask`.
            NestedField(
                field_id=6, name="created_by", field_type=StringType(), required=False
            ),
        ),
        required=False,
    ),
)


# Partition by calendar day derived from the `datetime` column (source_id=1).
partition_spec = PartitionSpec(
    PartitionField(
        source_id=1, field_id=1000, transform=DayTransform(), name="datetime_day"
    )
)


# Sort on the symbol column (source_id=2), keeping its values unchanged.
sort_order = SortOrder(SortField(source_id=2, transform=IdentityTransform()))

# Only create the table when it does not exist yet, so the cell can be re-run.
# NOTE(review): because an existing table is kept as-is, re-running the write
# cells below adds their rows again and the displayed outputs will differ from
# a first run. Drop the table manually for a clean slate.
if not catalog.table_exists("default.bids"):
    catalog.create_table(
        identifier="default.bids",
        schema=schema,
        location="s3://warehouse/bids",
        partition_spec=partition_spec,
        sort_order=sort_order,
    )


## Load a table

In [5]:
# Fetch a handle to the table from the catalog; the cells below read and write through it.
table = catalog.load_table("default.bids")

## Check if table exists

In [6]:
# Evaluates to True when `default.bids` is present in the catalog.
catalog.table_exists("default.bids")

True

## Convert to Polars DataFrame

In [7]:
# Run an unfiltered scan of the table and load the result as a Polars DataFrame.
table_df = table.scan().to_polars()
table_df

datetime,symbol,bid,ask,details
datetime[μs],str,f32,f64,struct[1]


## Append some data

Note that it is necessary to cast the PyArrow table to the schema of the Iceberg table, not the one inferred by Polars.

This is because Polars does not track non-nullable (required) columns, so Iceberg raises an error due to the schema mismatch.

In [8]:
def cast_to_pyarrow(df: pl.DataFrame, schema: pa.Schema) -> pa.Table:
    """Convert a Polars DataFrame to a PyArrow table cast to ``schema``.

    Args:
        df: Polars frame to convert.
        schema: Target Arrow schema the converted table is cast to.

    Returns:
        The Arrow table produced by ``df.to_arrow()`` after casting it to ``schema``.
    """
    return df.to_arrow().cast(schema)


# Three sample quotes, one per day at noon; the last row has a null `created_by`.
sample_timestamps = [datetime.datetime(2023, 1, day, 12, 0) for day in (1, 2, 3)]
sample_creators = ["user1", "user2", None]

data = pl.DataFrame(
    {
        "datetime": sample_timestamps,
        "symbol": ["AAPL", "GOOGL", "MSFT"],
        "bid": [150.0, 2800.0, 300.0],
        "ask": [151.0, 2805.0, 305.0],
        "details": [{"created_by": creator} for creator in sample_creators],
    },
)


In [9]:
# Cast the Polars frame to the table's own Arrow schema first, then append it.
arrow_data = cast_to_pyarrow(data, table.schema().as_arrow())
table.append(df=arrow_data)

In [10]:
# Read the table back to verify the append.
table.scan().to_polars()

datetime,symbol,bid,ask,details
datetime[μs],str,f32,f64,struct[1]
2023-01-01 12:00:00,"""AAPL""",150.0,151.0,"{""user1""}"
2023-01-02 12:00:00,"""GOOGL""",2800.0,2805.0,"{""user2""}"
2023-01-03 12:00:00,"""MSFT""",300.0,305.0,{null}


## Test upsert

In [11]:
# Two rows for the upsert: the first reuses a `datetime` already stored in the
# table (2023-01-01) with a different symbol, the second has a new `datetime`.
upsert_rows = [
    {
        "datetime": datetime.datetime(2023, 1, 1, 12, 0),
        "symbol": "AAPL2",
        "bid": 150.0,
        "ask": 151.0,
        "details": {"created_by": "user1"},
    },
    {
        "datetime": datetime.datetime(2023, 1, 4, 12, 0),
        "symbol": "AMZ",
        "bid": 2800.0,
        "ask": 2805.0,
        "details": {"created_by": "user1"},
    },
]

# Build the Arrow table directly with the Iceberg table's schema.
df = pa.Table.from_pylist(upsert_rows, schema=table.schema().as_arrow())
pl.from_arrow(df)

datetime,symbol,bid,ask,details
datetime[μs],str,f32,f64,struct[1]
2023-01-01 12:00:00,"""AAPL2""",150.0,151.0,"{""user1""}"
2023-01-04 12:00:00,"""AMZ""",2800.0,2805.0,"{""user1""}"


In [12]:
# Rows are matched on `datetime`: a matching key updates the stored row, a new key is inserted.
table.upsert(df, join_cols=["datetime"])

UpsertResult(rows_updated=1, rows_inserted=1)

In [13]:
# Sort by `datetime` for display so the updated and the inserted row are easy to spot.
table.scan().to_polars().sort("datetime")

datetime,symbol,bid,ask,details
datetime[μs],str,f32,f64,struct[1]
2023-01-01 12:00:00,"""AAPL2""",150.0,151.0,"{""user1""}"
2023-01-02 12:00:00,"""GOOGL""",2800.0,2805.0,"{""user2""}"
2023-01-03 12:00:00,"""MSFT""",300.0,305.0,{null}
2023-01-04 12:00:00,"""AMZ""",2800.0,2805.0,"{""user1""}"


## Check partition overwrite

Here we'll check whether a partial overwrite works. We'll use the same dataframe as before for the upsert, but we'll run an `overwrite` operation restricted by a filter to `datetime=2023-01-01T12:00:00`.

In [14]:
from pyiceberg.expressions import EqualTo

# Reuse `df` from the upsert section (the same two rows, 2023-01-01 and
# 2023-01-04) instead of rebuilding an identical Arrow table here.
print(pl.from_arrow(df))

# Rows matching the filter are removed, then every row of `df` is written.
# The 2023-01-04 row is not covered by the filter, so it ends up duplicated.
table.overwrite(df, overwrite_filter=EqualTo("datetime", "2023-01-01T12:00:00"))

shape: (2, 5)
┌─────────────────────┬────────┬────────┬────────┬───────────┐
│ datetime            ┆ symbol ┆ bid    ┆ ask    ┆ details   │
│ ---                 ┆ ---    ┆ ---    ┆ ---    ┆ ---       │
│ datetime[μs]        ┆ str    ┆ f32    ┆ f64    ┆ struct[1] │
╞═════════════════════╪════════╪════════╪════════╪═══════════╡
│ 2023-01-01 12:00:00 ┆ AAPL2  ┆ 150.0  ┆ 151.0  ┆ {"user1"} │
│ 2023-01-04 12:00:00 ┆ AMZ    ┆ 2800.0 ┆ 2805.0 ┆ {"user1"} │
└─────────────────────┴────────┴────────┴────────┴───────────┘


In [15]:
# Inspect the table contents after the filtered overwrite.
table.scan().to_polars()

datetime,symbol,bid,ask,details
datetime[μs],str,f32,f64,struct[1]
2023-01-01 12:00:00,"""AAPL2""",150.0,151.0,"{""user1""}"
2023-01-04 12:00:00,"""AMZ""",2800.0,2805.0,"{""user1""}"
2023-01-04 12:00:00,"""AMZ""",2800.0,2805.0,"{""user1""}"
2023-01-02 12:00:00,"""GOOGL""",2800.0,2805.0,"{""user2""}"
2023-01-03 12:00:00,"""MSFT""",300.0,305.0,{null}


This is cool, because it respected the filter: only the `2023-01-01 12:00:00` row was replaced, and a duplicate was created for `2023-01-04 12:00:00` since that row was not covered by the filter.

## Overwrite from Polars

Polars has `scan_iceberg` and `write_iceberg` functions, let's see if they work.

In [16]:
# Scan the Iceberg table through Polars, forcing the PyIceberg-based reader, then collect the result.
pl.scan_iceberg(table, reader_override="pyiceberg").collect()

datetime,symbol,bid,ask,details
datetime[μs],str,f32,f64,struct[1]
2023-01-01 12:00:00,"""AAPL2""",150.0,151.0,"{""user1""}"
2023-01-04 12:00:00,"""AMZ""",2800.0,2805.0,"{""user1""}"
2023-01-04 12:00:00,"""AMZ""",2800.0,2805.0,"{""user1""}"
2023-01-02 12:00:00,"""GOOGL""",2800.0,2805.0,"{""user2""}"
2023-01-03 12:00:00,"""MSFT""",300.0,305.0,{null}


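There are still problems with how the schema is evaluated on write: as the error below shows, the dataframe's fields are reported as optional where the table requires them (`datetime`, `symbol`), and `bid` is a double instead of a float.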

In [17]:
# This write is expected to fail: the Polars frame carries optional columns and
# a 64-bit `bid`, which do not match the table schema. Catch the error and show
# it, so that "Restart & Run All" can continue past this cell.
try:
    data.write_iceberg(table, "overwrite")
except ValueError as exc:
    print(f"ValueError: {exc}")

ValueError: Mismatch in fields:
┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃    ┃ Table field                                         ┃ Dataframe field                                      ┃
┡━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ ❌ │ 1: datetime: required timestamp                     │ 1: datetime: optional timestamp                      │
│ ❌ │ 2: symbol: required string                          │ 2: symbol: optional string                           │
│ ❌ │ 3: bid: optional float                              │ 3: bid: optional double                              │
│ ✅ │ 4: ask: optional double                             │ 4: ask: optional double                              │
│ ✅ │ 5: details: optional struct<6: created_by: optional │ 5: details: optional struct<6: created_by: optional  │
│    │ string>                                             │ string>                                              │
│ ✅ │ 6: created_by: optional string                      │ 6: created_by: optional string                       │
└────┴─────────────────────────────────────────────────────┴──────────────────────────────────────────────────────┘


## Partition evolution

In [18]:
from pyiceberg.transforms import IdentityTransform

# Rename the existing day partition field from `datetime_day` to `datetime`.
# A disabled alternative, kept for reference: add an identity partition on
# `symbol` via `add_field("symbol", IdentityTransform(), "symbol")`.
with table.update_spec() as spec_update:
    spec_update.rename_field("datetime_day", "datetime")


In [19]:
# Per-partition metadata (record/file counts, sizes, delete counts) returned as a PyArrow table.
table.inspect.partitions()

pyarrow.Table
partition: struct<datetime: date32[day]> not null
  child 0, datetime: date32[day]
spec_id: int32 not null
record_count: int64 not null
file_count: int32 not null
total_data_file_size_in_bytes: int64 not null
position_delete_record_count: int64 not null
position_delete_file_count: int32 not null
equality_delete_record_count: int64 not null
equality_delete_file_count: int32 not null
last_updated_at: timestamp[ms]
last_updated_snapshot_id: int64
----
partition: [
  -- is_valid: all not null
  -- child 0 type: date32[day]
[null,null,null,null]]
spec_id: [[0,0,0,0]]
record_count: [[1,2,1,1]]
file_count: [[1,2,1,1]]
total_data_file_size_in_bytes: [[2129,4238,2129,2083]]
position_delete_record_count: [[0,0,0,0]]
position_delete_file_count: [[0,0,0,0]]
equality_delete_record_count: [[0,0,0,0]]
equality_delete_file_count: [[0,0,0,0]]
last_updated_at: [[2025-08-31 12:43:57.800,2025-08-31 12:43:57.800,null,null]]
...

## Check History

In [20]:
# Snapshot history of the table, converted from Arrow to Polars for display.
pl.from_arrow(table.inspect.history())

made_current_at,snapshot_id,parent_id,is_current_ancestor
datetime[ms],i64,i64,bool
2025-08-31 12:43:57.800,8873591517873357095,,True
