# Experimenting with PyIceberg

This notebook collects experimental snippets that explore PyIceberg's capabilities. It will include:

- Read operations
- Write operations (MERGE, write Hive partitioned, partition overwrite etc.)

For demo purposes, we'll follow the example provided in the documentation [here](https://py.iceberg.apache.org/api/), so we'll create a local catalog, some tables and perform operations.

Every time the notebook is executed, we recreate the warehouse folder so that we don't get errors from the `load_catalog` function (CREATE IF NOT EXISTS does not seem to be supported).

## Create a local Catalog

In [1]:
# Start from a clean slate: delete any previous warehouse directory, then
# recreate it together with the ../data directory used for the table location.
!rm -rf /tmp/warehouse
! mkdir -p /tmp/warehouse
! mkdir -p ../data

In [2]:
from pyiceberg.catalog import load_catalog

# Local, file-backed setup: a SQLite database holds the catalog metadata and
# the warehouse directory is addressed through a file:// URI.
warehouse_path = "/tmp/warehouse"
catalog_properties = {
    "type": "sql",
    "uri": f"sqlite:///{warehouse_path}/pyiceberg_catalog.db",
    "warehouse": f"file://{warehouse_path}",
}

catalog = load_catalog("default", **catalog_properties)


In [3]:
# Register the "default" namespace, then verify that it is the only namespace
# the catalog reports.
catalog.create_namespace("default")

expected_namespaces = [("default",)]
ns = catalog.list_namespaces()
assert ns == expected_namespaces

In [4]:
# List the tables registered under the "default" namespace.
catalog.list_tables("default")

[]

## Create a table

In [5]:
from pyiceberg.partitioning import PartitionField, PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.table.sorting import SortField, SortOrder
from pyiceberg.transforms import DayTransform, IdentityTransform
from pyiceberg.types import (
    DoubleType,
    FloatType,
    NestedField,
    StringType,
    StructType,
    TimestampType,
)


# Field IDs have to be unique across the whole schema, nested fields included.
# The nested `created_by` field used to reuse ID 4, which already belongs to
# `ask`; it now uses 6, the ID the catalog reports for it once the table has
# been created.
schema = Schema(
    NestedField(field_id=1, name="datetime", field_type=TimestampType(), required=True),
    NestedField(field_id=2, name="symbol", field_type=StringType(), required=True),
    NestedField(field_id=3, name="bid", field_type=FloatType(), required=False),
    NestedField(field_id=4, name="ask", field_type=DoubleType(), required=False),
    NestedField(
        field_id=5,
        name="details",
        field_type=StructType(
            NestedField(
                field_id=6, name="created_by", field_type=StringType(), required=False
            ),
        ),
        required=False,
    ),
)


# Partition by day, derived from the `datetime` column (source field ID 1).
partition_spec = PartitionSpec(
    PartitionField(
        source_id=1, field_id=1000, transform=DayTransform(), name="datetime_day"
    )
)


# Sort on the symbol (source field ID 2), unchanged by any transform.
sort_order = SortOrder(SortField(source_id=2, transform=IdentityTransform()))

# The returned table is the cell's last expression, so its summary is displayed.
catalog.create_table(
    identifier="default.bids",
    schema=schema,
    location="../data/bids",
    partition_spec=partition_spec,
    sort_order=sort_order,
)


bids(
  1: datetime: required timestamp,
  2: symbol: required string,
  3: bid: optional float,
  4: ask: optional double,
  5: details: optional struct<6: created_by: optional string>
),
partition by: [datetime_day],
sort order: [2 ASC NULLS FIRST],
snapshot: null

## Load a table

In [6]:
# Load the `default.bids` table from the catalog; `table` is reused by the cells below.
table = catalog.load_table("default.bids")

## Check if table exists

In [7]:
# Ask the catalog whether `default.bids` is registered.
catalog.table_exists("default.bids")

True

## Convert to Polars DataFrame

In [8]:
# Collect the result of `table.to_polars()` into `table_df` and display it.
table_df = table.to_polars().collect()
table_df

datetime,symbol,bid,ask,details
datetime[μs],str,f32,f64,struct[1]


In [20]:
# Convert the table's Iceberg schema to its PyArrow representation and display it.
table_schema = table.schema().as_arrow()
table_schema

datetime: timestamp[us] not null
  -- field metadata --
  PARQUET:field_id: '1'
symbol: large_string not null
  -- field metadata --
  PARQUET:field_id: '2'
bid: float
  -- field metadata --
  PARQUET:field_id: '3'
ask: double
  -- field metadata --
  PARQUET:field_id: '4'
details: struct<created_by: large_string>
  child 0, created_by: large_string
    -- field metadata --
    PARQUET:field_id: '6'
  -- field metadata --
  PARQUET:field_id: '5'

## Append some data

In [23]:
import datetime

import polars as pl

# First attempt: hand the table's PyArrow schema straight to Polars.
# Polars rejects it with a TypeError. The error is caught and printed so the
# failure stays documented without aborting a "Restart & Run All".
try:
    data = pl.DataFrame(
        {
            "datetime": [
                datetime.datetime(2023, 1, 1, 12, 0),
                datetime.datetime(2023, 1, 2, 12, 0),
                datetime.datetime(2023, 1, 3, 12, 0),
            ],
            "symbol": ["AAPL", "GOOGL", "MSFT"],
            "bid": [150.0, 2800.0, 300.0],
            "ask": [151.0, 2805.0, 305.0],
            "details": [
                {"created_by": "user1"},
                {"created_by": "user2"},
                {"created_by": None},
            ],
        },
        schema=table_schema,
    )
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: 'pyarrow.lib.Field' object is not subscriptable

In [24]:
# Second attempt: build the frame with the Polars schema derived from the
# Iceberg table itself.
data = pl.DataFrame(
    {
        "datetime": [
            datetime.datetime(2023, 1, 1, 12, 0),
            datetime.datetime(2023, 1, 2, 12, 0),
            datetime.datetime(2023, 1, 3, 12, 0),
        ],
        "symbol": ["AAPL", "GOOGL", "MSFT"],
        "bid": [150.0, 2800.0, 300.0],
        "ask": [151.0, 2805.0, 305.0],
        "details": [
            {"created_by": "user1"},
            {"created_by": "user2"},
            {"created_by": None},
        ],
    },
    schema=pl.scan_iceberg(table).collect_schema(),
)

# Write the data to the Iceberg table.
# The append is rejected with a ValueError: the Arrow table coming from Polars
# marks `datetime` and `symbol` as optional, while the Iceberg table requires
# them. The error is caught and printed so the mismatch report stays visible
# without aborting a "Restart & Run All".
try:
    table.append(data.to_arrow())
except ValueError as exc:
    print(f"{type(exc).__name__}: {exc}")


ValueError: Mismatch in fields:
┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃    ┃ Table field                                         ┃ Dataframe field                                      ┃
┡━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ ❌ │ 1: datetime: required timestamp                     │ 1: datetime: optional timestamp                      │
│ ❌ │ 2: symbol: required string                          │ 2: symbol: optional string                           │
│ ✅ │ 3: bid: optional float                              │ 3: bid: optional float                               │
│ ✅ │ 4: ask: optional double                             │ 4: ask: optional double                              │
│ ✅ │ 5: details: optional struct<6: created_by: optional │ 5: details: optional struct<6: created_by: optional  │
│    │ string>                                             │ string>                                              │
│ ✅ │ 6: created_by: optional string                      │ 6: created_by: optional string                       │
└────┴─────────────────────────────────────────────────────┴──────────────────────────────────────────────────────┘


It seems that it is not possible to take the Iceberg table schema and pass it to Polars in order to append data: Polars has no way to mark fields as `required`, so the resulting Arrow table never matches the table schema and the write is rejected.

Instead, one has to build the data manually with PyArrow, using the table's Arrow schema, as shown below, which is not ideal.

In [None]:
import pyarrow as pa

# Describe the rows as plain tuples and expand them into dictionaries keyed by
# column name; PyArrow then applies the table's own Arrow schema to them.
columns = ("datetime", "symbol", "bid", "ask", "details")
records = [
    (datetime.datetime(2023, 1, 1, 12, 0), "AAPL", 150.0, 151.0, {"created_by": "user1"}),
    (datetime.datetime(2023, 1, 2, 12, 0), "GOOGL", 2800.0, 2805.0, {"created_by": "user2"}),
    (datetime.datetime(2023, 1, 3, 12, 0), "MSFT", 300.0, 305.0, {"created_by": None}),
]
rows = [dict(zip(columns, record)) for record in records]

arrow_schema = table.schema().as_arrow()
df = pa.Table.from_pylist(rows, schema=arrow_schema)

table.append(df)

In [29]:
# Scan the table and display the result as a Polars DataFrame.
table.scan().to_polars()

datetime,symbol,bid,ask,details
datetime[μs],str,f32,f64,struct[1]
2023-01-01 12:00:00,"""AAPL""",150.0,151.0,"{""user1""}"
2023-01-02 12:00:00,"""GOOGL""",2800.0,2805.0,"{""user2""}"
2023-01-03 12:00:00,"""MSFT""",300.0,305.0,{null}


## Test upsert

In [38]:
# Upsert payload: one row reusing an existing `datetime` value (2023-01-01,
# with a changed symbol) and one row with a `datetime` not yet in the table.
columns = ("datetime", "symbol", "bid", "ask", "details")
records = [
    (datetime.datetime(2023, 1, 1, 12, 0), "AAPL2", 150.0, 151.0, {"created_by": "user1"}),
    (datetime.datetime(2023, 1, 4, 12, 0), "AMZ", 2800.0, 2805.0, {"created_by": "user1"}),
]
rows = [dict(zip(columns, record)) for record in records]

arrow_schema = table.schema().as_arrow()
df = pa.Table.from_pylist(rows, schema=arrow_schema)

# Show the payload as a Polars DataFrame.
pl.from_arrow(df)

datetime,symbol,bid,ask,details
datetime[μs],str,f32,f64,struct[1]
2023-01-01 12:00:00,"""AAPL2""",150.0,151.0,"{""user1""}"
2023-01-04 12:00:00,"""AMZ""",2800.0,2805.0,"{""user1""}"


In [40]:
# Upsert `df` into the table, matching incoming rows to existing ones on the `datetime` column.
table.upsert(df, join_cols=["datetime"])

UpsertResult(rows_updated=1, rows_inserted=1)

In [42]:
# Scan the table into Polars and sort by `datetime` for display.
table.scan().to_polars().sort("datetime")

datetime,symbol,bid,ask,details
datetime[μs],str,f32,f64,struct[1]
2023-01-01 12:00:00,"""AAPL2""",150.0,151.0,"{""user1""}"
2023-01-02 12:00:00,"""GOOGL""",2800.0,2805.0,"{""user2""}"
2023-01-03 12:00:00,"""MSFT""",300.0,305.0,{null}
2023-01-04 12:00:00,"""AMZ""",2800.0,2805.0,"{""user1""}"


## Check partition overwrite

Here we'll check whether a partial overwrite works. We'll use the same dataframe as in the upsert above, but we'll run an `overwrite` operation restricted by the filter `datetime == 2023-01-01T12:00:00`, so only rows matching that filter should be replaced.

In [45]:
from pyiceberg.expressions import EqualTo

# Same payload as in the upsert test: one row for 2023-01-01 and one for 2023-01-04.
df = pa.Table.from_pylist(
    [
        {
            "datetime": datetime.datetime(2023, 1, 1, 12, 0),
            "symbol": "AAPL2",
            "bid": 150.0,
            "ask": 151.0,
            "details": {"created_by": "user1"},
        },
        {
            "datetime": datetime.datetime(2023, 1, 4, 12, 0),
            "symbol": "AMZ",
            "bid": 2800.0,
            "ask": 2805.0,
            "details": {"created_by": "user1"},
        },
    ],
    schema=table.schema().as_arrow(),
)

# Print the payload explicitly: the overwrite call below is the cell's last
# statement, so a bare expression here would be evaluated and silently discarded.
print(pl.from_arrow(df))

# Overwrite with `df`, restricted to rows whose `datetime` equals 2023-01-01T12:00:00.
table.overwrite(df, overwrite_filter=EqualTo("datetime", "2023-01-01T12:00:00"))

shape: (2, 5)
┌─────────────────────┬────────┬────────┬────────┬───────────┐
│ datetime            ┆ symbol ┆ bid    ┆ ask    ┆ details   │
│ ---                 ┆ ---    ┆ ---    ┆ ---    ┆ ---       │
│ datetime[μs]        ┆ str    ┆ f32    ┆ f64    ┆ struct[1] │
╞═════════════════════╪════════╪════════╪════════╪═══════════╡
│ 2023-01-01 12:00:00 ┆ AAPL2  ┆ 150.0  ┆ 151.0  ┆ {"user1"} │
│ 2023-01-04 12:00:00 ┆ AMZ    ┆ 2800.0 ┆ 2805.0 ┆ {"user1"} │
└─────────────────────┴────────┴────────┴────────┴───────────┘


In [46]:
# Scan the table again to inspect the effect of the filtered overwrite.
table.scan().to_polars()

datetime,symbol,bid,ask,details
datetime[μs],str,f32,f64,struct[1]
2023-01-01 12:00:00,"""AAPL2""",150.0,151.0,"{""user1""}"
2023-01-04 12:00:00,"""AMZ""",2800.0,2805.0,"{""user1""}"
2023-01-04 12:00:00,"""AMZ""",2800.0,2805.0,"{""user1""}"
2023-01-02 12:00:00,"""GOOGL""",2800.0,2805.0,"{""user2""}"
2023-01-03 12:00:00,"""MSFT""",300.0,305.0,{null}


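This is cool: the overwrite respected the filter, so only the `2023-01-01 12:00:00` row was replaced, and it created a duplicate for `2023-01-04 12:00:00`, because that row was not covered by the filter and was therefore added on top of the existing one.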

## Overwrite from Polars

Polars has `scan_iceberg` and `write_iceberg` methods; let's see if they work.

In [None]:
# Read the table through Polars, sorted by `datetime`.
pl_df = pl.scan_iceberg(table).sort("datetime").collect()

# Writing the very same frame back is rejected with a ValueError, because the
# required `datetime` and `symbol` fields come back from Polars as optional.
# The error is caught and printed so the mismatch report stays visible without
# aborting a "Restart & Run All".
try:
    pl_df.write_iceberg(table, mode="overwrite")
except ValueError as exc:
    print(f"{type(exc).__name__}: {exc}")

ValueError: Mismatch in fields:
┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃    ┃ Table field                                         ┃ Dataframe field                                      ┃
┡━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ ❌ │ 1: datetime: required timestamp                     │ 1: datetime: optional timestamp                      │
│ ❌ │ 2: symbol: required string                          │ 2: symbol: optional string                           │
│ ✅ │ 3: bid: optional float                              │ 3: bid: optional float                               │
│ ✅ │ 4: ask: optional double                             │ 4: ask: optional double                              │
│ ✅ │ 5: details: optional struct<6: created_by: optional │ 5: details: optional struct<6: created_by: optional  │
│    │ string>                                             │ string>                                              │
│ ✅ │ 6: created_by: optional string                      │ 6: created_by: optional string                       │
└────┴─────────────────────────────────────────────────────┴──────────────────────────────────────────────────────┘


There are still problems with how the schema is evaluated for required fields.

## Partition evolution

In [53]:
from pyiceberg.transforms import IdentityTransform

# Evolve the partition spec:
#   * add an identity partition on `symbol`
#   * rename the existing day partition from `datetime_day` to `datetime`
# The `add_field` call had been left commented out, yet the partition listing
# recorded in this notebook contains a `symbol` partition field. It is restored
# here so that a fresh run reproduces that state.
# NOTE(review): confirm that both changes were meant to be applied in a single update.
with table.update_spec() as update:
    update.add_field("symbol", IdentityTransform(), "symbol")
    update.rename_field("datetime_day", "datetime")


In [60]:
# Display the table's partition metadata.
table.inspect.partitions()

pyarrow.Table
partition: struct<datetime: date32[day], symbol: large_string> not null
  child 0, datetime: date32[day]
  child 1, symbol: large_string
spec_id: int32 not null
record_count: int64 not null
file_count: int32 not null
total_data_file_size_in_bytes: int64 not null
position_delete_record_count: int64 not null
position_delete_file_count: int32 not null
equality_delete_record_count: int64 not null
equality_delete_file_count: int32 not null
last_updated_at: timestamp[ms]
last_updated_snapshot_id: int64
----
partition: [
  -- is_valid: all not null
  -- child 0 type: date32[day]
[null,null,null,null]
  -- child 1 type: large_string
[null,null,null,null]]
spec_id: [[0,0,0,0]]
record_count: [[1,2,1,1]]
file_count: [[1,2,1,1]]
total_data_file_size_in_bytes: [[2129,4238,2129,2083]]
position_delete_record_count: [[0,0,0,0]]
position_delete_file_count: [[0,0,0,0]]
equality_delete_record_count: [[0,0,0,0]]
equality_delete_file_count: [[0,0,0,0]]
last_updated_at: [[2025-06-08 14:29:55.4

## Check History

In [62]:
# Display the table's snapshot history as a Polars DataFrame.
pl.from_arrow(table.inspect.history())

made_current_at,snapshot_id,parent_id,is_current_ancestor
datetime[ms],i64,i64,bool
2025-06-08 14:16:52.841,4550557490811420043,,True
2025-06-08 14:25:51.778,8674915965551096757,4.55055749081142e+18,True
2025-06-08 14:25:51.783,7249382411204131433,8.674915965551096e+18,True
2025-06-08 14:25:51.789,6946163348324657587,7.249382411204131e+18,True
2025-06-08 14:29:55.391,6071323844640631267,6.946163348324657e+18,True
2025-06-08 14:29:55.401,7019368666436632975,6.071323844640631e+18,True
