In [1]:
import os
from pprint import pprint

import polars as pl
import pyarrow as pa
from pyiceberg.catalog import load_catalog
from pyiceberg.schema import Schema
from pyiceberg.types import (
    DoubleType,
    IntegerType,
    LongType,
    NestedField,
    StringType,
    StructType,
)

# setup environment
# Point PYICEBERG_HOME at the directory the notebook is started from.
# NOTE(review): presumably so PyIceberg picks up its config file from this
# directory — confirm against the project layout.
os.environ["PYICEBERG_HOME"] = os.getcwd()

In [2]:
# initialize the iceberg catalog
# Load the catalog registered under the name "local" and print the properties
# it was configured with.
catalog = load_catalog(name="local")
print(catalog.properties)

{'type': 'sql', 'uri': 'sqlite:////home/bobo/Desktop/iceberg_demos/iceberg_catalog/catalog.db', 'warehouse': 'file:///home/bobo/Desktop/iceberg_demos/iceberg_catalog'}


In [3]:
# updating and inserting data (upsert)

# Iceberg schema for the demo table. `city` is the identifier field
# (primary key), which is what upsert matches rows on.
schema = Schema(
    NestedField(1, "city", StringType(), required=True),
    NestedField(2, "inhabitants", IntegerType(), required=True),
    # Mark City as the identifier field, also known as the primary-key
    identifier_field_ids=[1]
)
catalog.create_namespace_if_not_exists("locations")

# Start every run from an empty table: a plain create_table() fails when the
# table is left over from a previous run, and the row-count assertions further
# down only hold against a freshly created table.
# WARNING: this drops the existing demo table "locations.cities".
if catalog.table_exists("locations.cities"):
    catalog.drop_table("locations.cities")
tbl = catalog.create_table("locations.cities", schema=schema)

# A freshly created table has no snapshots yet.
pprint(tbl.snapshots())

[]


In [None]:
# Arrow schema matching the Iceberg table: both columns are declared
# non-nullable, in line with required=True on the Iceberg fields.
arrow_schema = pa.schema(
    [
        pa.field("city", pa.string(), nullable=False),
        pa.field("inhabitants", pa.int32(), nullable=False),
    ]
)

# Seed rows for the first write
initial_rows = [
    {"city": "Amsterdam", "inhabitants": 921402},
    {"city": "San Francisco", "inhabitants": 808988},
    {"city": "Drachten", "inhabitants": 45019},
    {"city": "Paris", "inhabitants": 2103000},
]
initial_cities = pa.Table.from_pylist(initial_rows, schema=arrow_schema)
tbl.append(initial_cities)

# The append shows up as the table's first snapshot.
tbl.snapshots()

[Snapshot(snapshot_id=7008166763460625182, parent_snapshot_id=None, sequence_number=1, timestamp_ms=1768506372057, manifest_list='file:///home/bobo/Desktop/iceberg_demos/iceberg_catalog/locations/cities/metadata/snap-7008166763460625182-0-d7282089-124c-4e2d-b64a-33190919a6c8.avro', summary=Summary(Operation.APPEND, **{'added-files-size': '977', 'added-data-files': '1', 'added-records': '4', 'total-data-files': '1', 'total-delete-files': '0', 'total-records': '4', 'total-files-size': '977', 'total-position-deletes': '0', 'total-equality-deletes': '0'}), schema_id=0)]


In [5]:
# Next, upsert a batch into the Iceberg table. Rows are matched on the
# identifier field (`city`).
upsert_rows = [
    # Will be updated, the inhabitants has been updated
    {"city": "Drachten", "inhabitants": 45505},
    # New row, will be inserted
    {"city": "Berlin", "inhabitants": 3432000},
    # Ignored, already exists in the table
    {"city": "Paris", "inhabitants": 2103000},
]
upsert_batch = pa.Table.from_pylist(upsert_rows, schema=arrow_schema)
upsert_result = tbl.upsert(upsert_batch)

# Exactly one row changed (Drachten) and one row is new (Berlin).
assert upsert_result.rows_updated == 1
assert upsert_result.rows_inserted == 1

tbl.snapshots()

[Snapshot(snapshot_id=7008166763460625182, parent_snapshot_id=None, sequence_number=1, timestamp_ms=1768506372057, manifest_list='file:///home/bobo/Desktop/iceberg_demos/iceberg_catalog/locations/cities/metadata/snap-7008166763460625182-0-d7282089-124c-4e2d-b64a-33190919a6c8.avro', summary=Summary(Operation.APPEND, **{'added-files-size': '977', 'added-data-files': '1', 'added-records': '4', 'total-data-files': '1', 'total-delete-files': '0', 'total-records': '4', 'total-files-size': '977', 'total-position-deletes': '0', 'total-equality-deletes': '0'}), schema_id=0),
 Snapshot(snapshot_id=5507731220280694092, parent_snapshot_id=7008166763460625182, sequence_number=2, timestamp_ms=1768506424942, manifest_list='file:///home/bobo/Desktop/iceberg_demos/iceberg_catalog/locations/cities/metadata/snap-5507731220280694092-0-ed3b629e-ab3a-407a-9cc1-65bbcc8dc27a.avro', summary=Summary(Operation.OVERWRITE, **{'added-files-size': '961', 'removed-files-size': '977', 'added-data-files': '1', 'deleted

In [None]:
# Read the table back through polars; collect() materialises the frame so the
# rows after the upsert are displayed.
df = tbl.to_polars()
df.collect()


city,inhabitants
str,i32
"""Berlin""",3432000
"""Drachten""",45505
"""Amsterdam""",921402
"""San Francisco""",808988
"""Paris""",2103000


In [None]:
# deleting data
# Delete the rows matching the filter expression, rebuild the polars frame
# (it is collected in the next cell) and list the table's snapshots.
tbl.delete(delete_filter="city == 'Amsterdam'")
df = tbl.to_polars()
tbl.snapshots()

[Snapshot(snapshot_id=7008166763460625182, parent_snapshot_id=None, sequence_number=1, timestamp_ms=1768506372057, manifest_list='file:///home/bobo/Desktop/iceberg_demos/iceberg_catalog/locations/cities/metadata/snap-7008166763460625182-0-d7282089-124c-4e2d-b64a-33190919a6c8.avro', summary=Summary(Operation.APPEND, **{'added-files-size': '977', 'added-data-files': '1', 'added-records': '4', 'total-data-files': '1', 'total-delete-files': '0', 'total-records': '4', 'total-files-size': '977', 'total-position-deletes': '0', 'total-equality-deletes': '0'}), schema_id=0),
 Snapshot(snapshot_id=5507731220280694092, parent_snapshot_id=7008166763460625182, sequence_number=2, timestamp_ms=1768506424942, manifest_list='file:///home/bobo/Desktop/iceberg_demos/iceberg_catalog/locations/cities/metadata/snap-5507731220280694092-0-ed3b629e-ab3a-407a-9cc1-65bbcc8dc27a.avro', summary=Summary(Operation.OVERWRITE, **{'added-files-size': '961', 'removed-files-size': '977', 'added-data-files': '1', 'deleted

In [7]:
# Materialise the frame built after the delete; Amsterdam should be gone.
df.collect()

city,inhabitants
str,i32
"""San Francisco""",808988
"""Paris""",2103000
"""Berlin""",3432000
"""Drachten""",45505


In [8]:
# schema evolution
# - add columns
# Add an optional long column named "population"; the third argument is the
# column's documentation string stored with the schema.
try:
    with tbl.update_schema() as update:
        update.add_column("population", LongType(), doc="info about population")
except ValueError as ex:
    # Best-effort: report the problem instead of aborting the notebook.
    # NOTE(review): presumably raised when the column already exists because
    # the cell was run twice — confirm.
    print(ex)


In [9]:
# Re-read the table: existing rows carry no value for the newly added column.
df = tbl.to_polars()
df.collect()


city,inhabitants,population
str,i32,i64
"""San Francisco""",808988,
"""Paris""",2103000,
"""Berlin""",3432000,
"""Drachten""",45505,


In [10]:
# - rename column
# Only the column's name changes here: "population" becomes "country_code".
with tbl.update_schema() as schema_update:
    schema_update.rename_column("population", "country_code")

In [11]:
# Re-read the table: same rows, with the column now exposed as country_code.
df = tbl.to_polars()
df.collect()

city,inhabitants,country_code
str,i32,i64
"""San Francisco""",808988,
"""Paris""",2103000,
"""Berlin""",3432000,
"""Drachten""",45505,


In [12]:
# - update column
with tbl.update_schema(allow_incompatible_changes=True) as update:
    # Change country_code from long to string. This is not a widening type
    # promotion, which is why allow_incompatible_changes=True is passed.
    update.update_column("country_code",field_type=StringType())

In [13]:
# Re-read the table: country_code is now reported as a string column.
df = tbl.to_polars()
df.collect()

city,inhabitants,country_code
str,i32,str
"""San Francisco""",808988,
"""Paris""",2103000,
"""Berlin""",3432000,
"""Drachten""",45505,


In [14]:
# - delete column
with tbl.update_schema(allow_incompatible_changes=True) as update:
    # Remove the country_code column from the table schema
    update.delete_column("country_code")

In [15]:
# Re-read the table: it is back to the two original columns.
df = tbl.to_polars()
df.collect()

city,inhabitants
str,i32
"""San Francisco""",808988
"""Paris""",2103000
"""Berlin""",3432000
"""Drachten""",45505


In [16]:
# Append one more row so the table gains a further snapshot.
new_city = [
    {
        "city": "dublin",
        "inhabitants": 1111111111,
    }
]

# Convert the row to a PyArrow Table.
# PyIceberg expects a PyArrow table as input for writes; arrow_schema is
# reused so the column types match the earlier writes.
data_to_insert = pa.Table.from_pylist(new_city, schema=arrow_schema)

tbl.append(data_to_insert)

In [17]:
# Snapshot log of the table: entries carrying a snapshot id and a timestamp.
tbl.history()

[SnapshotLogEntry(snapshot_id=7008166763460625182, timestamp_ms=1768506372057),
 SnapshotLogEntry(snapshot_id=5507731220280694092, timestamp_ms=1768506424942),
 SnapshotLogEntry(snapshot_id=3134430349884940868, timestamp_ms=1768506424961),
 SnapshotLogEntry(snapshot_id=7662300186011991763, timestamp_ms=1768506424987),
 SnapshotLogEntry(snapshot_id=5569375786007764785, timestamp_ms=1768506465690),
 SnapshotLogEntry(snapshot_id=1434406823005080159, timestamp_ms=1768506574553)]

In [22]:
# Scan the table's current state into polars; the appended row is included.
tbl.scan().to_polars()

city,inhabitants
str,i32
"""dublin""",1111111111
"""San Francisco""",808988
"""Paris""",2103000
"""Berlin""",3432000
"""Drachten""",45505


In [19]:
# Time travel: read the table as of the snapshot that preceded the latest
# write. Snapshot ids differ on every run, so the id is taken from the table's
# own metadata instead of a literal pasted from an earlier output.
previous_snapshot_id = tbl.current_snapshot().parent_snapshot_id
tbl.scan(snapshot_id=previous_snapshot_id).to_polars()

city,inhabitants
str,i32
"""San Francisco""",808988
"""Paris""",2103000
"""Berlin""",3432000
"""Drachten""",45505


In [23]:
# Current schema of the table.
tbl.schema()

Schema(NestedField(field_id=1, name='city', field_type=StringType(), required=True), NestedField(field_id=2, name='inhabitants', field_type=IntegerType(), required=True), schema_id=0, identifier_field_ids=[1])

In [24]:
# All schema versions recorded for the table, keyed by schema id.
tbl.schemas()

{0: Schema(NestedField(field_id=1, name='city', field_type=StringType(), required=True), NestedField(field_id=2, name='inhabitants', field_type=IntegerType(), required=True), schema_id=0, identifier_field_ids=[1]),
 1: Schema(NestedField(field_id=1, name='city', field_type=StringType(), required=True), NestedField(field_id=2, name='inhabitants', field_type=IntegerType(), required=True), NestedField(field_id=3, name='population', field_type=LongType(), required=False), schema_id=1, identifier_field_ids=[1]),
 2: Schema(NestedField(field_id=1, name='city', field_type=StringType(), required=True), NestedField(field_id=2, name='inhabitants', field_type=IntegerType(), required=True), NestedField(field_id=3, name='country_code', field_type=LongType(), required=False), schema_id=2, identifier_field_ids=[1]),
 3: Schema(NestedField(field_id=1, name='city', field_type=StringType(), required=True), NestedField(field_id=2, name='inhabitants', field_type=IntegerType(), required=True), NestedField(

In [87]:
# Snapshot log again, as a reference for the snapshot lookups that follow.
tbl.history()

[SnapshotLogEntry(snapshot_id=253764842000279830, timestamp_ms=1768047871886),
 SnapshotLogEntry(snapshot_id=3256103562982925249, timestamp_ms=1768047895222),
 SnapshotLogEntry(snapshot_id=986478525678206652, timestamp_ms=1768047895234),
 SnapshotLogEntry(snapshot_id=1629192013294756382, timestamp_ms=1768047895247),
 SnapshotLogEntry(snapshot_id=362259212802847366, timestamp_ms=1768048343052)]

In [79]:
# Look up a snapshot by id. Ids change whenever the table is re-created, so
# use the first entry of the table's history rather than a hardcoded literal.
first_snapshot_id = tbl.history()[0].snapshot_id
tbl.snapshot_by_id(snapshot_id=first_snapshot_id)

Snapshot(snapshot_id=253764842000279830, parent_snapshot_id=None, sequence_number=1, timestamp_ms=1768047871886, manifest_list='file:///home/bobo/Desktop/iceberg_demos/iceberg_catalog/locations/cities/metadata/snap-253764842000279830-0-cfab87ce-fefd-4440-8b7b-424ae0e63daf.avro', summary=Summary(Operation.APPEND, **{'added-files-size': '977', 'added-data-files': '1', 'added-records': '4', 'total-data-files': '1', 'total-delete-files': '0', 'total-records': '4', 'total-files-size': '977', 'total-position-deletes': '0', 'total-equality-deletes': '0'}), schema_id=0)

In [80]:
# Resolve the snapshot that was current at a point in time. The timestamp is
# read from the third history entry (the one the original literal pointed at)
# so the lookup stays valid when the table is rebuilt.
third_entry_timestamp_ms = tbl.history()[2].timestamp_ms
tbl.snapshot_as_of_timestamp(third_entry_timestamp_ms)

Snapshot(snapshot_id=986478525678206652, parent_snapshot_id=3256103562982925249, sequence_number=3, timestamp_ms=1768047895234, manifest_list='file:///home/bobo/Desktop/iceberg_demos/iceberg_catalog/locations/cities/metadata/snap-986478525678206652-0-3c003577-f50f-4093-8ca7-ee92adde56cb.avro', summary=Summary(Operation.APPEND, **{'added-files-size': '911', 'added-data-files': '1', 'added-records': '1', 'total-data-files': '2', 'total-delete-files': '0', 'total-records': '4', 'total-files-size': '1872', 'total-position-deletes': '0', 'total-equality-deletes': '0'}), schema_id=0)