### Reading tables created by Spark/Trino with PyIceberg, and writing new ones

The main contribution of this notebook is working out how to load a catalog so that it can both read from and write to tables.

In [48]:
import os
from urllib.parse import quote

import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds
import pyarrow.fs as fs
import pyarrow.parquet as pq
import s3fs
from pyarrow.fs import S3FileSystem

In [47]:
import pyiceberg
import pyiceberg.table.inspect
from pyiceberg import table
from pyiceberg.catalog.sql import SqlCatalog
from pyiceberg.catalog import load_catalog

In [25]:
# PyArrow-native S3 client pointed at the local MinIO endpoint.
# Credentials are read from the environment when set, so real secrets need not
# be written into the notebook; the fallbacks are the local-dev values this
# notebook was written against.
# NOTE(review): `s3` is not referenced again in the visible cells.
s3 = S3FileSystem(
    access_key=os.environ.get("MINIO_ROOT_USER", "admin"),
    secret_key=os.environ.get("MINIO_ROOT_PASSWORD", "password"),
    endpoint_override="http://minio:9000/",
)

In [16]:
# fsspec-style S3 client for the same MinIO endpoint; used below for directory
# walks and for pandas.read_parquet.
# Credentials are read from the environment when set; the fallbacks are the
# local-dev values this notebook was written against.
s3fs_fs = s3fs.S3FileSystem(
    key=os.environ.get("MINIO_ROOT_USER", "admin"),
    secret=os.environ.get("MINIO_ROOT_PASSWORD", "password"),
    endpoint_url="http://minio:9000",
)

In [35]:
# For each schema directory, show the second entry yielded by walk(): a
# (dirpath, dirnames, filenames) tuple for the first table directory found.
for schema_dir in ("iceberg-warehouse/trino_schema", "iceberg-warehouse/spark_schema"):
    walked = list(s3fs_fs.walk(schema_dir))
    display(walked[1])

('iceberg-warehouse/trino_schema/employees-bb1d739c6d444f788e7a1ddfbf79bc76',
 ['data', 'metadata'],
 [])

('iceberg-warehouse/spark_schema/elspotprices', ['data', 'metadata'], [])

In [36]:
# Read the table's Parquet data files directly from the bucket with pandas.
# NOTE(review): this bypasses the Iceberg metadata entirely, so it presumably
# only matches the table contents while every file under data/ belongs to the
# current snapshot -- confirm before relying on it.
elspot_data_dir = "iceberg-warehouse/spark_schema/elspotprices2/data/"
df = pd.read_parquet(elspot_data_dir, engine='pyarrow', filesystem=s3fs_fs)

#### PyIceberg Catalog Initialization

In [50]:
# Postgres credentials for the SQL catalog. They are taken from the environment
# when set, so real secrets need not live in the notebook; the fallbacks are
# the local-dev values this notebook was written against.
sql_user = os.environ.get("ICEBERG_SQL_USER", "iceberg")
sql_password = os.environ.get("ICEBERG_SQL_PASSWORD", "icebergpassword")
# Catalog name (the same value the catalog-loading cells below use).
catalog_name = "iceberg_catalog"

### As we will see, with a catalog created like this it is not possible to load or create tables, but, strangely, listing tables or schemas works

Well, it is not that strange: listing tables or schemas and creating schemas are operations on the catalog, not on the storage.


In [41]:
# NOTE: this is the configuration that does NOT work for table I/O; it is kept
# on purpose to reproduce the failure shown in the cells below.
# The S3 settings are passed as `s3_endpoint`, `s3_access_key_id` and
# `s3_secret_access_key`, whereas the working catalog further down uses the
# keys `s3.endpoint`, `s3.access-key-id` and `s3.secret-access-key`.
# Presumably PyIceberg's file IO only reads the latter spellings, so these
# three values never reach the S3 client -- TODO confirm against the PyIceberg
# configuration docs.
# `load_catalog` is already imported in the import cell at the top.
catalog = load_catalog(
    catalog_name,
    type="sql",
    # quote(..., safe="") keeps the URL valid if a credential contains
    # characters such as ":", "@" or "/".
    uri=f"postgresql+psycopg2://{quote(sql_user, safe='')}:{quote(sql_password, safe='')}@postgres:5432/iceberg_catalog",
    s3_endpoint="http://minio:9000",
    s3_access_key_id="admin",
    s3_secret_access_key="password",
)

### this is ok:

In [42]:
# Listing namespaces is answered by the catalog itself, so it works even
# though this catalog cannot reach the object storage.
catalog.list_namespaces()

[('spark_schema',), ('trino_schema',)]

In [43]:
# Tables registered under the Trino-created namespace (catalog lookup only).
catalog.list_tables("trino_schema")


[('trino_schema', 'employees'), ('trino_schema', 'events')]

In [45]:
# Tables registered under the Spark-created namespace (catalog lookup only).
catalog.list_tables("spark_schema")

[('spark_schema', 'elspotprices'),
 ('spark_schema', 'elspotprices2'),
 ('spark_schema', 'spark_orders')]

### ...while loading a table does not:
It fails with this error:  
OSError: When reading information for key 'spark_schema/elspotprices2/metadata/00002-5db68f25-8452-4fc9-a985-520a2d6ea7ed.metadata.json' in bucket 'iceberg-warehouse': AWS Error ACCESS_DENIED during HeadObject operation: No response body.

In [49]:
# Loading a table has to read its metadata file from object storage, which this
# catalog cannot do. The OSError is the point of the demonstration, so it is
# caught and printed; that way "Restart & Run All" continues past this cell.
# The table name matches the one in the recorded error message
# (spark_schema/elspotprices2/metadata/...).
try:
    display(catalog.load_table("spark_schema.elspotprices2"))
except OSError as err:
    print(f"OSError: {err}")

OSError: When reading information for key 'spark_schema/elspotprices2/metadata/00002-5db68f25-8452-4fc9-a985-520a2d6ea7ed.metadata.json' in bucket 'iceberg-warehouse': AWS Error ACCESS_DENIED during HeadObject operation: No response body.

Let's create some data to play with.

In [99]:
# The JSON file wraps the rows in a top-level "records" list; turn that list
# into a DataFrame with one row per record.
elspot_records = pd.read_json("testdata/Elspotprices_0.json")["records"].to_list()
pdf = pd.DataFrame(elspot_records)

# Parse both hour columns into pandas datetimes.
for hour_col in ("HourUTC", "HourDK"):
    pdf[hour_col] = pd.to_datetime(pdf[hour_col])

In [100]:
# Convert the DataFrame to an Arrow table, then replace both hour columns with
# copies cast to microsecond-resolution timestamps.
# (Presumably needed because Iceberg timestamps are microsecond precision --
# TODO confirm.)
patab = pa.Table.from_pandas(pdf)
for hour_col in ("HourUTC", "HourDK"):
    position = patab.schema.get_field_index(hour_col)
    as_micros = pc.cast(patab.column(hour_col), pa.timestamp('us'))
    patab = patab.set_column(position, hour_col, as_micros)
patab

pyarrow.Table
HourUTC: timestamp[us]
HourDK: timestamp[us]
PriceArea: string
SpotPriceDKK: double
SpotPriceEUR: double
----
HourUTC: [[2025-07-09 21:00:00.000000,2025-07-09 21:00:00.000000,2025-07-09 21:00:00.000000,2025-07-09 21:00:00.000000,2025-07-09 21:00:00.000000,...,2023-10-31 20:00:00.000000,2023-10-31 20:00:00.000000,2023-10-31 20:00:00.000000,2023-10-31 20:00:00.000000,2023-10-31 19:00:00.000000]]
HourDK: [[2025-07-09 23:00:00.000000,2025-07-09 23:00:00.000000,2025-07-09 23:00:00.000000,2025-07-09 23:00:00.000000,2025-07-09 23:00:00.000000,...,2023-10-31 21:00:00.000000,2023-10-31 21:00:00.000000,2023-10-31 21:00:00.000000,2023-10-31 21:00:00.000000,2023-10-31 20:00:00.000000]]
PriceArea: [["NO2","DE","SE4","DK2","DK1",...,"DE","SYSTEM","NO2","SE3","SE3"]]
SpotPriceDKK: [[528.196342,782.595975,423.974547,782.595975,782.595975,...,865.190002,462,662.549988,132.179993,147.559998]]
SpotPriceEUR: [[70.800003,104.900002,56.830002,104.900002,104.900002,...,115.919998,61.900002,88.7

Creating a namespace (schema) works:

In [77]:
# Creating a namespace is a catalog-side operation, so it succeeds even with
# the catalog that cannot reach the storage. Safe to re-run.
catalog.create_namespace_if_not_exists("arrow_schema")

In [79]:
# The new arrow_schema namespace should now show up next to the existing ones.
catalog.list_namespaces()

[('arrow_schema',), ('spark_schema',), ('trino_schema',)]

In [82]:
# Collect the stored properties of every namespace, keyed by namespace name.
# list_namespaces() yields tuples; the first element is the name.
props_by_namespace = {}
for namespace in catalog.list_namespaces():
    namespace_name = namespace[0]
    props_by_namespace[namespace_name] = catalog.load_namespace_properties(namespace_name)
props_by_namespace

{'arrow_schema': {'exists': 'true'},
 'spark_schema': {'exists': 'true', 'owner': 'jovyan'},
 'trino_schema': {'exists': 'true',
  'location': 's3a://iceberg-warehouse/trino_schema/'}}

OK, this shows that namespaces created by PyIceberg don't get any default properties beyond `exists`, whereas Spark also sets `owner` and Trino sets `location`.  
It would probably be good to set some explicitly and consistently.  
I couldn't find out whether there is a fixed set of allowed properties or whether anything can be a property, so let's try:  

In [86]:
# Try to store an arbitrary, made-up property on the namespace to see whether
# the catalog restricts the allowed keys.
catalog.update_namespace_properties("arrow_schema",updates={"xyz_custom_property":"whatever"})

PropertiesUpdateSummary(removed=[], updated=['xyz_custom_property'], missing=[])

OK, it seems that anything can be set as a property.

Let's get back to tables, though.  
Creating one with this catalog doesn't work either:  
OSError: When getting information for key 'arrow_schema/metadata/00000-5d2ed993-6184-40f2-91a0-bb0dc0986d43.metadata.json' in bucket 'iceberg-warehouse': AWS Error ACCESS_DENIED during HeadObject operation: No response body.

In [101]:
# Creating a table has to write a metadata file to object storage, which this
# catalog cannot do. The OSError is the point of the demonstration, so it is
# caught and printed; that way "Restart & Run All" continues past this cell.
# On failure `ictab` is left unassigned here.
try:
    ictab = catalog.create_table(
        "arrow_schema.elspotprices_0",
        patab.schema,
        location="s3://iceberg-warehouse/arrow_schema/",
    )
except OSError as err:
    print(f"OSError: {err}")

OSError: When getting information for key 'arrow_schema/metadata/00000-638936d2-7ecd-4e3b-97b7-03662c9dc1ea.metadata.json' in bucket 'iceberg-warehouse': AWS Error ACCESS_DENIED during HeadObject operation: No response body.

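The problem lies in how the S3/MinIO connection settings get from PyIceberg to PyArrow. Note that the failing catalog above was given `s3_endpoint`, `s3_access_key_id` and `s3_secret_access_key`, while the working one below uses the property keys `s3.endpoint`, `s3.access-key-id` and `s3.secret-access-key`; the underscore spellings are most likely not recognised, so no endpoint or credentials reach the S3 client.  
For those interested, there are related issues:  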
https://github.com/apache/iceberg/issues/10709  
https://github.com/apache/iceberg-python/issues/1775  

### Long story short: if the catalog is created like this, creating tables on MinIO works

In [105]:
# Access & Secret keys
pwd = 'password'
uid = 'admin'
s3location = "s3://iceberg-warehouse"
# Postgres creds
pswd = 'icebergpassword'
puid = 'iceberg'

In [106]:
# Build the catalog with PyIceberg's own property keys. Keys such as
# "s3.endpoint" are not valid Python identifiers, hence the dict unpacking.
# This is the configuration with which table reads and writes work.
iccatalog = SqlCatalog(
    catalog_name,
    **{
        # quote(..., safe="") keeps the URL valid if a credential contains
        # characters such as ":", "@" or "/".
        "uri": f"postgresql+psycopg2://{quote(puid, safe='')}:{quote(pswd, safe='')}@postgres:5432/iceberg_catalog",
        "warehouse": s3location,
        "s3.endpoint": "http://minio:9000",
        # Explicitly select the PyArrow-based file IO implementation.
        "py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO",
        "s3.access-key-id": uid,
        "s3.secret-access-key": pwd,
        "s3.region": 'eu-central-1',
    },
)

In [107]:
# Sanity check: the new catalog object sees the same namespaces as before.
iccatalog.list_namespaces()

[('arrow_schema',), ('spark_schema',), ('trino_schema',)]

In [109]:
# Sanity check: the Spark-created tables are visible through the new catalog.
iccatalog.list_tables("spark_schema")

[('spark_schema', 'elspotprices'),
 ('spark_schema', 'elspotprices2'),
 ('spark_schema', 'spark_orders')]

#### Create Table

In [111]:
# Create the Iceberg table from the Arrow schema through the working catalog.
# NOTE(review): `location` is the namespace directory rather than a per-table
# one (the earlier error messages show metadata landing in
# 'arrow_schema/metadata/...'); a second table given the same location would
# presumably share those directories -- consider a table-specific path.
ictab = iccatalog.create_table(
    "arrow_schema.elspotprices_0",
    patab.schema,
    location="s3://iceberg-warehouse/arrow_schema/",
)

In [114]:
# Confirm what kind of object create_table handed back.
type(ictab)

pyiceberg.table.Table

In [120]:
# Write the Arrow table's rows into the Iceberg table (shows up as an
# Operation.APPEND snapshot in the table repr).
# NOTE(review): not idempotent -- presumably re-running this cell appends the
# same rows again; confirm before re-executing.
ictab.append(patab)

In [121]:
# Show the table repr: schema, partition spec, sort order and current snapshot.
ictab

elspotprices_0(
  1: HourUTC: optional timestamp,
  2: HourDK: optional timestamp,
  3: PriceArea: optional string,
  4: SpotPriceDKK: optional double,
  5: SpotPriceEUR: optional double
),
partition by: [],
sort order: [],
snapshot: Operation.APPEND: id=1479150819988622150, schema_id=0

In [122]:
# The newly created table should now be registered under arrow_schema.
iccatalog.list_tables("arrow_schema")

[('arrow_schema', 'elspotprices_0')]

In [123]:
# Load the table by identifier through the catalog instead of reusing the
# object returned by create_table, to verify that loading works too.
ictab_l=iccatalog.load_table("arrow_schema.elspotprices_0")

In [124]:
# The loaded table should report the same schema and snapshot as `ictab`.
ictab_l

elspotprices_0(
  1: HourUTC: optional timestamp,
  2: HourDK: optional timestamp,
  3: PriceArea: optional string,
  4: SpotPriceDKK: optional double,
  5: SpotPriceEUR: optional double
),
partition by: [],
sort order: [],
snapshot: Operation.APPEND: id=1479150819988622150, schema_id=0

Loading the data is done with `.scan()`:

In [125]:
# Scan the table with no arguments, materialise the result as an Arrow table,
# then convert that to pandas for display.
unfiltered_scan = ictab_l.scan()
patab_l = unfiltered_scan.to_arrow()
pdf_l = patab_l.to_pandas()
pdf_l



Unnamed: 0,HourUTC,HourDK,PriceArea,SpotPriceDKK,SpotPriceEUR
0,2025-07-09 21:00:00,2025-07-09 23:00:00,NO2,528.196342,70.800003
1,2025-07-09 21:00:00,2025-07-09 23:00:00,DE,782.595975,104.900002
2,2025-07-09 21:00:00,2025-07-09 23:00:00,SE4,423.974547,56.830002
3,2025-07-09 21:00:00,2025-07-09 23:00:00,DK2,782.595975,104.900002
4,2025-07-09 21:00:00,2025-07-09 23:00:00,DK1,782.595975,104.900002
...,...,...,...,...,...
99995,2023-10-31 20:00:00,2023-10-31 21:00:00,DE,865.190002,115.919998
99996,2023-10-31 20:00:00,2023-10-31 21:00:00,SYSTEM,462.000000,61.900002
99997,2023-10-31 20:00:00,2023-10-31 21:00:00,NO2,662.549988,88.769997
99998,2023-10-31 20:00:00,2023-10-31 21:00:00,SE3,132.179993,17.709999


#### Insert Data

#### Read Data

In [138]:
# Load a table that was written by Spark -- the same table whose metadata file
# appears in the earlier ACCESS_DENIED error from the first catalog.
elspotprices_from_spark=iccatalog.load_table("spark_schema.elspotprices2")

In [139]:
# Scan with no arguments straight into pandas (roughly 1.8 million rows per
# the recorded output), so this pulls the whole table into memory.
espfp_pdf=elspotprices_from_spark.scan().to_pandas()

In [140]:
# Display the frame; pandas truncates the rendering to the first and last rows.
espfp_pdf

Unnamed: 0,HourDK,HourUTC,PriceArea,SpotPriceDKK,SpotPriceEUR
0,1999-07-01 00:00:00,1999-06-30 22:00:00,SE,95.730003,12.890000
1,1999-07-01 00:00:00,1999-06-30 22:00:00,SE4,,
2,1999-07-01 00:00:00,1999-06-30 22:00:00,DK1,95.730003,12.890000
3,1999-07-01 00:00:00,1999-06-30 22:00:00,SE3,,
4,1999-07-01 00:00:00,1999-06-30 22:00:00,SYS,70.199997,9.450000
...,...,...,...,...,...
1794946,2025-07-09 23:00:00,2025-07-09 21:00:00,DE,782.595975,104.900002
1794947,2025-07-09 23:00:00,2025-07-09 21:00:00,DK2,782.595975,104.900002
1794948,2025-07-09 23:00:00,2025-07-09 21:00:00,SE3,383.762969,51.439999
1794949,2025-07-09 23:00:00,2025-07-09 21:00:00,NO2,528.196342,70.800003


In [141]:
# Snapshot log of the table: one entry per snapshot id with its timestamp (ms).
elspotprices_from_spark.history()

[SnapshotLogEntry(snapshot_id=673444911004302166, timestamp_ms=1752241397153),
 SnapshotLogEntry(snapshot_id=4259939819494638523, timestamp_ms=1752241776775),
 SnapshotLogEntry(snapshot_id=6092011551680698518, timestamp_ms=1752241902775)]

In [122]:
# Full snapshot objects (parent id, manifest list location, summary counters).
# NOTE(review): the recorded output for this cell looks stale -- it has an
# earlier execution count, a manifest path under 'spark_schema.db/pyatbl', and
# a snapshot id that is absent from the history() output. Re-run to refresh.
elspotprices_from_spark.snapshots()

[Snapshot(snapshot_id=7094546992959003802, parent_snapshot_id=None, sequence_number=1, timestamp_ms=1752145954971, manifest_list='s3://iceberg-warehouse/spark_schema.db/pyatbl/metadata/snap-7094546992959003802-0-215d66c4-fcff-46f1-81a8-0063c81f9d0d.avro', summary=Summary(Operation.APPEND, **{'added-files-size': '20509271', 'added-data-files': '1', 'added-records': '1794951', 'total-data-files': '1', 'total-delete-files': '0', 'total-records': '1794951', 'total-files-size': '20509271', 'total-position-deletes': '0', 'total-equality-deletes': '0'}), schema_id=0)]