### Let's start with creating some data with pyspark local session

In [1]:
import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

#### create spark session
- as we can see, the dependencies (iceberg, aws, etc.) were correctly loaded to the session because of the config

In [2]:
!export PYSPARK_SUBMIT_ARGS='--master spark://spark-master:7077\
        --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions\
        --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog\
        --conf spark.sql.catalog.spark_catalog.type=hive\
        --conf spark.sql.catalog.iceberg_jdbc.type=jdbc\
        --conf spark.sql.catalog.iceberg_jdbc.uri=jdbc:postgresql://postgres:5432/iceberg_catalog\
        --conf spark.sql.catalog.iceberg_jdbc.jdbc.user=iceberg\
        --conf spark.sql.catalog.iceberg_jdbc.jdbc.password=icebergpassword\
        --conf spark.sql.catalog.iceberg_jdbc.driver=org.postgresql.Driver\
        --conf spark.sql.catalog.iceberg_jdbc.warehouse=s3a://iceberg-warehouse/\
        --conf spark.hadoop.fs.s3a.endpoint=http://minio:9000\
        --conf spark.hadoop.fs.s3a.access.key=admin\
        --conf spark.hadoop.fs.s3a.secret.key=password\
        --conf spark.hadoop.fs.s3a.path.style.access=true\
        --conf spark.hadoop.fs.s3a.connection.ssl.enabled=false\
        --conf spark.driver.extraJavaOptions="-Daws.region=eu-central-1 -Daws.overrideDefaultRegion=true"\
        --conf spark.executor.extraJavaOptions="-Daws.region=eu-central-1 -Daws.overrideDefaultRegion=true"\
        --packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.1,org.apache.hadoop:hadoop-aws:3.3.4,org.postgresql:postgresql:42.6.0,org.apache.spark:spark-avro_2.12:3.5.6\
        pyspark-shell'

In [3]:
import os
os.environ['PYSPARK_SUBMIT_ARGS']='''--master spark://spark-master:7077\
        --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions\
        --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog\
        --conf spark.sql.catalog.spark_catalog.type=hive\
        --conf spark.sql.catalog.iceberg_jdbc.type=jdbc\
        --conf spark.sql.catalog.iceberg_jdbc.uri=jdbc:postgresql://postgres:5432/iceberg_catalog\
        --conf spark.sql.catalog.iceberg_jdbc.jdbc.user=iceberg\
        --conf spark.sql.catalog.iceberg_jdbc.jdbc.password=icebergpassword\
        --conf spark.sql.catalog.iceberg_jdbc.driver=org.postgresql.Driver\
        --conf spark.sql.catalog.iceberg_jdbc.warehouse=s3a://iceberg-warehouse/\
        --conf spark.hadoop.fs.s3a.endpoint=http://minio:9000\
        --conf spark.hadoop.fs.s3a.access.key=admin\
        --conf spark.hadoop.fs.s3a.secret.key=password\
        --conf spark.hadoop.fs.s3a.path.style.access=true\
        --conf spark.hadoop.fs.s3a.connection.ssl.enabled=false\
        --conf spark.driver.extraJavaOptions="-Daws.region=eu-central-1 -Daws.overrideDefaultRegion=true"\
        --conf spark.executor.extraJavaOptions="-Daws.region=eu-central-1 -Daws.overrideDefaultRegion=true"\
        --packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.1,org.apache.hadoop:hadoop-aws:3.3.4,org.postgresql:postgresql:42.6.0,org.apache.spark:spark-avro_2.12:3.5.6\
        pyspark-shell'''

In [4]:
spark = (SparkSession
         .builder
         .master("local[4]")
         .config("spark.executor.memory", "6g")
         .config("spark.driver.memory", "6g")
         .config("spark.hadoop.fs.s3a.endpoint.region","eu-central-1")
         .appName("pyiceberg")
        ).getOrCreate()

:: loading settings :: url = jar:file:/opt/conda/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
org.postgresql#postgresql added as a dependency
org.apache.spark#spark-avro_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ba5466ea-dcdd-40d0-99cf-a82a409853fa;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.9.1 in central
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found org.postgresql#postgresql;42.6.0 in central
	found org.checkerframework#checker-qual;3.31.0 in central
	found org.apache.spark#spark-avro_2.12;3.5.6 in central
	found org.tukaani#xz;1.9 in central
:: resolution report :: resolve 191ms :: artifacts dl 7ms
	:: modules in use:
	

let's create a table from the json structure
- first explode the array 'records'
- then transform the struct field with select "col.*"
- then transform string fields to dates

In [5]:
df = spark.read.json("testdata/")

                                                                                

In [6]:
df.printSchema()

root
 |-- dataset: string (nullable = true)
 |-- limit: long (nullable = true)
 |-- records: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- HourDK: string (nullable = true)
 |    |    |-- HourUTC: string (nullable = true)
 |    |    |-- PriceArea: string (nullable = true)
 |    |    |-- SpotPriceDKK: double (nullable = true)
 |    |    |-- SpotPriceEUR: double (nullable = true)
 |-- sort: string (nullable = true)
 |-- total: long (nullable = true)



In [7]:
df2=df.select(F.explode("records"))
df2.printSchema()

root
 |-- col: struct (nullable = true)
 |    |-- HourDK: string (nullable = true)
 |    |-- HourUTC: string (nullable = true)
 |    |-- PriceArea: string (nullable = true)
 |    |-- SpotPriceDKK: double (nullable = true)
 |    |-- SpotPriceEUR: double (nullable = true)



In [8]:
df3=df2.select(F.col("col.*"))
df3.printSchema()

root
 |-- HourDK: string (nullable = true)
 |-- HourUTC: string (nullable = true)
 |-- PriceArea: string (nullable = true)
 |-- SpotPriceDKK: double (nullable = true)
 |-- SpotPriceEUR: double (nullable = true)



In [9]:
df4=df3.select(F.to_timestamp("HourDK").alias("HourDK"),
            F.to_timestamp("HourUTC").alias("HourUTC"),
           "PriceArea", "SpotPriceDKK","SpotPriceEUR")
df4.printSchema()

root
 |-- HourDK: timestamp (nullable = true)
 |-- HourUTC: timestamp (nullable = true)
 |-- PriceArea: string (nullable = true)
 |-- SpotPriceDKK: double (nullable = true)
 |-- SpotPriceEUR: double (nullable = true)



In [10]:
df4.describe().show(truncate=False)



+-------+---------+------------------+-----------------+
|summary|PriceArea|SpotPriceDKK      |SpotPriceEUR     |
+-------+---------+------------------+-----------------+
|count  |1794951  |1494238           |1494262          |
|mean   |NULL     |339.9898769339158 |45.64850028315928|
|stddev |NULL     |374.63314435438866|50.34498024112369|
|min    |DE       |-3723.469971      |-500.019989      |
|max    |SYSTEM   |18173.359375      |2436.629883      |
+-------+---------+------------------+-----------------+



                                                                                

In [11]:
df = (spark.read.json("testdata/")
        .select(F.explode("records"))
        .select(F.col("col.*"))
        .select(F.to_timestamp("HourDK").alias("HourDK"),
            F.to_timestamp("HourUTC").alias("HourUTC"),
           "PriceArea", "SpotPriceDKK","SpotPriceEUR")
     )

In [12]:
df.count()

                                                                                

1794951

In [13]:
df.printSchema()

root
 |-- HourDK: timestamp (nullable = true)
 |-- HourUTC: timestamp (nullable = true)
 |-- PriceArea: string (nullable = true)
 |-- SpotPriceDKK: double (nullable = true)
 |-- SpotPriceEUR: double (nullable = true)



In [14]:
# df5_3.printSchema()

perfect

now select iceberg catalog to work with and create db for the dataframe

In [16]:
spark.conf.set("spark.sql.catalog.iceberg_catalog", "org.apache.iceberg.spark.SparkCatalog")
spark.conf.set("spark.sql.catalog.iceberg_catalog.catalog-impl", "org.apache.iceberg.jdbc.JdbcCatalog") 
spark.conf.set("spark.sql.catalog.iceberg_catalog.uri", "jdbc:postgresql://postgres:5432/iceberg_catalog")
spark.conf.set("spark.sql.catalog.iceberg_catalog.jdbc.user", "iceberg") 
spark.conf.set("spark.sql.catalog.iceberg_catalog.jdbc.password", "icebergpassword") 
spark.conf.set("spark.sql.catalog.iceberg_catalog.warehouse", "s3a://iceberg-warehouse/")


In [17]:
spark.conf.set("spark.sql.catalog.iceberg_jdbc", "org.apache.iceberg.spark.SparkCatalog")
# spark.conf.set("spark.sql.catalog.iceberg_catalog.catalog-impl", "org.apache.iceberg.jdbc.JdbcCatalog") 
spark.conf.set("spark.sql.catalog.iceberg_jdbc.uri", "jdbc:postgresql://postgres:5432/iceberg_catalog")
spark.conf.set("spark.sql.catalog.iceberg_jdbc.jdbc.user", "iceberg") 
spark.conf.set("spark.sql.catalog.iceberg_jdbc.jdbc.password", "icebergpassword") 
spark.conf.set("spark.sql.catalog.iceberg_jdbc.warehouse", "s3a://iceberg-warehouse/")

In [23]:
spark.catalog.setCurrentCatalog('iceberg_jdbc')

In [20]:
spark.catalog.setCurrentCatalog("iceberg_catalog")

In [21]:
spark.catalog.listCatalogs()

[CatalogMetadata(name='iceberg_catalog', description=None),
 CatalogMetadata(name='iceberg_jdbc', description=None),
 CatalogMetadata(name='spark_catalog', description=None)]

In [24]:
spark.catalog.listDatabases()

[Database(name='spark_schema', catalog='iceberg_jdbc', description=None, locationUri='s3a://iceberg-warehouse/spark_schema')]

ok, let's do this on bigger data

and save it to minio using iceberg

In [25]:
(df4.writeTo(f"iceberg_jdbc.spark_schema.elspotprices")
    .using("iceberg")
    .partitionedBy(F.col("PriceArea"),F.months(F.col("HourUTC")))
).createOrReplace()

25/07/10 10:42:02 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
25/07/10 10:42:06 WARN S3ABlockOutputStream: Application invoked the Syncable API against stream writing to spark_schema/elspotprices/data/PriceArea=DK2/HourUTC_month=2021-11/00002-37-6a176615-9c39-4f8e-9270-486992c5f7a3-0-00007.parquet. This is unsupported
                                                                                

25/06/16 11:25:54 WARN Tasks: Retrying task after failure: sleepTimeMs=5160 Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
org.apache.iceberg.exceptions.RuntimeIOException: Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
	at org.apache.iceberg.hadoop.HadoopInputFile.newStream(HadoopInputFile.java:187)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:290)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:284)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$0(BaseMetastoreTableOperations.java:180)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$1(BaseMetastoreTableOperations.java:199)
	at org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:

25/06/16 11:25:59 WARN Tasks: Retrying task after failure: sleepTimeMs=5163 Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
org.apache.iceberg.exceptions.RuntimeIOException: Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
	at org.apache.iceberg.hadoop.HadoopInputFile.newStream(HadoopInputFile.java:187)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:290)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:284)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$0(BaseMetastoreTableOperations.java:180)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$1(BaseMetastoreTableOperations.java:199)
	at org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:

25/06/16 11:26:04 WARN Tasks: Retrying task after failure: sleepTimeMs=5489 Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
org.apache.iceberg.exceptions.RuntimeIOException: Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
	at org.apache.iceberg.hadoop.HadoopInputFile.newStream(HadoopInputFile.java:187)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:290)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:284)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$0(BaseMetastoreTableOperations.java:180)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$1(BaseMetastoreTableOperations.java:199)
	at org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:

25/06/16 11:26:09 WARN Tasks: Retrying task after failure: sleepTimeMs=5384 Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
org.apache.iceberg.exceptions.RuntimeIOException: Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
	at org.apache.iceberg.hadoop.HadoopInputFile.newStream(HadoopInputFile.java:187)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:290)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:284)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$0(BaseMetastoreTableOperations.java:180)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$1(BaseMetastoreTableOperations.java:199)
	at org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:

25/06/16 11:26:15 WARN Tasks: Retrying task after failure: sleepTimeMs=5437 Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
org.apache.iceberg.exceptions.RuntimeIOException: Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
	at org.apache.iceberg.hadoop.HadoopInputFile.newStream(HadoopInputFile.java:187)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:290)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:284)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$0(BaseMetastoreTableOperations.java:180)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$1(BaseMetastoreTableOperations.java:199)
	at org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:

25/06/16 11:26:20 WARN Tasks: Retrying task after failure: sleepTimeMs=5471 Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
org.apache.iceberg.exceptions.RuntimeIOException: Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
	at org.apache.iceberg.hadoop.HadoopInputFile.newStream(HadoopInputFile.java:187)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:290)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:284)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$0(BaseMetastoreTableOperations.java:180)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$1(BaseMetastoreTableOperations.java:199)
	at org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:

25/06/16 11:26:26 WARN Tasks: Retrying task after failure: sleepTimeMs=5272 Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
org.apache.iceberg.exceptions.RuntimeIOException: Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
	at org.apache.iceberg.hadoop.HadoopInputFile.newStream(HadoopInputFile.java:187)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:290)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:284)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$0(BaseMetastoreTableOperations.java:180)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$1(BaseMetastoreTableOperations.java:199)
	at org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:

25/06/16 11:26:31 WARN Tasks: Retrying task after failure: sleepTimeMs=5115 Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
org.apache.iceberg.exceptions.RuntimeIOException: Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
	at org.apache.iceberg.hadoop.HadoopInputFile.newStream(HadoopInputFile.java:187)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:290)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:284)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$0(BaseMetastoreTableOperations.java:180)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$1(BaseMetastoreTableOperations.java:199)
	at org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:

25/06/16 11:26:36 WARN Tasks: Retrying task after failure: sleepTimeMs=5179 Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
org.apache.iceberg.exceptions.RuntimeIOException: Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
	at org.apache.iceberg.hadoop.HadoopInputFile.newStream(HadoopInputFile.java:187)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:290)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:284)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$0(BaseMetastoreTableOperations.java:180)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$1(BaseMetastoreTableOperations.java:199)
	at org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:

25/06/16 11:26:41 WARN Tasks: Retrying task after failure: sleepTimeMs=5087 Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
org.apache.iceberg.exceptions.RuntimeIOException: Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
	at org.apache.iceberg.hadoop.HadoopInputFile.newStream(HadoopInputFile.java:187)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:290)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:284)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$0(BaseMetastoreTableOperations.java:180)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$1(BaseMetastoreTableOperations.java:199)
	at org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:

25/06/16 11:26:46 WARN Tasks: Retrying task after failure: sleepTimeMs=5457 Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
org.apache.iceberg.exceptions.RuntimeIOException: Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
	at org.apache.iceberg.hadoop.HadoopInputFile.newStream(HadoopInputFile.java:187)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:290)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:284)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$0(BaseMetastoreTableOperations.java:180)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$1(BaseMetastoreTableOperations.java:199)
	at org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:

25/06/16 11:26:52 WARN Tasks: Retrying task after failure: sleepTimeMs=5049 Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
org.apache.iceberg.exceptions.RuntimeIOException: Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
	at org.apache.iceberg.hadoop.HadoopInputFile.newStream(HadoopInputFile.java:187)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:290)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:284)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$0(BaseMetastoreTableOperations.java:180)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$1(BaseMetastoreTableOperations.java:199)
	at org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:

25/06/16 11:26:57 WARN Tasks: Retrying task after failure: sleepTimeMs=5351 Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
org.apache.iceberg.exceptions.RuntimeIOException: Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
	at org.apache.iceberg.hadoop.HadoopInputFile.newStream(HadoopInputFile.java:187)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:290)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:284)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$0(BaseMetastoreTableOperations.java:180)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$1(BaseMetastoreTableOperations.java:199)
	at org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:

25/06/16 11:27:02 WARN Tasks: Retrying task after failure: sleepTimeMs=5254 Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
org.apache.iceberg.exceptions.RuntimeIOException: Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b521a475-bd9e-40c1-bc3d-0930f611a5ed.metadata.json
	at org.apache.iceberg.hadoop.HadoopInputFile.newStream(HadoopInputFile.java:187)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:290)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:284)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$0(BaseMetastoreTableOperations.java:180)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$1(BaseMetastoreTableOperations.java:199)
	at org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.12/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/socket.py", line 720, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

in minio, the data was correctly saved in partitioned parquet,  
for example there is a file:  
`iceberg-warehouse/spark_schema/elspotprices/data/PriceArea=DE/HourUTC_month=2009-06/`

In [31]:
df4.createTempView("elspotprices2")

In [32]:
spark.sql("create table iceberg_jdbc.spark_schema.elspotprices2 as select * from elspotprices2 limit 0")


DataFrame[]

In [33]:
df4.repartition(1000).writeTo(f"iceberg_jdbc.spark_schema.elspotprices2").option("maxRecordsPerFile", 1000).using('iceberg').append()

                                                                                

In [35]:
spark.sql(f"""
    SELECT file_path, record_count, file_size_in_bytes
    FROM iceberg_jdbc.spark_schema.elspotprices2.files
""").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------+------------+------------------+
|file_path                                                                                                            |record_count|file_size_in_bytes|
+---------------------------------------------------------------------------------------------------------------------+------------+------------------+
|s3a://iceberg-warehouse/spark_schema/elspotprices2/data/00000-45-17b97615-6f6f-42a3-a390-5e413e27506f-0-00001.parquet|1795        |37490             |
|s3a://iceberg-warehouse/spark_schema/elspotprices2/data/00001-46-17b97615-6f6f-42a3-a390-5e413e27506f-0-00001.parquet|1795        |37300             |
|s3a://iceberg-warehouse/spark_schema/elspotprices2/data/00002-47-17b97615-6f6f-42a3-a390-5e413e27506f-0-00001.parquet|1795        |37084             |
|s3a://iceberg-warehouse/spark_schema/elspotprices2/data/00003-48-17b97615-6f6f-42a3-a39

In [36]:
result_df = spark.sql(f"""CALL iceberg_jdbc.system.rewrite_data_files(
                      table => 'spark_schema.elspotprices2', 
                      strategy => 'sort', sort_order => 'HourUTC ASC',
                      options => map('min-input-files','5'))""")


                                                                                

In [37]:
spark.sql(f"""SELECT file_path, record_count, file_size_in_bytes
FROM iceberg_jdbc.spark_schema.elspotprices2.files""").show(truncate=False)


+-----------------------------------------------------------------------------------------------------------------------+------------+------------------+
|file_path                                                                                                              |record_count|file_size_in_bytes|
+-----------------------------------------------------------------------------------------------------------------------+------------+------------------+
|s3a://iceberg-warehouse/spark_schema/elspotprices2/data/00000-1078-d3962320-457e-492e-bf3e-8d724c580184-0-00001.parquet|1794951     |11741513          |
+-----------------------------------------------------------------------------------------------------------------------+------------+------------------+



In [95]:
result_df.select('rewritten_bytes_count').show()

+---------------------+
|rewritten_bytes_count|
+---------------------+
|                    0|
+---------------------+



In [39]:
spark.sql("SELECT * FROM iceberg_jdbc.spark_schema.elspotprices2.snapshots ORDER BY committed_at DESC").show(truncate=False)

+-----------------------+-------------------+-------------------+---------+--------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|committed_at           |snapshot_id        |parent_id          |operation|manifest_list                                                                                                                   |summary                                                       

In [40]:
spark.sql("SELECT * FROM iceberg_jdbc.spark_schema.elspotprices2.snapshots ORDER BY committed_at DESC").toPandas()

Unnamed: 0,committed_at,snapshot_id,parent_id,operation,manifest_list,summary
0,2025-07-10 10:46:45.990,1369744969703132205,4.092833e+18,replace,s3a://iceberg-warehouse/spark_schema/elspotpri...,"{'engine-version': '3.5.6', 'added-data-files'..."
1,2025-07-10 10:44:37.174,4092832995318799649,2.327347e+18,append,s3a://iceberg-warehouse/spark_schema/elspotpri...,"{'engine-version': '3.5.6', 'added-data-files'..."
2,2025-07-10 10:44:18.269,2327347172594696498,,append,s3a://iceberg-warehouse/spark_schema/elspotpri...,"{'spark.app.id': 'local-1752144072633', 'chang..."


In [42]:
import pyarrow

In [43]:
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pyarrow.fs as fs 

In [44]:
from pyiceberg.catalog.sql import SqlCatalog
from pyiceberg.catalog import load_catalog

In [46]:
from pyarrow.fs import S3FileSystem

In [52]:
s3 = S3FileSystem(access_key="admin",secret_key="password",endpoint_override="http://minio:9000/")

In [48]:
import s3fs

In [49]:
s3fs_fs=s3fs.S3FileSystem(key="admin",secret="password",endpoint_url="http://minio:9000",)

In [116]:
[*s3fs_fs.walk("iceberg-warehouse")]

[('iceberg-warehouse',
  ['my_schema', 'prices', 'spark_schema', 'trino_schema'],
  ['Elspotprices1000000.json']),
 ('iceberg-warehouse/my_schema',
  ['employees-3f3b60bb88af4b809b5b55fc3c75f70b',
   'employees-9d451407ba3a42c1ad3716401a6d4179',
   'events-34012500edaf44e2ab4aa045bd9b1983',
   'events-d796bbaf923d4b7a9b239241bc0d9ca6'],
  []),
 ('iceberg-warehouse/my_schema/employees-3f3b60bb88af4b809b5b55fc3c75f70b',
  ['data', 'metadata'],
  []),
 ('iceberg-warehouse/my_schema/employees-3f3b60bb88af4b809b5b55fc3c75f70b/data',
  ['department=Engineering', 'department=HR', 'department=Sales'],
  []),
 ('iceberg-warehouse/my_schema/employees-3f3b60bb88af4b809b5b55fc3c75f70b/data/department=Engineering',
  [],
  ['20250624_214223_00005_6fr7p-c5456272-40b0-40ca-9eb3-663963f86384.parquet',
   '20250624_214224_00015_6fr7p-739512d3-447d-4e0e-9302-23236cf2f06a.parquet',
   '20250624_214224_00016_6fr7p-fcea724a-4ec3-42e2-934c-be4302c505e5.parquet',
   '20250624_214225_00018_6fr7p-31cb5047-f545

In [50]:
import pyarrow.dataset as ds

In [14]:
sc=spark.sparkContext
sc._jsc.hadoopConfiguration().set("fs.s3a.access.key", "admin")
sc._jsc.hadoopConfiguration().set("fs.s3a.secret.key", "password")
sc._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "http://minio:9000/")
sc._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")
sc._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

In [55]:
df=pd.read_parquet("iceberg-warehouse/spark_schema/elspotprices2/data/",
                   filesystem=s3fs_fs,engine='pyarrow')

In [58]:
s3 = S3FileSystem(access_key='admin',secret_key="password",region='eu-central-1',)

In [59]:
s3.region 

'eu-central-1'

#### PyIceberg Catalog Initialization

In [62]:
sql_user="iceberg"
sql_password="icebergpassword"

In [63]:
from pyiceberg.catalog import load_catalog

catalog = load_catalog(
    "iceberg_catalog",
    type="sql",
    uri=f"postgresql+psycopg2://{sql_user}:{sql_password}@postgres:5432/iceberg_catalog",
    s3_endpoint="http://minio:9000",
    s3_access_key_id="admin",
    s3_secret_access_key="password"
)

In [64]:
from pyiceberg.catalog import load_catalog

catalogicj = load_catalog(
    "iceberg_jdbc",
    type="sql",
    uri=f"postgresql+psycopg2://{sql_user}:{sql_password}@postgres:5432/iceberg_catalog",
    s3_endpoint="http://minio:9000",
    s3_access_key_id="admin",
    s3_secret_access_key="password",

)

In [65]:
catalog.list_namespaces()

[('trino_schema',)]

In [66]:
catalogicj.list_namespaces()

[('spark_schema',)]

In [67]:
import pyiceberg.table.inspect
from pyiceberg import table
import pyiceberg

In [None]:
## pyiceberg.table.inspect.ManifestContent()

In [68]:
S3_ROLE_ARN

NameError: name 'S3_ROLE_ARN' is not defined

In [69]:
catalog.list_tables("trino_schema")


[('trino_schema', 'employees'), ('trino_schema', 'events')]

In [70]:
catalogicj.list_tables("spark_schema")

[('spark_schema', 'elspotprices'),
 ('spark_schema', 'elspotprices2'),
 ('spark_schema', 'spark_orders')]

In [19]:
import pyarrow.parquet as pq

In [71]:
spark.conf.get("spark.sql.execution.arrow.pyspark.enabled")

'false'

In [72]:
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled","true")

In [73]:
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pyarrow as pa

In [74]:
pdf=df4.toPandas()

                                                                                

In [75]:
pyatbl=pyarrow.Table.from_pandas(pdf,preserve_index=False)

In [76]:
pyatbl.schema

HourDK: timestamp[us]
HourUTC: timestamp[us]
PriceArea: string
SpotPriceDKK: double
SpotPriceEUR: double
-- schema metadata --
pandas: '{"index_columns": [], "column_indexes": [], "columns": [{"name":' + 687

In [77]:
catalogicj.load_table("spark_schema.elspotprices2")

OSError: When reading information for key 'spark_schema/elspotprices2/metadata/00002-3ead18a1-b6a9-4751-bc6d-ac861439629f.metadata.json' in bucket 'iceberg-warehouse': AWS Error ACCESS_DENIED during HeadObject operation: No response body.

In [151]:
table=catalogicj.create_table("spark_schema.pyatbl",pyatbl.schema,location="s3://iceberg-warehouse")

OSError: When getting information for key 'metadata/00000-33986a06-74d0-4289-b104-d9f329c7a88c.metadata.json' in bucket 'iceberg-warehouse': AWS Error ACCESS_DENIED during HeadObject operation: No response body.

In [136]:
table

pyatbl(
  1: HourDK: optional timestamp,
  2: HourUTC: optional timestamp,
  3: PriceArea: optional string,
  4: SpotPriceDKK: optional double,
  5: SpotPriceEUR: optional double
),
partition by: [],
sort order: [],
snapshot: null

In [142]:
pyitbl=catalogicj.load_table("spark_schema.pyatbl",)

In [78]:
catalog.create_table("trino_schema.pyatbl",pyatbl.schema,location="iceberg-warehouse")

pyatbl(
  1: HourDK: optional timestamp,
  2: HourUTC: optional timestamp,
  3: PriceArea: optional string,
  4: SpotPriceDKK: optional double,
  5: SpotPriceEUR: optional double
),
partition by: [],
sort order: [],
snapshot: null

#### Create Namespace

In [6]:
catalog.properties

NameError: name 'catalog' is not defined

In [191]:
from pyiceberg.catalog.sql import SqlCatalog

catalog = SqlCatalog(
    "default",
    uri=f"sqlite:///pyiceberg_catalog.db",
    warehouse=f"file://",
)

catalog.create_namespace("default")

In [196]:
catalog.list_namespaces()

[('default',)]

In [201]:
catalog.create_table("default.table",pyatbl.schema,location="./")

TableAlreadyExistsError: Table default.table already exists

In [202]:
tbl=catalog.load_table("default.table")

[('default', 'table')]

In [113]:
catalog.load_table("prices.elspotprices")

NoSuchTableError: Table does not exist: prices.elspotprices

#### Create Table

#### Insert Data

#### Read Data

In [None]:
scan = table.scan().to_arrow()
df = scan.to_pandas()
df

In [25]:
import numpy as np
import pandas as pd
import pyarrow as pa

df = pd.DataFrame({'one': [-1, np.nan, 2.5],
                   'two': ['foo', 'bar', 'baz'],
                   'three': [True, False, True]},
                   index=list('abc'))


table = pa.Table.from_pandas(df)

In [27]:
catalog.create_table("trino_schema.tab",table.schema)

OSError: When getting information for key 'trino_schema/tab/metadata/00000-b3b5287d-5413-4c2f-bf41-3e6c77e2961f.metadata.json' in bucket 'iceberg-warehouse': AWS Error ACCESS_DENIED during HeadObject operation: No response body.

In [92]:
import os
from pyiceberg.catalog.sql import SqlCatalog
from pyiceberg.catalog import load_catalog
import pyarrow.parquet as pq
import pyarrow.compute as pc
import pyarrow as pa
from pyarrow import fs

In [80]:
# Access & Secret keys
pwd = 'password'
uid = 'admin'
s3location = "s3://iceberg-warehouse"
# Postgres creds
pswd = 'icebergpassword'
puid = 'iceberg'

In [81]:
catalog = SqlCatalog(
    "iceberg_jdbc",
    **{
        "uri": f"postgresql+psycopg2://{puid}:{pswd}@postgres:5432/iceberg_catalog",
        "warehouse": "s3://iceberg-warehouse",
        "s3.endpoint": "http://minio:9000",
        "py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO",
        "s3.access-key-id": uid,
        "s3.secret-access-key": pwd,
        "s3.region": 'eu-central-1',
    },
)

In [82]:
catalog.list_namespaces()

[('spark_schema',)]

In [83]:
catalog.list_tables("spark_schema")

[('spark_schema', 'elspotprices'),
 ('spark_schema', 'elspotprices2'),
 ('spark_schema', 'spark_orders')]

In [84]:
tbl=catalog.create_table("spark_schema.pyatbl",pyatbl.schema)

In [85]:
catalog.list_tables("spark_schema")

[('spark_schema', 'elspotprices'),
 ('spark_schema', 'elspotprices2'),
 ('spark_schema', 'pyatbl'),
 ('spark_schema', 'spark_orders')]

In [87]:
elspotprices=catalog.load_table("spark_schema.elspotprices")

In [88]:
pyaelspotp=elspotprices.scan()

In [89]:
pyaelspotp.count()



1794951

In [90]:
arrowelspot=elspotprices.scan().to_arrow()



In [96]:
arrowelspot

pyarrow.Table
HourDK: timestamp[us, tz=UTC]
HourUTC: timestamp[us, tz=UTC]
PriceArea: large_string
SpotPriceDKK: double
SpotPriceEUR: double
----
HourDK: [[2019-10-01 01:00:00.000000Z,2019-10-01 00:00:00.000000Z,2019-09-30 23:00:00.000000Z,2019-09-30 22:00:00.000000Z,2019-09-30 21:00:00.000000Z,...,2019-09-01 06:00:00.000000Z,2019-09-01 05:00:00.000000Z,2019-09-01 04:00:00.000000Z,2019-09-01 03:00:00.000000Z,2019-09-01 02:00:00.000000Z],[2025-05-01 01:00:00.000000Z,2025-05-01 00:00:00.000000Z,2025-04-30 23:00:00.000000Z,2025-04-30 22:00:00.000000Z,2025-04-30 21:00:00.000000Z,...,2025-04-01 06:00:00.000000Z,2025-04-01 05:00:00.000000Z,2025-04-01 04:00:00.000000Z,2025-04-01 03:00:00.000000Z,2025-04-01 02:00:00.000000Z],...,[2000-01-01 00:00:00.000000Z,1999-12-31 23:00:00.000000Z,1999-12-31 22:00:00.000000Z,1999-12-31 21:00:00.000000Z,1999-12-31 20:00:00.000000Z,...,1999-12-01 05:00:00.000000Z,1999-12-01 04:00:00.000000Z,1999-12-01 03:00:00.000000Z,1999-12-01 02:00:00.000000Z,1999-12-01 0

In [118]:
# install tzdata: !apt update && apt install -y tzdata

In [119]:
ts_dk = arrowelspot.column("HourDK")
ts_utc = arrowelspot.column("HourUTC")
ts_utc=pc.local_timestamp(ts_utc)
ts_dk=pc.local_timestamp(ts_dk)

arrowelspot=(arrowelspot.set_column(arrowelspot.schema.get_field_index("HourDK"),
                                    "HourDK",
                                    ts_dk)
                        .set_column(arrowelspot.schema.get_field_index("HourUTC"),
                                    "HourUTC",
                                    ts_utc))                 

In [120]:
tbl.append(arrowelspot)

In [121]:
tbl.history()

[SnapshotLogEntry(snapshot_id=7094546992959003802, timestamp_ms=1752145954971)]

In [122]:
tbl.snapshots()

[Snapshot(snapshot_id=7094546992959003802, parent_snapshot_id=None, sequence_number=1, timestamp_ms=1752145954971, manifest_list='s3://iceberg-warehouse/spark_schema.db/pyatbl/metadata/snap-7094546992959003802-0-215d66c4-fcff-46f1-81a8-0063c81f9d0d.avro', summary=Summary(Operation.APPEND, **{'added-files-size': '20509271', 'added-data-files': '1', 'added-records': '1794951', 'total-data-files': '1', 'total-delete-files': '0', 'total-records': '1794951', 'total-files-size': '20509271', 'total-position-deletes': '0', 'total-equality-deletes': '0'}), schema_id=0)]

In [123]:
tbl.append(pyatbl)

In [124]:
tbl.history()

[SnapshotLogEntry(snapshot_id=7094546992959003802, timestamp_ms=1752145954971),
 SnapshotLogEntry(snapshot_id=4797553530419552027, timestamp_ms=1752145967089)]

In [125]:
pdtbl=tbl.scan().to_pandas()

In [126]:
pdtbl

Unnamed: 0,HourDK,HourUTC,PriceArea,SpotPriceDKK,SpotPriceEUR
0,2023-10-31 20:00:00,2023-10-31 19:00:00,SYSTEM,482.980011,64.709999
1,2023-10-31 20:00:00,2023-10-31 19:00:00,SE3,147.559998,19.770000
2,2023-10-31 20:00:00,2023-10-31 19:00:00,DK2,939.530029,125.879997
3,2023-10-31 20:00:00,2023-10-31 19:00:00,DK1,939.530029,125.879997
4,2023-10-31 20:00:00,2023-10-31 19:00:00,SE4,147.559998,19.770000
...,...,...,...,...,...
3589897,2000-02-01 05:00:00,2000-02-01 04:00:00,DE,,
3589898,2000-02-01 04:00:00,2000-02-01 03:00:00,DE,,
3589899,2000-02-01 03:00:00,2000-02-01 02:00:00,DE,,
3589900,2000-02-01 02:00:00,2000-02-01 01:00:00,DE,,


In [127]:
tbl.snapshots()

[Snapshot(snapshot_id=7094546992959003802, parent_snapshot_id=None, sequence_number=1, timestamp_ms=1752145954971, manifest_list='s3://iceberg-warehouse/spark_schema.db/pyatbl/metadata/snap-7094546992959003802-0-215d66c4-fcff-46f1-81a8-0063c81f9d0d.avro', summary=Summary(Operation.APPEND, **{'added-files-size': '20509271', 'added-data-files': '1', 'added-records': '1794951', 'total-data-files': '1', 'total-delete-files': '0', 'total-records': '1794951', 'total-files-size': '20509271', 'total-position-deletes': '0', 'total-equality-deletes': '0'}), schema_id=0),
 Snapshot(snapshot_id=4797553530419552027, parent_snapshot_id=7094546992959003802, sequence_number=2, timestamp_ms=1752145967089, manifest_list='s3://iceberg-warehouse/spark_schema.db/pyatbl/metadata/snap-4797553530419552027-0-bc54d0a5-bed3-4c28-ae10-5af8359ffac6.avro', summary=Summary(Operation.APPEND, **{'added-files-size': '12888999', 'added-data-files': '1', 'added-records': '1794951', 'total-data-files': '2', 'total-delete-