### Let's first create some data with pyspark api 
  - here we will use local session

download some data

In [1]:
!wget https://api.energidataservice.dk/dataset/Elspotprices?limit=5

--2025-05-16 10:29:03--  https://api.energidataservice.dk/dataset/Elspotprices?limit=5
Resolving api.energidataservice.dk (api.energidataservice.dk)... 135.236.141.38
connected. to api.energidataservice.dk (api.energidataservice.dk)|135.236.141.38|:443... 
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/json]
Saving to: ‘Elspotprices?limit=5’

Elspotprices?limit=     [ <=>                ]     735  --.-KB/s    in 0s      

2025-05-16 10:29:03 (53.4 MB/s) - ‘Elspotprices?limit=5’ saved [735]



In [2]:
!wget https://api.energidataservice.dk/dataset/Elspotprices?limit=1000000

--2025-05-16 10:29:03--  https://api.energidataservice.dk/dataset/Elspotprices?limit=1000000
Resolving api.energidataservice.dk (api.energidataservice.dk)... 135.236.141.38
connected. to api.energidataservice.dk (api.energidataservice.dk)|135.236.141.38|:443... 
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/json]
Saving to: ‘Elspotprices?limit=1000000’

Elspotprices?limit=     [   <=>              ] 127.47M  9.36MB/s    in 14s     

2025-05-16 10:29:20 (9.37 MB/s) - ‘Elspotprices?limit=1000000’ saved [133661102]



In [3]:
mv Elspotprices\?limit\=5 Elspotprices5.json

In [4]:
mv Elspotprices\?limit\=1000000 Elspotprices1000000.json


In [1]:
import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

#### create spark session
- as we can see, the dependencies (iceberg, aws, etc.) were correctly loaded to the session because of the config

In [2]:
spark = (SparkSession
         .builder
         .master("local[*]")
         .config("spark.executor.memory", "6g")
         .config("spark.driver.memory", "4g")
         .appName("pyiceberg")
        ).getOrCreate()

:: loading settings :: url = jar:file:/opt/conda/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
org.postgresql#postgresql added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-d162f26a-4483-4ffc-a1d6-ed5ddd3703ef;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.9.0 in central
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found org.postgresql#postgresql;42.6.0 in central
	found org.checkerframework#checker-qual;3.31.0 in central
downloading https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/1.9.0/iceberg-spark-runtime-3.5_2.12-1.9.0.jar ...
	[SUCCESSFUL ] org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.9.0!iceberg-spark

check the data on small sample 

In [3]:
df5 = spark.read.json("Elspotprices5.json")

In [4]:
df5.toPandas()

Unnamed: 0,dataset,limit,records,total
0,Elspotprices,5,"[(2025-05-16T23:00:00, 2025-05-16T21:00:00, DE...",1787175


In [5]:
df5.printSchema()

root
 |-- dataset: string (nullable = true)
 |-- limit: long (nullable = true)
 |-- records: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- HourDK: string (nullable = true)
 |    |    |-- HourUTC: string (nullable = true)
 |    |    |-- PriceArea: string (nullable = true)
 |    |    |-- SpotPriceDKK: double (nullable = true)
 |    |    |-- SpotPriceEUR: double (nullable = true)
 |-- total: long (nullable = true)



let's create a table from the json structure
- first explode the array 'records'
- then transform the struct field with select "col.*"
- then transform string fields to dates

In [6]:
#explode records:
df5.select(F.explode("records")).show(truncate=False)

+-----------------------------------------------------------------------+
|col                                                                    |
+-----------------------------------------------------------------------+
|{2025-05-16T23:00:00, 2025-05-16T21:00:00, DE, 761.527023, 102.080002} |
|{2025-05-16T23:00:00, 2025-05-16T21:00:00, DK1, 761.527023, 102.080002}|
|{2025-05-16T23:00:00, 2025-05-16T21:00:00, DK2, 685.657821, 91.910004} |
|{2025-05-16T23:00:00, 2025-05-16T21:00:00, NO2, 597.777791, 80.129997} |
|{2025-05-16T23:00:00, 2025-05-16T21:00:00, SE3, 232.680526, 31.190001} |
+-----------------------------------------------------------------------+



In [7]:
df5_2 = df5.select(F.explode("records")).select(F.col("col.*"))

In [8]:
df5_2.toPandas()

Unnamed: 0,HourDK,HourUTC,PriceArea,SpotPriceDKK,SpotPriceEUR
0,2025-05-16T23:00:00,2025-05-16T21:00:00,DE,761.527023,102.080002
1,2025-05-16T23:00:00,2025-05-16T21:00:00,DK1,761.527023,102.080002
2,2025-05-16T23:00:00,2025-05-16T21:00:00,DK2,685.657821,91.910004
3,2025-05-16T23:00:00,2025-05-16T21:00:00,NO2,597.777791,80.129997
4,2025-05-16T23:00:00,2025-05-16T21:00:00,SE3,232.680526,31.190001


In [9]:
df5_2.printSchema()

root
 |-- HourDK: string (nullable = true)
 |-- HourUTC: string (nullable = true)
 |-- PriceArea: string (nullable = true)
 |-- SpotPriceDKK: double (nullable = true)
 |-- SpotPriceEUR: double (nullable = true)



In [10]:
df5_3=df5_2.select(F.to_timestamp("HourDK").alias("HourDK"),
            F.to_timestamp("HourUTC").alias("HourUTC"),
           "PriceArea", "SpotPriceDKK","SpotPriceEUR")

In [11]:
df5_3.printSchema()

root
 |-- HourDK: timestamp (nullable = true)
 |-- HourUTC: timestamp (nullable = true)
 |-- PriceArea: string (nullable = true)
 |-- SpotPriceDKK: double (nullable = true)
 |-- SpotPriceEUR: double (nullable = true)



perfect

now select iceberg catalog to work with and create db for the dataframe

In [12]:
catalog_name = "iceberg_catalog"
db_name = "prices"

In [13]:
spark.conf.set("spark.sql.catalog.iceberg_catalog", "org.apache.iceberg.spark.SparkCatalog")
spark.conf.set("spark.sql.catalog.iceberg_catalog.catalog-impl", "org.apache.iceberg.jdbc.JdbcCatalog") 
spark.conf.set("spark.sql.catalog.iceberg_catalog.uri", "jdbc:postgresql://postgres_catalog:5432/iceberg_catalog")
spark.conf.set("spark.sql.catalog.iceberg_catalog.jdbc.user", "iceberg") 
spark.conf.set("spark.sql.catalog.iceberg_catalog.jdbc.password", "icebergpassword") 
spark.conf.set("spark.sql.catalog.iceberg_catalog.warehouse", "s3a://iceberg-warehouse/")

In [14]:
spark.catalog.setCurrentCatalog('iceberg_catalog')

ok, let's do this on bigger data

In [15]:
df = (spark.read.json("Elspotprices1000000.json")
        .select(F.explode("records"))
        .select(F.col("col.*"))
        .select(F.to_timestamp("HourDK").alias("HourDK"),
            F.to_timestamp("HourUTC").alias("HourUTC"),
           "PriceArea", "SpotPriceDKK","SpotPriceEUR")
     )

                                                                                

In [16]:
df.printSchema()

root
 |-- HourDK: timestamp (nullable = true)
 |-- HourUTC: timestamp (nullable = true)
 |-- PriceArea: string (nullable = true)
 |-- SpotPriceDKK: double (nullable = true)
 |-- SpotPriceEUR: double (nullable = true)



and save it to minio using iceberg

In [17]:
(df.writeTo(f"{catalog_name}.{db_name}.elspotprices1000000")
    .using("iceberg")
    .partitionedBy(F.col("PriceArea"),F.months(F.col("HourUTC")))
).createOrReplace()

25/05/16 15:07:17 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
25/05/16 15:07:17 WARN Tasks: Retrying task after failure: sleepTimeMs=104 Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b1799f4b-66cc-4d3b-83a5-4f8d3b598723.metadata.json
org.apache.iceberg.exceptions.RuntimeIOException: Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b1799f4b-66cc-4d3b-83a5-4f8d3b598723.metadata.json
	at org.apache.iceberg.hadoop.HadoopInputFile.newStream(HadoopInputFile.java:187)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:290)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:284)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$0(BaseMetastoreTableOperations.java:180)
	at org.apache.iceberg.BaseMetastoreTableOperations.lamb

Py4JJavaError: An error occurred while calling o119.createOrReplace.
: org.apache.iceberg.exceptions.RuntimeIOException: Failed to open input stream for file: s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b1799f4b-66cc-4d3b-83a5-4f8d3b598723.metadata.json
	at org.apache.iceberg.hadoop.HadoopInputFile.newStream(HadoopInputFile.java:187)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:290)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:284)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$0(BaseMetastoreTableOperations.java:180)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$1(BaseMetastoreTableOperations.java:199)
	at org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:413)
	at org.apache.iceberg.util.Tasks$Builder.runSingleThreaded(Tasks.java:219)
	at org.apache.iceberg.util.Tasks$Builder.run(Tasks.java:203)
	at org.apache.iceberg.util.Tasks$Builder.run(Tasks.java:196)
	at org.apache.iceberg.BaseMetastoreTableOperations.refreshFromMetadataLocation(BaseMetastoreTableOperations.java:199)
	at org.apache.iceberg.BaseMetastoreTableOperations.refreshFromMetadataLocation(BaseMetastoreTableOperations.java:176)
	at org.apache.iceberg.BaseMetastoreTableOperations.refreshFromMetadataLocation(BaseMetastoreTableOperations.java:167)
	at org.apache.iceberg.jdbc.JdbcTableOperations.doRefresh(JdbcTableOperations.java:100)
	at org.apache.iceberg.BaseMetastoreTableOperations.refresh(BaseMetastoreTableOperations.java:88)
	at org.apache.iceberg.BaseMetastoreTableOperations.current(BaseMetastoreTableOperations.java:71)
	at org.apache.iceberg.BaseMetastoreCatalog.loadTable(BaseMetastoreCatalog.java:49)
	at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.lambda$doComputeIfAbsent$14(BoundedLocalCache.java:2406)
	at java.base/java.util.concurrent.ConcurrentHashMap.compute(ConcurrentHashMap.java:1916)
	at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.doComputeIfAbsent(BoundedLocalCache.java:2404)
	at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.computeIfAbsent(BoundedLocalCache.java:2387)
	at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.LocalCache.computeIfAbsent(LocalCache.java:108)
	at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.LocalManualCache.get(LocalManualCache.java:62)
	at org.apache.iceberg.CachingCatalog.loadTable(CachingCatalog.java:147)
	at org.apache.iceberg.spark.SparkCatalog.load(SparkCatalog.java:844)
	at org.apache.iceberg.spark.SparkCatalog.loadTable(SparkCatalog.java:169)
	at org.apache.spark.sql.connector.catalog.TableCatalog.tableExists(TableCatalog.java:185)
	at org.apache.spark.sql.execution.datasources.v2.AtomicReplaceTableAsSelectExec.run(WriteToDataSourceV2Exec.scala:201)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:49)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriterV2.runCommand(DataFrameWriterV2.scala:201)
	at org.apache.spark.sql.DataFrameWriterV2.internalReplace(DataFrameWriterV2.scala:213)
	at org.apache.spark.sql.DataFrameWriterV2.createOrReplace(DataFrameWriterV2.scala:135)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: org.apache.hadoop.fs.s3a.AWSBadRequestException: getFileStatus on s3a://iceberg-warehouse/prices/elspotprices1000000/metadata/00000-b1799f4b-66cc-4d3b-83a5-4f8d3b598723.metadata.json: com.amazonaws.services.s3.model.AmazonS3Exception: Bad Request (Service: Amazon S3; Status Code: 400; Error Code: 400 Bad Request; Request ID: 18400A6FB31473CC; S3 Extended Request ID: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8; Proxy: null), S3 Extended Request ID: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8:400 Bad Request: Bad Request (Service: Amazon S3; Status Code: 400; Error Code: 400 Bad Request; Request ID: 18400A6FB31473CC; S3 Extended Request ID: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8; Proxy: null)
	at org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:249)
	at org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:175)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:3796)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.innerGetFileStatus(S3AFileSystem.java:3688)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.extractOrFetchSimpleFileStatus(S3AFileSystem.java:5401)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.open(S3AFileSystem.java:1465)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.open(S3AFileSystem.java:1441)
	at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:976)
	at org.apache.iceberg.hadoop.HadoopInputFile.newStream(HadoopInputFile.java:183)
	... 65 more
Caused by: com.amazonaws.services.s3.model.AmazonS3Exception: Bad Request (Service: Amazon S3; Status Code: 400; Error Code: 400 Bad Request; Request ID: 18400A6FB31473CC; S3 Extended Request ID: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8; Proxy: null), S3 Extended Request ID: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleErrorResponse(AmazonHttpClient.java:1879)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleServiceErrorResponse(AmazonHttpClient.java:1418)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1387)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1157)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:814)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:781)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:755)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:715)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:697)
	at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:561)
	at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:541)
	at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5456)
	at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5403)
	at com.amazonaws.services.s3.AmazonS3Client.getObjectMetadata(AmazonS3Client.java:1372)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$getObjectMetadata$10(S3AFileSystem.java:2545)
	at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:414)
	at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:377)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.getObjectMetadata(S3AFileSystem.java:2533)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.getObjectMetadata(S3AFileSystem.java:2513)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:3776)
	... 71 more


in minio, the data was correctly saved in partitioned parquet,  
for example there is a file:  
`iceberg-warehouse/prices/elspotprices1000000/data/PriceArea=DE/HourUTC_month=2009-06/00002-18-ab523e1e-0a1f-401e-b33a-175d41447f52-0-00024.parquet`

In [18]:
spark.sparkContext.uiWebUrl

'http://b24eb80a8450:4040'

In [26]:
import pyarrow

In [2]:
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pyarrow.fs as fs 

In [3]:
s3 = fs.S3FileSystem(scheme="http",access_key='admin',secret_key="password",region="eu-central-1",)

In [None]:
s3.from_uri("s3://iceberg-warehouse")

#### PyIceberg Catalog Initialization

In [32]:
sql_user="iceberg"
sql_password="icebergpassword"

In [90]:
from pyiceberg.catalog import load_catalog

catalog = load_catalog(
    "iceberg_catalog",
    type="sql",
    uri=f"postgresql+psycopg2://{sql_user}:{sql_password}@postgres_catalog:5432/iceberg_catalog",
    s3_endpoint="http://minio:9000",
    s3_access_key_id="admin",
    s3_secret_access_key="password"
)


In [88]:


catalog_name

'iceberg_catalog'

In [77]:
import pyiceberg

In [89]:
import pyiceberg.table.inspect

In [79]:
S3_ROLE_ARN

's3.role-arn'

In [80]:
catalog.create_namespace("newnamespace")


NamespaceAlreadyExistsError: Namespace newnamespace already exists

In [81]:
catalog.list_namespaces()

[('pyiceberg_demo',),
 ('my_schema',),
 ('spark_schema',),
 ('newnamespace',),
 ('pyiceberg_namespace',),
 ('prices',)]

In [82]:
import pyarrow.parquet as pq

In [83]:
df = pq.read_table("./00000-26-21d27c5c-0880-4122-9983-c7e4b5473b61-0-00001.parquet")

In [86]:
table=catalog.create_table("newnamespace.someparquet",df.schema)

OSError: When getting information for key 'newnamespace.db/someparquet/metadata/00000-44e57e0c-a51e-41a5-a7a8-30e2397da1b6.metadata.json' in bucket 'iceberg-warehouse': AWS Error ACCESS_DENIED during HeadObject operation: No response body.

In [85]:
table

NameError: name 'table' is not defined

In [50]:
catalog.load_table("prices.elspotprices")

OSError: When reading information for key 'prices/elspotprices/metadata/00003-26f1f3f6-b5e5-4357-8025-6909ea3e3eff.metadata.json' in bucket 'iceberg-warehouse': AWS Error ACCESS_DENIED during HeadObject operation: No response body.

#### Create Namespace

In [54]:
catalog.properties

{'type': 'sql',
 'uri': 'postgresql+psycopg2://iceberg:icebergpassword@postgres_catalog:5432/iceberg_catalog',
 'warehouse': 's3://iceberg-warehouse/',
 's3_endpoint': 'http://minio:9000',
 's3_access_key_id': 'admin',
 's3_secret_access_key': 'password',
 's3_path_style_access': True}

In [60]:
catalog.list_tables(namespace="prices")

[('prices', 'elspotprices'),
 ('prices', 'elspotprices1000000'),
 ('prices', 'elspotprices2')]

In [61]:
catalog.load_table("prices.elspotprices").

OSError: When reading information for key 'prices/elspotprices/metadata/00003-26f1f3f6-b5e5-4357-8025-6909ea3e3eff.metadata.json' in bucket 'iceberg-warehouse': AWS Error ACCESS_DENIED during HeadObject operation: No response body.

####Create Table

In [13]:
from pyiceberg.schema import Schema
from pyiceberg.types import IntegerType, StringType
from pyiceberg.partitioning import PartitionSpec

# schema = Schema(
#     fields=[
#         ("id", IntegerType(), False),
#         ("name", StringType(), False),
#         ("dept", StringType(), True)
#     ]
# )

In [15]:
# partition_spec = (PartitionSpec
#                   .builder_for(schema)
#                   .identity("dept")
#                   ).build()

In [17]:
table.schema

one: double
two: string
three: bool
__index_level_0__: string
-- schema metadata --
pandas: '{"index_columns": ["__index_level_0__"], "column_indexes": [{"na' + 650

In [None]:
# catalog.create_table(
#     identifier="pyiceberg_demo.people",
#     schema=schema,
#     partition_spec=partition_spec
# )

ValidationError: 3 validation errors for Schema
fields.0
  Input should be a valid dictionary or instance of NestedField [type=model_type, input_value=('id', IntegerType(), False), input_type=tuple]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type
fields.1
  Input should be a valid dictionary or instance of NestedField [type=model_type, input_value=('name', StringType(), False), input_type=tuple]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type
fields.2
  Input should be a valid dictionary or instance of NestedField [type=model_type, input_value=('dept', StringType(), True), input_type=tuple]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type

#### Insert Data

In [None]:
table = catalog.load_table("pyiceberg_demo.people")
table.append([
    {"id": 1, "name": "Alice", "dept": "Engineering"},
    {"id": 2, "name": "Bob", "dept": "HR"}
])

#### Read Data

In [None]:
scan = table.scan().to_arrow()
df = scan.to_pandas()
df

In [18]:
import numpy as np
import pandas as pd
import pyarrow as pa

df = pd.DataFrame({'one': [-1, np.nan, 2.5],
                   'two': ['foo', 'bar', 'baz'],
                   'three': [True, False, True]},
                   index=list('abc'))


table = pa.Table.from_pandas(df)

In [19]:
catalog.create_table("tab",table.schema)

NoSuchNamespaceError: Empty namespace identifier