In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

In [3]:
# Spark settings that register an Iceberg catalog named "glue_catalog":
# the catalog implementation is AWS Glue, table files are accessed through
# S3FileIO, and the warehouse root is the east bucket. The Iceberg SQL
# extensions are enabled as well.
iceberg_settings = {
    "spark.sql.catalog.glue_catalog": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.glue_catalog.warehouse": "s3://iceberg-wh-east/",
    "spark.sql.catalog.glue_catalog.catalog-impl": "org.apache.iceberg.aws.glue.GlueCatalog",
    "spark.sql.catalog.glue_catalog.io-impl": "org.apache.iceberg.aws.s3.S3FileIO",
    "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
}

sp_conf = SparkConf()
for setting_key, setting_value in iceberg_settings.items():
    sp_conf.set(setting_key, setting_value)

# Display the populated configuration object as the cell result.
sp_conf

<pyspark.conf.SparkConf at 0xffffaab88c10>

In [4]:
# Build the Spark session from sp_conf. getOrCreate() hands back an existing
# session when one is already running; Spark then warns that only runtime SQL
# configurations take effect, so run this on a fresh kernel if every setting
# in sp_conf must be applied.
spark = (
    SparkSession.builder
    .appName("Glue-Iceberg-Integration")
    .config(conf=sp_conf)
    .getOrCreate()
)

25/07/31 19:00:44 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [5]:
# Create the "berg" database in glue_catalog; IF NOT EXISTS makes the
# statement safe to re-run.
create_database_sql = """
    CREATE DATABASE IF NOT EXISTS glue_catalog.berg 
"""
spark.sql(create_database_sql)

DataFrame[]

In [6]:
# Create the Iceberg table berg.icetable1 with columns (id INT, name STRING);
# IF NOT EXISTS makes the statement safe to re-run.
create_table_sql = """
    CREATE TABLE IF NOT EXISTS glue_catalog.berg.icetable1 (
        id INT,
        name STRING
    )
    USING iceberg
"""
spark.sql(create_table_sql)

DataFrame[]

In [7]:
# Show the current number of rows in the Iceberg table.
row_count_df = spark.sql("select count(*) from glue_catalog.berg.icetable1")
row_count_df.show()

+--------+
|count(1)|
+--------+
|     100|
+--------+



In [7]:
import random
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

write_to_db = "berg"
write_to_table = "icetable1"

# Fixed seed on a private generator so a Restart & Run All regenerates exactly
# the same sample rows, without touching the global `random` state.
SEED = 42
rng = random.Random(SEED)

# Step 1: Generate 100 rows of (id, "name_<4-digit number>").
data = [(i, f"name_{rng.randint(1000, 9999)}") for i in range(100)]

# Step 2: Create DataFrame with schema id(int), name(string), both non-nullable.
schema = StructType([
    StructField("id", IntegerType(), False),
    StructField("name", StringType(), False)
])
df = spark.createDataFrame(data, schema)

# Step 3: Expose the DataFrame to Spark SQL under the name temp_table1.
df.createOrReplaceTempView("temp_table1")

df.show()

# Step 4: Append the rows to the target Iceberg table.
# NOTE: INSERT INTO appends, so every re-run of this cell adds another 100 rows.
spark.sql(f"""
    INSERT INTO glue_catalog.{write_to_db}.{write_to_table}
    SELECT id, name FROM temp_table1
""")

+---+---------+
| id|     name|
+---+---------+
|  0|name_2257|
|  1|name_8457|
|  2|name_7685|
|  3|name_8362|
|  4|name_3968|
|  5|name_6643|
|  6|name_5634|
|  7|name_5136|
|  8|name_4861|
|  9|name_2365|
| 10|name_5435|
| 11|name_8013|
| 12|name_2825|
| 13|name_7162|
| 14|name_3668|
| 15|name_3766|
| 16|name_9029|
| 17|name_7494|
| 18|name_5732|
| 19|name_7118|
+---+---------+
only showing top 20 rows



                                                                                

DataFrame[]

In [7]:
# Show the number of rows in the Iceberg table after the insert.
row_count_after_df = spark.sql("select count(*) from glue_catalog.berg.icetable1")
row_count_after_df.show()

+--------+
|count(1)|
+--------+
|     100|
+--------+



In [8]:
# Invoke Iceberg's rewrite_table_path procedure for berg.icetable1, replacing
# the east-bucket prefix with the west-bucket prefix and staging the output
# under the given staging location. The returned DataFrame has the columns
# latest_version and file_list_location.
rewrite_table_path_sql = """
  CALL glue_catalog.system.rewrite_table_path(
    table => 'berg.icetable1',
    source_prefix => 's3://iceberg-wh-east',
    target_prefix => 's3://iceberg-wh-west',
    staging_location => 's3a://iceberg-wh-east/berg.db/icetable1/staging_west_metadata'
  )
"""
spark.sql(rewrite_table_path_sql)

25/07/31 18:44:31 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
25/07/31 18:44:34 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
25/07/31 18:44:36 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
                                                                                

DataFrame[latest_version: string, file_list_location: string]

In [9]:
import boto3
def active_iceberg_table_metadata(active_database_name, active_table_name, region_name='us-east-1'):
    """Return the current and previous Iceberg metadata file names of a Glue table.

    Looks the table up in the AWS Glue Data Catalog and reads the
    ``metadata_location`` and ``previous_metadata_location`` table parameters.

    Args:
        active_database_name: Glue database that holds the table.
        active_table_name: Glue table name.
        region_name: AWS region of the Glue catalog (defaults to 'us-east-1').

    Returns:
        A tuple ``(current_file, previous_file)`` holding only the final path
        component of each location. ``previous_file`` is ``None`` when the
        table has no ``previous_metadata_location`` parameter.

    Raises:
        KeyError: if the table has no ``metadata_location`` parameter.
    """
    glue = boto3.client("glue", region_name=region_name)
    table = glue.get_table(DatabaseName=active_database_name, Name=active_table_name)
    parameters = table["Table"].get("Parameters") or {}

    # Required: without it there is no current metadata file to report.
    full_path_metadata_location = parameters["metadata_location"]
    # Optional: may be absent, e.g. when only one metadata version exists.
    full_path_previous_metadata_location = parameters.get("previous_metadata_location")

    current_file = full_path_metadata_location.rsplit('/', 1)[-1]
    previous_file = (
        full_path_previous_metadata_location.rsplit('/', 1)[-1]
        if full_path_previous_metadata_location
        else None
    )
    return current_file, previous_file

# Print the metadata file names that Glue currently records for berg.icetable1.
print(active_iceberg_table_metadata("berg", "icetable1"))   

('00001-41339c6e-92d1-4458-86f6-e8ebc9d89da5.metadata.json', '00000-a965cd33-3b31-4f67-97df-7bf089f8e25f.metadata.json')
