In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

In [3]:
# Spark properties that register an Iceberg catalog named `glue_catalog`:
# Iceberg's SparkCatalog, backed by the Glue catalog implementation, using
# S3FileIO and the `iceberg-wh-west` warehouse, plus the Iceberg SQL extensions.
iceberg_settings = {
    "spark.sql.catalog.glue_catalog": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.glue_catalog.warehouse": "s3://iceberg-wh-west/",
    "spark.sql.catalog.glue_catalog.catalog-impl": "org.apache.iceberg.aws.glue.GlueCatalog",
    "spark.sql.catalog.glue_catalog.io-impl": "org.apache.iceberg.aws.s3.S3FileIO",
    "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
}

sp_conf = SparkConf()
for setting_name, setting_value in iceberg_settings.items():
    sp_conf.set(setting_name, setting_value)

# Keep the configuration object as the cell's displayed value.
sp_conf

<pyspark.conf.SparkConf at 0xffff9cd2e140>

In [4]:
# Build (or reuse) the Spark session with the Iceberg/Glue configuration.
# NOTE: getOrCreate() hands back an already-running session when one exists;
# Spark then warns that only runtime SQL configurations take effect.
spark = (
    SparkSession.builder
    .appName("Glue-Iceberg-Integration")
    .config(conf=sp_conf)
    .getOrCreate()
)

25/07/31 19:15:49 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [5]:
# Ensure the `berg` database exists in the Glue-backed catalog.
create_database_sql = """
    CREATE DATABASE IF NOT EXISTS glue_catalog.berg 
"""
spark.sql(create_database_sql)

DataFrame[]

In [15]:
# Register an existing Iceberg table in the catalog as berg.icetable1 by
# pointing the register_table procedure at one of its metadata JSON files.
register_table_sql = """
  CALL glue_catalog.system.register_table(
    table => 'berg.icetable1',
    metadata_file => 's3://iceberg-wh-west/berg.db/icetable1/metadata/00002-16fa8b62-7251-4eed-8e6c-c97d2e2594fa.metadata.json'
  )
"""
spark.sql(register_table_sql)

DataFrame[current_snapshot_id: bigint, total_records_count: bigint, total_data_files_count: bigint]

In [8]:
# Row count of the table right after registration.
row_count_df = spark.sql("select count(*) from glue_catalog.berg.icetable1")
row_count_df.show()

+--------+
|count(1)|
+--------+
|     200|
+--------+



In [22]:
import random
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

write_to_db = "berg"
write_to_table = "icetable1"

# A dedicated, seeded generator keeps the sample data identical across
# "Restart & Run All" without touching the global `random` state.
SAMPLE_SEED = 42
SAMPLE_ROW_COUNT = 100
sample_rng = random.Random(SAMPLE_SEED)

# Step 1: Build the sample rows: (sequential id, "name_<4-digit number>").
data = [(i, f"name_{sample_rng.randint(1000, 9999)}") for i in range(SAMPLE_ROW_COUNT)]

# Step 2: Create DataFrame with schema id(int), name(string), both non-nullable.
schema = StructType([
    StructField("id", IntegerType(), False),
    StructField("name", StringType(), False)
])
df = spark.createDataFrame(data, schema)

# Step 3: Expose the DataFrame to Spark SQL and preview it.
df.createOrReplaceTempView("temp_table1")

df.show()

# Step 4: Append the rows to the Iceberg table.
# NOTE: this is an append, so every re-run of the cell adds the rows again.
spark.sql(f"""
    INSERT INTO glue_catalog.{write_to_db}.{write_to_table}
    SELECT id, name FROM temp_table1
""")

+---+---------+
| id|     name|
+---+---------+
|  0|name_2035|
|  1|name_6140|
|  2|name_5277|
|  3|name_4601|
|  4|name_1646|
|  5|name_3085|
|  6|name_3107|
|  7|name_9353|
|  8|name_1659|
|  9|name_4114|
| 10|name_8145|
| 11|name_6816|
| 12|name_2026|
| 13|name_5889|
| 14|name_9802|
| 15|name_3176|
| 16|name_2154|
| 17|name_6764|
| 18|name_3404|
| 19|name_3525|
+---+---------+
only showing top 20 rows



                                                                                

DataFrame[]

In [39]:
# Row count again, this time after the INSERT.
row_count_after_insert_df = spark.sql("select count(*) from glue_catalog.berg.icetable1")
row_count_after_insert_df.show()

+--------+
|count(1)|
+--------+
|     300|
+--------+



In [24]:
# Call Iceberg's rewrite_table_path procedure for berg.icetable1, swapping the
# west-bucket prefix for the east-bucket prefix and staging the output under
# the table's `staging_east_metadata` folder.
rewrite_table_path_sql = """
  CALL glue_catalog.system.rewrite_table_path(
    table => 'berg.icetable1',
    source_prefix => 's3://iceberg-wh-west',
    target_prefix => 's3://iceberg-wh-east',
    staging_location => 's3a://iceberg-wh-west/berg.db/icetable1/staging_east_metadata'
  )
"""
spark.sql(rewrite_table_path_sql)

25/07/31 20:05:54 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
25/07/31 20:05:55 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
                                                                                

DataFrame[latest_version: string, file_list_location: string]

In [25]:
import boto3
def active_iceberg_table_metadata(active_database_name, active_table_name, region_name="us-west-2"):
    """Return the file name of the metadata file Glue currently records for a table.

    Looks the table up in the Glue Data Catalog and reads the
    ``metadata_location`` table parameter.

    Args:
        active_database_name: Glue database that holds the table.
        active_table_name: Glue table name.
        region_name: AWS region of the Glue catalog (defaults to ``us-west-2``).

    Returns:
        The last path component of ``metadata_location``, e.g.
        ``00003-<uuid>.metadata.json``.

    Raises:
        KeyError: if the table has no ``metadata_location`` parameter.
    """
    glue = boto3.client("glue", region_name=region_name)
    table = glue.get_table(DatabaseName=active_database_name, Name=active_table_name)
    parameters = table["Table"].get("Parameters") or {}
    if "metadata_location" not in parameters:
        # Still a KeyError, but one that names the table involved.
        raise KeyError(
            f"Glue table {active_database_name}.{active_table_name} has no "
            "'metadata_location' parameter"
        )
    # Only the file name is wanted, so split once from the right.
    return parameters["metadata_location"].rsplit("/", 1)[-1]

# Show which metadata file Glue currently records for berg.icetable1.
current_metadata_file = active_iceberg_table_metadata("berg", "icetable1")
print(current_metadata_file)

00003-19cbcc94-a339-4934-949d-888a57eeb46f.metadata.json


In [38]:
def update_iceberg_table_metadata(active_database_name, active_table_name, metadata, region_name="us-west-2"):
    """Point a Glue-registered Iceberg table at a different metadata file.

    Reads the table definition from the Glue Data Catalog, replaces its
    ``metadata_location`` parameter, prints the resulting table input and
    writes it back with ``update_table``.

    Args:
        active_database_name: Glue database that holds the table.
        active_table_name: Glue table name.
        metadata: Either a bare metadata file name (for example
            ``00003-<uuid>.metadata.json``), which is resolved against the
            directory of the table's current ``metadata_location``, or a full
            URI (anything containing ``://``), which is used as-is.
        region_name: AWS region of the Glue catalog (defaults to ``us-west-2``).

    Raises:
        ValueError: if ``metadata`` is a bare file name and the table has no
            current ``metadata_location`` to resolve it against.
    """
    # Fields accepted in TableInput. Building the input from an allow-list
    # means response-only fields returned by get_table (CreateTime, VersionId,
    # CatalogId, ...) never reach update_table, including ones not known here.
    # ViewDefinition is left out: presumably not relevant for Iceberg tables,
    # and the response and input shapes differ -- TODO confirm for views.
    table_input_keys = (
        "Name", "Description", "Owner", "LastAccessTime", "LastAnalyzedTime",
        "Retention", "StorageDescriptor", "PartitionKeys", "ViewOriginalText",
        "ViewExpandedText", "TableType", "Parameters", "TargetTable",
    )

    glue = boto3.client("glue", region_name=region_name)
    current_table = glue.get_table(DatabaseName=active_database_name, Name=active_table_name)["Table"]

    table_input = {key: current_table[key] for key in table_input_keys if key in current_table}
    parameters = dict(current_table.get("Parameters") or {})

    if "://" in metadata:
        new_metadata_location = metadata
    else:
        current_location = parameters.get("metadata_location")
        if not current_location:
            raise ValueError(
                f"Glue table {active_database_name}.{active_table_name} has no "
                "'metadata_location' to resolve a bare file name against; "
                "pass a full metadata URI instead"
            )
        # Keep the table's own metadata directory and swap only the file name,
        # so the function works for any database/table, not one fixed path.
        metadata_directory = current_location.rsplit("/", 1)[0]
        new_metadata_location = f"{metadata_directory}/{metadata}"

    parameters["metadata_location"] = new_metadata_location
    table_input["Parameters"] = parameters

    print(table_input)
    glue.update_table(
        DatabaseName=active_database_name,
        TableInput=table_input
    )
# Write the metadata pointer for berg.icetable1 back to Glue.
update_iceberg_table_metadata(
    active_database_name="berg",
    active_table_name="icetable1",
    metadata="00003-19cbcc94-a339-4934-949d-888a57eeb46f.metadata.json",
)

{'Name': 'icetable1', 'Retention': 0, 'StorageDescriptor': {'Columns': [{'Name': 'id', 'Type': 'int', 'Parameters': {'iceberg.field.current': 'true', 'iceberg.field.id': '1', 'iceberg.field.optional': 'true'}}, {'Name': 'name', 'Type': 'string', 'Parameters': {'iceberg.field.current': 'true', 'iceberg.field.id': '2', 'iceberg.field.optional': 'true'}}], 'Location': 's3://iceberg-wh-west/berg.db/icetable1', 'AdditionalLocations': [], 'Compressed': False, 'NumberOfBuckets': 0, 'SortColumns': [], 'StoredAsSubDirectories': False}, 'TableType': 'EXTERNAL_TABLE', 'Parameters': {'metadata_location': 's3://iceberg-wh-west/berg.db/icetable1/metadata/00003-19cbcc94-a339-4934-949d-888a57eeb46f.metadata.json', 'previous_metadata_location': 's3://iceberg-wh-west/berg.db/icetable1/metadata/00002-16fa8b62-7251-4eed-8e6c-c97d2e2594fa.metadata.json', 'table_type': 'ICEBERG'}}
