In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

In [2]:
import boto3
# Make the "adfs" named profile the default boto3 session, so clients created
# later with boto3.client(...) use it.
boto3.setup_default_session(profile_name="adfs")
def active_iceberg_table_metadata(active_database_name, active_table_name):
    """Return the current and previous Iceberg metadata file names of a Glue table.

    Looks the table up in AWS Glue (using the default boto3 session) and reads
    the ``metadata_location`` and ``previous_metadata_location`` entries from
    the table's parameters.

    Args:
        active_database_name: Glue database that contains the table.
        active_table_name: Name of the table inside that database.

    Returns:
        A ``(current, previous)`` tuple holding only the last path segment of
        each location (for example ``"00007-<uuid>.metadata.json"``).
        ``previous`` is ``None`` when the table parameters carry no
        ``previous_metadata_location`` entry.

    Raises:
        KeyError: If the table parameters have no ``metadata_location`` entry.
    """
    glue = boto3.client("glue")
    table = glue.get_table(DatabaseName=active_database_name, Name=active_table_name)
    parameters = table["Table"].get("Parameters") or {}

    current_location = parameters.get("metadata_location")
    if not current_location:
        raise KeyError(
            f"metadata_location is not set on {active_database_name}.{active_table_name}"
        )

    # The previous pointer is not guaranteed to be present (presumably absent
    # until a second metadata version has been committed), so treat it as
    # optional instead of failing with a bare KeyError.
    previous_location = parameters.get("previous_metadata_location")

    def _file_name(location):
        # Keep only the object name after the final "/" of the full path.
        return location.rsplit("/", 1)[-1]

    previous_file = _file_name(previous_location) if previous_location else None
    return _file_name(current_location), previous_file

# Show the (current, previous) metadata file names for zero.bg1active.
print(
    active_iceberg_table_metadata(
        active_database_name="zero",
        active_table_name="bg1active",
    )
)

('00007-20a0f8eb-2f0e-4809-a960-99711ccb5bc5.metadata.json', '00006-ff1eefe4-94f3-4627-b2cf-4ad1adf357a6.metadata.json')


In [3]:
# Launch-time Spark configuration for Iceberg tables registered in AWS Glue.
sp_conf = SparkConf()

# spark_catalog is served by Iceberg's SparkSessionCatalog.
sp_conf.set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")

# Register `glue_catalog` itself as an Iceberg Spark catalog. Without this key
# the glue_catalog.* options below configure a catalog that Spark was never
# told how to instantiate.
sp_conf.set("spark.sql.catalog.glue_catalog", "org.apache.iceberg.spark.SparkCatalog")
sp_conf.set("spark.sql.catalog.glue_catalog.warehouse", "s3://mait-warehouse/")
sp_conf.set("spark.sql.catalog.glue_catalog.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog")
sp_conf.set("spark.sql.catalog.glue_catalog.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")

# Iceberg SQL extensions; a later cell calls a stored procedure via CALL.
# `set` returns the SparkConf, so the cell still displays its repr.
sp_conf.set("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")


<pyspark.conf.SparkConf at 0xffff800969b0>

In [4]:
# Create the session from sp_conf, or reuse one that is already running. When a
# session is reused Spark logs that only runtime SQL configurations take
# effect, which is why the catalog settings are repeated below via spark.conf.
spark = (
    SparkSession.builder
    .appName("Glue-Iceberg Integration")
    .config(conf=sp_conf)
    .getOrCreate()
)

# Runtime catalog settings for `glue_catalog`. The first key used to target
# spark_catalog, which contradicted the SparkSessionCatalog value placed in
# sp_conf and left glue_catalog without an implementation class.
spark.conf.set("spark.sql.catalog.glue_catalog", "org.apache.iceberg.spark.SparkCatalog")
spark.conf.set("spark.sql.catalog.glue_catalog.warehouse", "s3://mait-warehouse/")
spark.conf.set("spark.sql.catalog.glue_catalog.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog")
spark.conf.set("spark.sql.catalog.glue_catalog.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
spark.conf.set("spark.sql.defaultCatalog", "glue_catalog")

# S3 filesystem and credential settings. The provider class and the profile
# value now agree with the direct Hadoop configuration calls below (previously
# a non-matching class name was used, and the profile key was given a class
# name instead of the profile "adfs").
spark.conf.set("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
spark.conf.set("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.DefaultAWSCredentialsProviderChain")
spark.conf.set("spark.hadoop.fs.s3a.aws.profile", "adfs")

# Also write the same two values straight into the live Hadoop configuration.
# NOTE(review): `_jsc` is a private PySpark attribute - confirm it is acceptable here.
spark._jsc.hadoopConfiguration().set("fs.s3a.aws.profile", "adfs")
spark._jsc.hadoopConfiguration().set("fs.s3a.aws.credentials.provider", "com.amazonaws.auth.DefaultAWSCredentialsProviderChain")



25/07/28 00:23:14 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [5]:
# Reference DDL for glue_catalog.zero.bg1active, kept commented out.
# NOTE(review): presumably disabled because the table already exists (the next
# cell queries it) - confirm before deleting or re-enabling this cell.
# The table property sets a custom 'write.metadata.path' under metadata-east/.
# spark.sql("""
#     CREATE TABLE glue_catalog.zero.bg1active (
#         id INT,
#         name STRING
#     )
#     USING iceberg TBLPROPERTIES ('write.metadata.path' = 's3://mait-warehouse/zero.db/bg1active/metadata-east/');
# """)

In [6]:
# Print the row count of the active table, then of the passive table.
for count_query in (
    "select count(*) from glue_catalog.zero.bg1active",
    "select count(*) from glue_catalog.zero.bg1passive",
):
    spark.sql(count_query).show()

+--------+
|count(1)|
+--------+
|     700|
+--------+

+--------+
|count(1)|
+--------+
|     400|
+--------+



In [8]:
import random
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Destination table inside glue_catalog.
write_to_db = "zero"
write_to_table = "bg1active"

# 100 synthetic rows: ids 0..99, each paired with "name_" plus a random
# integer between 1000 and 9999 (inclusive).
data = [
    (row_id, f"name_{random.randint(1000, 9999)}")
    for row_id in range(100)
]

# Both columns are declared non-nullable: id (int) and name (string).
schema = StructType(
    [
        StructField("id", IntegerType(), nullable=False),
        StructField("name", StringType(), nullable=False),
    ]
)
df = spark.createDataFrame(data, schema=schema)

# Expose the rows to SQL under the name used by the INSERT below.
df.createOrReplaceTempView("temp_table1")

# Preview the generated rows (show() prints the first 20).
df.show()

# Append the rows to the destination table; as the last expression, the
# DataFrame returned by spark.sql is what the cell displays.
insert_sql = f"""
    INSERT INTO glue_catalog.{write_to_db}.{write_to_table}
    SELECT id, name FROM temp_table1
"""
spark.sql(insert_sql)

+---+---------+
| id|     name|
+---+---------+
|  0|name_7193|
|  1|name_2758|
|  2|name_2408|
|  3|name_8596|
|  4|name_8167|
|  5|name_5507|
|  6|name_1583|
|  7|name_9878|
|  8|name_4645|
|  9|name_2891|
| 10|name_5752|
| 11|name_8196|
| 12|name_1390|
| 13|name_2557|
| 14|name_7962|
| 15|name_1669|
| 16|name_7889|
| 17|name_9543|
| 18|name_1309|
| 19|name_2807|
+---+---------+
only showing top 20 rows



DataFrame[]

In [9]:
# Invoke the rewrite_table_path procedure for zero.bg1active, mapping the
# s3://mait-warehouse/ prefix to s3://mait-warehouse-replica/ and using the
# staging_west folder as the staging location. The returned DataFrame has the
# columns latest_version and file_list_location.
rewrite_table_path_sql = """
  CALL glue_catalog.system.rewrite_table_path(
    table => 'zero.bg1active',
    source_prefix => 's3://mait-warehouse/',
    target_prefix => 's3://mait-warehouse-replica/',
    staging_location => 's3a://mait-warehouse/zero.db/bg1active/staging_west'
  )
"""
spark.sql(rewrite_table_path_sql)

25/07/28 00:24:17 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
25/07/28 00:24:19 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
25/07/28 00:24:19 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
                                                                                

DataFrame[latest_version: string, file_list_location: string]