In [None]:
# One-time environment setup: uncomment the line below to install the packages
# listed in requirements.txt before running the rest of the notebook.
#%pip install -r requirements.txt

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_date, rand, floor, expr

# Versions that together select the Iceberg Spark runtime artifact to download.
spark_version = "3.5"
scala_version = "2.12"
iceberg_version = "1.7.0"

# Name under which the Iceberg catalog is registered in Spark, and the local
# directory its Hadoop-type warehouse writes to.
catalog_name = "iceberg"
warehouse_path = "./icehouse"

# Maven coordinates resolved through spark.jars.packages when the session starts.
iceberg_runtime_package = (
    f"org.apache.iceberg:iceberg-spark-runtime-{spark_version}_{scala_version}:{iceberg_version}"
)

# Build (or reuse) a local Spark session with the Iceberg SQL extensions and a
# Hadoop-type Iceberg catalog registered as `catalog_name`.
spark = (
    SparkSession.builder
    .appName("feature_flag_stuff")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config(f"spark.sql.catalog.{catalog_name}", "org.apache.iceberg.spark.SparkCatalog")
    .config(f"spark.sql.catalog.{catalog_name}.type", "hadoop")
    .config(f"spark.sql.catalog.{catalog_name}.warehouse", warehouse_path)
    .config("spark.jars.packages", iceberg_runtime_package)
    # Keep the driver on the loopback interface for local runs.
    .config("spark.driver.bindAddress", "127.0.0.1")
    .config("spark.driver.host", "localhost")
    .getOrCreate()
)

namespace = "test_ns"
# Create the namespace inside the Iceberg catalog (the one every later
# statement addresses). IF NOT EXISTS makes this cell safe to re-run in the
# same session or against an existing warehouse directory.
spark.sql(f"create namespace if not exists {catalog_name}.{namespace}")

In [2]:
import yaml

# Read the feature flags from config.yaml in the working directory.
with open("config.yaml", encoding="utf-8") as f:
    # safe_load returns None for an empty document; normalise to an empty dict
    # so the lookups below do not fail on an empty config file.
    config = yaml.safe_load(f) or {}

# Feature flag controlling whether the last_upd_ts column is maintained.
# A missing `features` section or `last_upd_ts` key means the feature is off
# rather than an error.
include_last_upd_ts = (config.get("features") or {}).get("last_upd_ts", False)

In [None]:
# (Re)create the Iceberg stocks table and seed it with two rows (MSFT, SNOW).
# The statement is bound to a name first so it can be inspected before running.
create_stocks_sql = f"""
        create or replace table {catalog_name}.{namespace}.stocks
        using iceberg
        as
        select 'MSFT' as ticker_symbol, 23.99 as price
        union all
        select 'SNOW', 34.99 as price
"""
spark.sql(create_stocks_sql)

In [4]:
# Display the freshly seeded stocks table.
stocks_initial = spark.sql(f"select * from {catalog_name}.{namespace}.stocks")
stocks_initial.show()

+-------------+-----+
|ticker_symbol|price|
+-------------+-----+
|         MSFT|23.99|
|         SNOW|34.99|
+-------------+-----+



In [5]:
# Stage the incoming batch (a single MSFT row with a new price) and expose it
# to SQL as the temp view "data", which the MERGE statement reads from.
incoming_rows_sql = """
    select 'MSFT' as ticker_symbol, 21.45 as price
"""
df = spark.sql(incoming_rows_sql)
df.createOrReplaceTempView("data")

In [6]:
# Schema evolution behind the feature flag: add last_upd_ts to the stocks table
# only when the flag is on and the column is not already present.
if include_last_upd_ts:
    existing_columns = spark.table(f"{catalog_name}.{namespace}.stocks").columns
    column_missing = "last_upd_ts" not in existing_columns
    if column_missing:
        spark.sql(f"ALTER TABLE {catalog_name}.{namespace}.stocks add column last_upd_ts timestamp")

In [7]:
# Build and run the MERGE that upserts rows from the "data" temp view into the
# stocks table. When the last_upd_ts feature flag is on, each clause also
# maintains the last_upd_ts column.

# Column-level pieces are collected in lists so the flag only has to be checked
# once and the separators/parentheses cannot get out of sync.
update_assignments = ["tgt.price = src.price"]
insert_columns = ["ticker_symbol", "price"]
insert_values = ["src.ticker_symbol", "src.price"]
if include_last_upd_ts:
    update_assignments.append("tgt.last_upd_ts = CURRENT_TIMESTAMP")
    insert_columns.append("last_upd_ts")
    insert_values.append("CURRENT_TIMESTAMP")

update_predicate = f"WHEN MATCHED THEN UPDATE SET {', '.join(update_assignments)}"
insert_predicate = (
    f"WHEN NOT MATCHED THEN INSERT ({', '.join(insert_columns)}) "
    f"VALUES ({', '.join(insert_values)})"
)

merge_clauses = [update_predicate, insert_predicate]
if include_last_upd_ts:
    # Historical backfill: rows that are absent from the source and have never
    # been stamped get a timestamp once. The IS NULL guard keeps later merges
    # from re-stamping rows that were not actually changed.
    merge_clauses.append(
        "WHEN NOT MATCHED BY SOURCE AND tgt.last_upd_ts IS NULL "
        "THEN UPDATE SET tgt.last_upd_ts = CURRENT_TIMESTAMP"
    )

# Joined outside the f-string because f-string expressions cannot contain
# backslashes on older Python versions.
merge_clause_sql = "\n        ".join(merge_clauses)

sql_template = f"""
    MERGE INTO {catalog_name}.{namespace}.stocks as tgt
        USING data as src
            on tgt.ticker_symbol = src.ticker_symbol
        {merge_clause_sql}
"""

spark.sql(sql_template)

DataFrame[]

In [9]:
# Display the stocks table after the MERGE, without truncating long values
# such as the timestamp column.
stocks_after_merge = spark.sql(f"select * from {catalog_name}.{namespace}.stocks")
stocks_after_merge.show(truncate=False)

+-------------+-----+--------------------------+
|ticker_symbol|price|last_upd_ts               |
+-------------+-----+--------------------------+
|MSFT         |21.45|2025-05-07 08:42:41.972674|
|SNOW         |34.99|2025-05-07 08:42:41.972674|
+-------------+-----+--------------------------+

