## NEW VERSION


In [0]:
# ==== EDIT THESE ====
storage_account = "peterunitystorage"
container       = "peterunitycontainer"

# The SAS token is a credential and must not live in the notebook source.
# It is read from a Databricks secret scope instead; the scope/key names below
# are placeholders -- point them at wherever the token is actually stored.
# NOTE(review): a literal SAS token used to be embedded here; treat that token
# as leaked and rotate it.
sas_secret_scope = "adls-demo"        # EDIT: secret scope holding the SAS token
sas_secret_key   = "container-sas"    # EDIT: key of the SAS token in that scope
sas_token        = dbutils.secrets.get(scope=sas_secret_scope, key=sas_secret_key)

# Database name used by the CREATE DATABASE / USE statements below
# (Hive metastore when not on Unity Catalog; on UC, USE main.<schema> instead).
schema_name     = "demo_schema_migration"
table_name      = "bronze_customers_demo"

# Spark auth for SAS (Azure ADLS Gen2): fixed-token provider for this account
spark.conf.set(f"fs.azure.account.auth.type.{storage_account}.dfs.core.windows.net", "SAS")
spark.conf.set(f"fs.azure.sas.token.provider.type.{storage_account}.dfs.core.windows.net",
               "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider")
spark.conf.set(f"fs.azure.sas.fixed.token.{storage_account}.dfs.core.windows.net", sas_token)

# Paths: everything for this demo lives under one folder named after the table
base_path       = f"abfss://{container}@{storage_account}.dfs.core.windows.net/{table_name}"
raw_path        = f"{base_path}/raw"
schema_loc_path = f"{base_path}/_schemas"
delta_path      = f"{base_path}/delta"
chk1            = f"{base_path}/_chk_v1"
chk2            = f"{base_path}/_chk_v2"

# DB (Hive metastore for non-UC)
spark.sql(f"CREATE DATABASE IF NOT EXISTS {schema_name}")
spark.sql(f"USE {schema_name}")

# Clean up prior runs. Still best-effort (a failure does not stop the notebook),
# but the error is reported so a bad token or path is not silently hidden.
for p in [f"{base_path}/",]:
    try:
        dbutils.fs.rm(p, True)
    except Exception as e:
        print(f"Cleanup of {p} skipped: {str(e)[:300]}")


In [0]:
# Show what currently sits at the top level of the storage container.
container_root = f"abfss://{container}@{storage_account}.dfs.core.windows.net/"
root_entries = dbutils.fs.ls(container_root)
display(root_entries)

In [0]:
# Sample input for the demo, written as JSON-lines files under raw/v1 and raw/v2.

# v1: baseline records with id, name and a numeric age
v1_dir = f"{raw_path}/v1"
v1_payload = """
{"id":1,"name":"Alice","age":31}
{"id":2,"name":"Bob","age":28}
"""

# v2: adds an `email` field; the second record also carries a non-numeric age
# ("unknown") and an extra `country` field
v2_dir = f"{raw_path}/v2"
v2_payload = """
{"id":3,"name":"Chao","age":35,"email":"chao@example.com"}
{"id":4,"name":"Dana","age":"unknown","email":"dana@example.com","country":"PL"}
"""

# Third argument True = overwrite an existing file of the same name
dbutils.fs.mkdirs(v1_dir)
dbutils.fs.put(f"{v1_dir}/customers_1.json", v1_payload, True)

dbutils.fs.mkdirs(v2_dir)
dbutils.fs.put(f"{v2_dir}/customers_2.json", v2_payload, True)


In [0]:
from pyspark.sql.functions import input_file_name

# Ingest the v1 JSON files with Auto Loader (cloudFiles) and land them in the
# Delta location. Schema evolution mode "rescue" keeps the tracked schema fixed
# and routes anything that does not fit into the `_rescued_data` column.
stream_df_v1 = (spark.readStream
  .format("cloudFiles")
  .option("cloudFiles.format","json")
  .option("cloudFiles.schemaLocation", schema_loc_path)
  .option("cloudFiles.inferColumnTypes","true")
  .option("cloudFiles.schemaEvolutionMode","rescue")  # capture surprises
  .option("rescuedDataColumn","_rescued_data")
  .load(f"{raw_path}/v1")
  # Record which input file each row came from.
  # NOTE(review): newer runtimes recommend the `_metadata.file_path` column over
  # input_file_name() -- confirm against the target runtime before switching.
  .withColumn("_source_file", input_file_name())
)

# availableNow processes everything currently in the source and then stops,
# like the deprecated trigger(once=True) it replaces, so awaitTermination()
# returns once the backlog is written.
q1 = (stream_df_v1.writeStream
  .format("delta")
  .option("checkpointLocation", chk1)
  .option("mergeSchema","true")
  .outputMode("append")
  .trigger(availableNow=True)
  .start(delta_path))
q1.awaitTermination()

# Register the Delta location as a table in the current database, then show it.
spark.sql(f"CREATE TABLE IF NOT EXISTS {table_name} USING DELTA LOCATION '{delta_path}'")
display(spark.read.table(table_name).orderBy("id"))


In [0]:
# Ingest the v2 JSON files with the same Auto Loader settings and the same
# schema location as the v1 stream, appending to the same Delta location.
stream_df_v2 = (spark.readStream
  .format("cloudFiles")
  .option("cloudFiles.format","json")
  .option("cloudFiles.schemaLocation", schema_loc_path)
  .option("cloudFiles.inferColumnTypes","true")
  .option("cloudFiles.schemaEvolutionMode","rescue")
  .option("rescuedDataColumn","_rescued_data")
  .load(f"{raw_path}/v2")
  # Same lineage column the v1 stream writes, so v2 rows are not left with a
  # NULL `_source_file`. input_file_name is imported in the v1 stream cell.
  .withColumn("_source_file", input_file_name())
)

# Separate checkpoint (chk2) because this is a different stream from v1.
# availableNow replaces the deprecated trigger(once=True): process the current
# backlog, then stop.
q2 = (stream_df_v2.writeStream
  .format("delta")
  .option("checkpointLocation", chk2)
  .option("mergeSchema","true")
  .outputMode("append")
  .trigger(availableNow=True)
  .start(delta_path))
q2.awaitTermination()

display(spark.read.table(table_name).orderBy("id"))


In [0]:
from pyspark.sql import Row

# One row carrying a `vip` column that the earlier writes never produced.
df_newcol = spark.createDataFrame([Row(id=5, name="Eve", age=29, vip=True)])

# Demonstrate schema enforcement: append WITHOUT mergeSchema and show the error.
# The exception is caught and printed (truncated) so the notebook keeps running.
try:
    (df_newcol.write.format("delta").mode("append").save(delta_path))  # no mergeSchema
except Exception as e:
    print("Expected failure (schema enforcement):\n", str(e)[:500])
else:
    # The write went through, so the row is already in the table; say so loudly
    # because the following cell appends the same row again.
    print("WARNING: append without mergeSchema succeeded - schema enforcement did not "
          "reject the new column, and the row with id=5 has already been written.")


In [0]:
# Repeat the append of df_newcol, this time opting in to schema merging so the
# extra column is accepted by the Delta table.
evolving_writer = (
    df_newcol.write
    .format("delta")
    .mode("append")
    .option("mergeSchema","true")
)
evolving_writer.save(delta_path)

# Show the table after the append, ordered by id.
table_after_append = spark.read.table(table_name).orderBy("id")
display(table_after_append)


In [0]:
from pyspark.sql import Row

# Two single-row frames with different column sets: one has `age`, the other `email`.
rows_with_age = [Row(id=6, name="Fran", age=40)]
rows_with_email = [Row(id=7, name="Göran", email="g@example.com")]
df_a = spark.createDataFrame(rows_with_age)
df_b = spark.createDataFrame(rows_with_email)

# Align by column name; a column missing on one side is filled with NULL.
merged = df_a.unionByName(df_b, allowMissingColumns=True)

# Append the combined rows, allowing the table schema to pick up new columns.
merged_writer = merged.write.format("delta").mode("append").option("mergeSchema","true")
merged_writer.save(delta_path)

table_after_union = spark.read.table(table_name).orderBy("id")
display(table_after_union)


#### Normalize schema before union

In [0]:
from pyspark.sql.functions import expr, lit

# Batch-read both raw drops with JSON schema inference.
df_a = spark.read.json(f"{raw_path}/v1")  # v1 sample: id, name, age (numeric)
df_b = spark.read.json(f"{raw_path}/v2")  # v2 sample: id, name, age (one row is "unknown"), email, country

# 1) Bring both frames to compatible column types BEFORE the union.
# Shared safe cast for age: values that are not numeric become NULL.
age_as_long = expr("try_cast(age as long)")

# v1 has no `email`, so add it as a NULL string column; then normalise age.
df_a_norm = df_a.withColumn("email", lit(None).cast("string")).withColumn("age", age_as_long)

# v2 already carries `email`; the explicit cast only pins its type to string.
df_b_norm = df_b.withColumn("age", age_as_long).withColumn("email", expr("cast(email as string)"))

# 2) Union by column name; columns present on only one side are NULL-filled.
merged = df_a_norm.unionByName(df_b_norm, allowMissingColumns=True)

# 3) Replace the table contents with the combined result.
#    mode("overwrite") discards the rows written by the earlier cells;
#    mergeSchema lets columns not yet in the table be added.
combined_writer = (
    merged.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema","true")
)
combined_writer.save(delta_path)

display(spark.read.table(table_name).orderBy("id"))
