In [0]:
"""
https://delta.io/blog/2023-02-08-delta-lake-schema-evolution/
https://github.com/delta-io/delta-examples/blob/master/notebooks/pyspark/schema-evolution.ipynb
"""

# Delta Lake Schema Evolution

In [0]:
import delta

df = spark.createDataFrame([("bob", 47), ("li", 23), ("leonard", 51)]).toDF("first_name", "age")
df.write.format("delta").save("/FileStore/tables/delta-examples/schema-evolution")

In [0]:
df = spark.createDataFrame([("frank", 68, "usa"), ("jordana", 26, "brasil")]).toDF("first_name", "age", "country")
df.write.format("delta").mode("append").save("/FileStore/tables/delta-examples/schema-evolution")

"""Delta Lake does not allow you to append data with mismatched schema by default. This feature is called schema enforcement. Read this blog post to learn more about Delta Lake schema enforcement https://delta.io/blog/2022-11-16-delta-lake-schema-enforcement/"""

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-2218377028571714>:2[0m
[1;32m      1[0m df [38;5;241m=[39m spark[38;5;241m.[39mcreateDataFrame([([38;5;124m"[39m[38;5;124mfrank[39m[38;5;124m"[39m, [38;5;241m68[39m, [38;5;124m"[39m[38;5;124musa[39m[38;5;124m"[39m), ([38;5;124m"[39m[38;5;124mjordana[39m[38;5;124m"[39m, [38;5;241m26[39m, [38;5;124m"[39m[38;5;124mbrasil[39m[38;5;124m"[39m)])[38;5;241m.[39mtoDF([38;5;124m"[39m[38;5;124mfirst_name[39m[38;5;124m"[39m, [38;5;124m"[39m[38;5;124mage[39m[38;5;124m"[39m, [38;5;124m"[39m[38;5;124mcountry[39m[38;5;124m"[39m)
[0;32m----> 2[0m df[38;5;241m.[39mwrite[38;5;241m.[39mformat([38;5;124m"[39m[38;5;124mdelta[39m[38;5;124m"[39m)[38;5;241m.[39mmode([38;5;124m"[39m[38;5;124mappend[39m[38;5;124m"[39m)[38;5;241m.[39msave([38;5;

# Delta Lake schema evolution with mergeSchema set to true


In [0]:
df.write.option("mergeschema", "true").mode("append").format("delta").save("/FileStore/tables/delta-examples/schema-evolution")

In [0]:
spark.read.format("delta").load("/FileStore/tables/delta-examples/schema-evolution").show()
"""
The “missing” data in the country column for the existing data is simply marked as null when new columns are added.
"""

+----------+---+-------+
|first_name|age|country|
+----------+---+-------+
|   jordana| 26| brasil|
|     frank| 68|    usa|
|   leonard| 51|   null|
|       bob| 47|   null|
|        li| 23|   null|
+----------+---+-------+



# Delta Lake schema evolution with autoMerge

In [0]:
"""
Enable at spark job level whereas mergeSchema is applicable only for the specified df write and not other df writes
"""
spark.conf.set("spark.sql.delta.schema.autoMerge.enabled", "true")

In [0]:
df = spark.createDataFrame([("dahiana",), ("sabrina",)]).toDF("first_name")
df.write.format("delta").mode("append").save("/FileStore/tables/delta-examples/schema-evolution")

In [0]:
spark.read.format("delta").load("/FileStore/tables/delta-examples/schema-evolution").show()

+----------+----+-------+
|first_name| age|country|
+----------+----+-------+
|   jordana|  26| brasil|
|     frank|  68|    usa|
|   leonard|  51|   null|
|       bob|  47|   null|
|        li|  23|   null|
|   dahiana|null|   null|
|   sabrina|null|   null|
+----------+----+-------+



In [0]:
"""
Enabling Schema evolution in delta lake is similar to drift column option in Azure Mapping data flow in setting Source and Sink Transformation

Disabling Schema evolution in delta lake is similar to Validate schema option in Azure Mapping data flow in setting Source and Sink Transformation
"""