# Delta Lake Schema Evolution

In [1]:
import delta
import pyspark
from delta import configure_spark_with_delta_pip

In [2]:
# Build the session configuration step by step: name the app, register the
# Delta SQL session extension, and point `spark_catalog` at DeltaCatalog.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

In [3]:
# Pass the builder through the delta pip helper, then create (or reuse) the
# Spark session used by every cell below.
spark = (
    configure_spark_with_delta_pip(builder)
    .getOrCreate()
)

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-340-delta-240/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6c2c5712-45e1-473f-ac20-f44d05adae26;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 119ms :: artifacts dl 5ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default    

## Create Delta table

In [4]:
# Seed data for the Delta table: three people with a name and an age.
df = spark.createDataFrame(
    [
        ("bob", 47),
        ("li", 23),
        ("leonard", 51),
    ]
).toDF("first_name", "age")

In [5]:
# Preview the two-column (first_name, age) DataFrame before writing it out.
df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+----------+---+
|first_name|age|
+----------+---+
|       bob| 47|
|        li| 23|
|   leonard| 51|
+----------+---+



                                                                                

In [6]:
# Write the DataFrame out in Delta format at tmp/fun_people.
(
    df.write
    .format("delta")
    .save("tmp/fun_people")
)

23/08/09 14:46:38 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

## Can't append mismatched schema by default because of schema enforcement

In [7]:
# New rows that carry an extra `country` column not present in the table.
df = spark.createDataFrame(
    [
        ("frank", 68, "usa"),
        ("jordana", 26, "brasil"),
    ]
).toDF("first_name", "age", "country")

In [8]:
# Preview the three-column (first_name, age, country) DataFrame.
df.show()

+----------+---+-------+
|first_name|age|country|
+----------+---+-------+
|     frank| 68|    usa|
|   jordana| 26| brasil|
+----------+---+-------+



In [9]:
# Schema enforcement demo: `df` has a `country` column that the table written
# earlier lacks, so this plain append is expected to be rejected.
# The error is caught and printed so that "Run All" keeps going past this
# cell instead of halting on the (intentional) failure.
try:
    df.write.format("delta").mode("append").save("tmp/fun_people")
except pyspark.sql.utils.AnalysisException as error:
    print(f"AnalysisException: {error}")

AnalysisException: A schema mismatch detected when writing to the Delta table (Table ID: 0d4b3806-5c6e-46a2-8b74-a209eb1777fe).
To enable schema migration using DataFrameWriter or DataStreamWriter, please set:
'.option("mergeSchema", "true")'.
For other operations, set the session configuration
spark.databricks.delta.schema.autoMerge.enabled to "true". See the documentation
specific to the operation for details.

Table schema:
root
-- first_name: string (nullable = true)
-- age: long (nullable = true)


Data schema:
root
-- first_name: string (nullable = true)
-- age: long (nullable = true)
-- country: string (nullable = true)

         

## Enable schema evolution with mergeSchema set to true

In [10]:
# Same append as before, but with the writer-level mergeSchema option set.
(
    df.write
    .option("mergeSchema", "true")
    .mode("append")
    .format("delta")
    .save("tmp/fun_people")
)

In [11]:
# Read the Delta table back and display its current contents.
(
    spark.read
    .format("delta")
    .load("tmp/fun_people")
    .show()
)

+----------+---+-------+
|first_name|age|country|
+----------+---+-------+
|   jordana| 26| brasil|
|     frank| 68|    usa|
|   leonard| 51|   null|
|       bob| 47|   null|
|        li| 23|   null|
+----------+---+-------+



## Enable schema evolution with autoMerge

In [18]:
# Turn on Delta's autoMerge setting at the session level; it stays in effect
# for every later cell that uses this session.
spark.conf.set(
    "spark.databricks.delta.schema.autoMerge.enabled",
    "true",
)

In [19]:
# Rows with only a `first_name` column (fewer columns than the table).
df = spark.createDataFrame(
    [
        ("dahiana",),
        ("sabrina",),
    ]
).toDF("first_name")

In [20]:
# Append without any writer-level mergeSchema option.
(
    df.write
    .format("delta")
    .mode("append")
    .save("tmp/fun_people")
)

                                                                                

In [21]:
# Display the table after the single-column append.
(
    spark.read
    .format("delta")
    .load("tmp/fun_people")
    .show()
)

+----------+----+-------+
|first_name| age|country|
+----------+----+-------+
|   jordana|  26| brasil|
|     frank|  68|    usa|
|   leonard|  51|   null|
|       bob|  47|   null|
|        li|  23|   null|
|   sabrina|null|   null|
|   dahiana|null|   null|
+----------+----+-------+



## Very permissive schema evolution is allowed

In [26]:
# A frame produced by spark.range over [0, 3), unrelated to the people table.
df = spark.range(start=0, end=3)

In [25]:
# Preview the range DataFrame that is about to be appended.
df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
+---+



In [23]:
# Append the range frame to the people table, again with no writer options.
(
    df.write
    .format("delta")
    .mode("append")
    .save("tmp/fun_people")
)

                                                                                

In [24]:
# Display the table after appending the range frame.
(
    spark.read
    .format("delta")
    .load("tmp/fun_people")
    .show()
)

+----------+----+-------+----+
|first_name| age|country|  id|
+----------+----+-------+----+
|   jordana|  26| brasil|null|
|     frank|  68|    usa|null|
|   leonard|  51|   null|null|
|       bob|  47|   null|null|
|        li|  23|   null|null|
|   sabrina|null|   null|null|
|   dahiana|null|   null|null|
|      null|null|   null|   1|
|      null|null|   null|   2|
|      null|null|   null|   0|
+----------+----+-------+----+



## "Schema evolution" on Parquet data lakes

In [11]:
# Starting data for the plain-Parquet comparison: city and country pairs.
df = spark.createDataFrame(
    [
        ("delhi", "india"),
        ("baltimore", "usa"),
    ]
).toDF("city", "country")

In [12]:
# Append the city rows as Parquet files under tmp/some_cities.
(
    df.write
    .format("parquet")
    .mode("append")
    .save("tmp/some_cities")
)

In [13]:
# Read the Parquet directory back and display it.
(
    spark.read
    .format("parquet")
    .load("tmp/some_cities")
    .show()
)

+---------+-------+
|     city|country|
+---------+-------+
|baltimore|    usa|
|    delhi|  india|
+---------+-------+



In [14]:
# A frame produced by spark.range over [0, 3), to be appended to the
# Parquet directory next.
df = spark.range(start=0, end=3)

In [15]:
# Append the range frame to the same Parquet directory as the city rows.
(
    df.write
    .format("parquet")
    .mode("append")
    .save("tmp/some_cities")
)

In [16]:
# Plain read of the Parquet directory, without the mergeSchema read option.
(
    spark.read
    .format("parquet")
    .load("tmp/some_cities")
    .show()
)

+---------+-------+
|     city|country|
+---------+-------+
|baltimore|    usa|
|    delhi|  india|
|     null|   null|
|     null|   null|
|     null|   null|
+---------+-------+



In [17]:
# Read the same directory again, this time with mergeSchema enabled on the
# reader.
(
    spark.read
    .format("parquet")
    .option("mergeSchema", "true")
    .load("tmp/some_cities")
    .show()
)

+---------+-------+----+
|     city|country|  id|
+---------+-------+----+
|baltimore|    usa|null|
|    delhi|  india|null|
|     null|   null|   0|
|     null|   null|   1|
|     null|   null|   2|
+---------+-------+----+



In [18]:
# City rows that add a third column, `continent`.
df = spark.createDataFrame(
    [
        ("manila", "philippines", "asia"),
        ("toronto", "canada", "north america"),
    ]
).toDF("city", "country", "continent")

In [19]:
# Append the three-column city rows to the Parquet directory.
(
    df.write
    .format("parquet")
    .mode("append")
    .save("tmp/some_cities")
)

In [20]:
# Plain read after the third append, still without mergeSchema.
(
    spark.read
    .format("parquet")
    .load("tmp/some_cities")
    .show()
)

+---------+-----------+
|     city|    country|
+---------+-----------+
|  toronto|     canada|
|   manila|philippines|
|baltimore|        usa|
|    delhi|      india|
|     null|       null|
|     null|       null|
|     null|       null|
+---------+-----------+



In [23]:
# Selecting `id` from a plain (non-mergeSchema) read is expected to fail,
# because that read only exposes the city and country columns.
# The error is caught and printed so that "Run All" continues past this
# intentional failure.
try:
    spark.read.format("parquet").load("tmp/some_cities").select("id").show()
except pyspark.sql.utils.AnalysisException as error:
    print(f"AnalysisException: {error}")

AnalysisException: Column 'id' does not exist. Did you mean one of the following? [city, country];
'Project ['id]
+- Relation [city#1471,country#1472] parquet


In [21]:
# Read with mergeSchema enabled on the reader and display the result.
(
    spark.read
    .format("parquet")
    .option("mergeSchema", "true")
    .load("tmp/some_cities")
    .show()
)

+---------+-----------+----+-------------+
|     city|    country|  id|    continent|
+---------+-----------+----+-------------+
|  toronto|     canada|null|north america|
|   manila|philippines|null|         asia|
|baltimore|        usa|null|         null|
|    delhi|      india|null|         null|
|     null|       null|   0|         null|
|     null|       null|   1|         null|
|     null|       null|   2|         null|
+---------+-----------+----+-------------+



## Schema evolution for changing column types

In [24]:
# Letter/number pairs where every number is a Python int.
df = spark.createDataFrame(
    [
        ("a", 1),
        ("b", 2),
        ("c", 3),
    ]
).toDF("letter", "number")

In [26]:
# Show the inferred column types for the integer-valued frame.
df.printSchema()

root
 |-- letter: string (nullable = true)
 |-- number: long (nullable = true)



In [25]:
# Write the integer-valued frame out in Delta format at tmp/silly_data.
(
    df.write
    .format("delta")
    .save("tmp/silly_data")
)

                                                                                

In [27]:
# Same column names as before, but the numbers are now Python floats.
df = spark.createDataFrame(
    [
        ("d", 4.4),
        ("e", 5.5),
    ]
).toDF("letter", "number")

In [28]:
# Show the inferred column types for the float-valued frame.
df.printSchema()

root
 |-- letter: string (nullable = true)
 |-- number: double (nullable = true)



In [29]:
# Type-change demo: `number` is a double in this frame but was written to
# tmp/silly_data as a long, so the append is expected to be rejected.
# The error is caught and printed so that "Run All" continues past this
# intentional failure.
try:
    df.write.mode("append").format("delta").save("tmp/silly_data")
except pyspark.sql.utils.AnalysisException as error:
    print(f"AnalysisException: {error}")

AnalysisException: Failed to merge fields 'number' and 'number'. Failed to merge incompatible data types LongType and DoubleType

In [30]:
# Retry the long -> double append with mergeSchema enabled. This is still
# expected to be rejected, since the two column types are incompatible.
# The error is caught and printed so the notebook finishes cleanly under
# "Run All" instead of ending on an uncaught exception.
try:
    (
        df.write
        .option("mergeSchema", "true")
        .mode("append")
        .format("delta")
        .save("tmp/silly_data")
    )
except pyspark.sql.utils.AnalysisException as error:
    print(f"AnalysisException: {error}")

AnalysisException: Failed to merge fields 'number' and 'number'. Failed to merge incompatible data types LongType and DoubleType