# Delta Lake Schema Enforcement

In [1]:
import pyspark
from delta import *

In [2]:
# Session builder wired up for Delta Lake: registers the Delta SQL extension and
# points spark_catalog at the DeltaCatalog implementation.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

In [3]:
# Let the delta helper finish configuring the builder, then start the session
# (or reuse one that is already running).
spark = (
    configure_spark_with_delta_pip(builder)
    .getOrCreate()
)

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-330-delta-210/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-1cdf5d48-96c8-40d1-bdfb-c4512b323ec7;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.1.0 in central
	found io.delta#delta-storage;2.1.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 338ms :: artifacts dl 13ms
	:: modules in use:
	io.delta#delta-core_2.12;2.1.0 from central in [default]
	io.delta#delta-storage;2.1.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number|

22/11/15 06:17:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## Parquet tables don't have schema enforcement

In [4]:
# Initial rows for the plain Parquet example: (first_name, age).
data = [
    ("bob", 47),
    ("li", 23),
    ("leonard", 51),
]
columns = ["first_name", "age"]
rdd = spark.sparkContext.parallelize(data)
df = rdd.toDF(schema=columns)

                                                                                

In [5]:
# Use "overwrite" rather than the default "errorifexists" save mode: otherwise
# this cell fails on every re-run once tmp/parquet_table1 exists, and files
# appended by a previous run would linger in the directory.
df.write.mode("overwrite").format("parquet").save("tmp/parquet_table1")

                                                                                

In [6]:
# Rows with a different second column: favorite_color instead of age.
data = [
    ("sal", "red"),
    ("cat", "pink"),
]
columns = ["first_name", "favorite_color"]
rdd = spark.sparkContext.parallelize(data)
df = rdd.toDF(schema=columns)

In [7]:
# Append the current DataFrame to the existing Parquet directory.
df.write.save("tmp/parquet_table1", format="parquet", mode="append")

In [9]:
# Read the Parquet directory back with default reader options and print the rows.
spark.read.load("tmp/parquet_table1", format="parquet").show()

+----------+----+
|first_name| age|
+----------+----+
|   leonard|  51|
|       cat|null|
|       sal|null|
|       bob|  47|
|        li|  23|
+----------+----+



In [10]:
# Same read, but with the mergeSchema reader option switched on.
spark.read.load(
    "tmp/parquet_table1",
    format="parquet",
    mergeSchema="true",
).show()

+----------+----+--------------+
|first_name| age|favorite_color|
+----------+----+--------------+
|   leonard|  51|          null|
|       cat|null|          pink|
|       sal|null|           red|
|       bob|  47|          null|
|        li|  23|          null|
+----------+----+--------------+



## Parquet tables registered in the metastore have schema enforcement

In [4]:
# Initial rows for the metastore-table example: (first_name, age).
data = [
    ("bob", 47),
    ("li", 23),
    ("leonard", 51),
]
columns = ["first_name", "age"]
rdd = spark.sparkContext.parallelize(data)
df = rdd.toDF(schema=columns)

                                                                                

In [5]:
# Use "overwrite" so re-executing this cell in the same session replaces
# mystery_table instead of failing because the table already exists.
# NOTE(review): after a kernel restart a leftover warehouse directory for this
# table may still need to be removed manually -- confirm for your Spark setup.
df.write.mode("overwrite").format("parquet").saveAsTable("mystery_table")

                                                                                

In [6]:
# Print the contents of the table that was just registered.
spark.read.table("mystery_table").show()

+----------+---+
|first_name|age|
+----------+---+
|   leonard| 51|
|       bob| 47|
|        li| 23|
+----------+---+



In [7]:
# Rows whose columns do not match mystery_table: favorite_color replaces age.
data = [
    ("sal", "red"),
    ("cat", "pink"),
]
columns = ["first_name", "favorite_color"]
rdd = spark.sparkContext.parallelize(data)
df = rdd.toDF(schema=columns)

In [8]:
# This append is expected to be rejected: the DataFrame has no "age" column.
# The error is the point of the demonstration, so catch and print it instead
# of letting an uncaught exception stop "Restart & Run All".
try:
    df.write.mode("append").format("parquet").saveAsTable("mystery_table")
except pyspark.sql.utils.AnalysisException as error:
    print(f"AnalysisException: {error}")

AnalysisException: cannot resolve 'age' given input columns: [first_name, favorite_color]

In [9]:
# Print the table again to see what it contains after the append attempt.
spark.read.table("mystery_table").show()

+----------+---+
|first_name|age|
+----------+---+
|   leonard| 51|
|       bob| 47|
|        li| 23|
+----------+---+



In [11]:
# Retry the append with mergeSchema enabled; per the recorded output the
# Parquet metastore table still rejects it. Catch and print the error so an
# uncaught exception does not stop "Restart & Run All".
try:
    df.write.option("mergeSchema", "true").mode("append").format(
        "parquet"
    ).saveAsTable("mystery_table")
except pyspark.sql.utils.AnalysisException as error:
    print(f"AnalysisException: {error}")

AnalysisException: cannot resolve 'age' given input columns: [first_name, favorite_color]

## Delta Lake schema enforcement is built-in

In [11]:
# Initial rows for the Delta table example: (first_name, age).
data = [
    ("bob", 47),
    ("li", 23),
    ("leonard", 51),
]
columns = ["first_name", "age"]
rdd = spark.sparkContext.parallelize(data)
df = rdd.toDF(schema=columns)

In [12]:
# Use "overwrite" rather than the default "errorifexists" save mode so the cell
# can be re-run when tmp/delta_table1 already exists. overwriteSchema resets the
# table schema to this DataFrame's (first_name, age), so columns merged in by a
# previous run do not mask the schema-mismatch demonstration that follows.
(
    df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("tmp/delta_table1")
)

                                                                                

In [13]:
# Rows whose columns differ from the Delta table: favorite_color replaces age.
data = [
    ("sal", "red"),
    ("cat", "pink"),
]
columns = ["first_name", "favorite_color"]
rdd = spark.sparkContext.parallelize(data)
df = rdd.toDF(schema=columns)

In [14]:
# Delta Lake is expected to reject this append because the DataFrame's schema
# differs from the table's. The error is the point of the demonstration, so
# catch and print it instead of letting it stop "Restart & Run All".
try:
    df.write.mode("append").format("delta").save("tmp/delta_table1")
except pyspark.sql.utils.AnalysisException as error:
    print(f"AnalysisException: {error}")

AnalysisException: A schema mismatch detected when writing to the Delta table (Table ID: 1f0df7a5-dda6-494f-99bc-4732d455db0b).
To enable schema migration using DataFrameWriter or DataStreamWriter, please set:
'.option("mergeSchema", "true")'.
For other operations, set the session configuration
spark.databricks.delta.schema.autoMerge.enabled to "true". See the documentation
specific to the operation for details.

Table schema:
root
-- first_name: string (nullable = true)
-- age: long (nullable = true)


Data schema:
root
-- first_name: string (nullable = true)
-- favorite_color: string (nullable = true)

         

## Set mergeSchema to true

In [15]:
# Append again, this time opting in to schema merging for this single write.
df.write.save(
    "tmp/delta_table1",
    format="delta",
    mode="append",
    mergeSchema="true",
)

                                                                                

In [16]:
# Print the Delta table to inspect its schema after the merge.
spark.read.load("tmp/delta_table1", format="delta").show()

+----------+----+--------------+
|first_name| age|favorite_color|
+----------+----+--------------+
|   leonard|  51|          null|
|       cat|null|          pink|
|       sal|null|           red|
|       bob|  47|          null|
|        li|  23|          null|
+----------+----+--------------+



## Set autoMerge to true

In [17]:
# Turn on Delta schema auto-merge for the whole session rather than per write.
spark.conf.set(
    "spark.databricks.delta.schema.autoMerge.enabled",
    "true",
)

In [18]:
# Read the flag back to confirm it was set; displaying the bare `spark.conf`
# object only shows an opaque RuntimeConfig repr.
spark.conf.get("spark.databricks.delta.schema.autoMerge.enabled")

'true'

**Important:** Enabling `spark.databricks.delta.schema.autoMerge.enabled` does not remove the need to set `mergeSchema` when reading Parquet!

In [23]:
# Plain Parquet read with no reader options, for comparison with the merged read.
spark.read.load("tmp/parquet_table1", format="parquet").show()

+----------+----+
|first_name| age|
+----------+----+
|   leonard|  51|
|       cat|null|
|       sal|null|
|       bob|  47|
|        li|  23|
+----------+----+



In [24]:
# Parquet read with the mergeSchema reader option set explicitly.
spark.read.load(
    "tmp/parquet_table1",
    format="parquet",
    mergeSchema="true",
).show()

+----------+----+--------------+
|first_name| age|favorite_color|
+----------+----+--------------+
|   leonard|  51|          null|
|       cat|null|          pink|
|       sal|null|           red|
|       bob|  47|          null|
|        li|  23|          null|
+----------+----+--------------+



In [20]:
# Rows that introduce another new column: country.
data = [
    ("bill", "usa"),
    ("xi", "china"),
]
columns = ["first_name", "country"]
rdd = spark.sparkContext.parallelize(data)
df = rdd.toDF(schema=columns)

In [21]:
# Append to the Delta table without passing a per-write mergeSchema option.
df.write.save("tmp/delta_table1", format="delta", mode="append")

                                                                                

In [22]:
# Print the Delta table once more to inspect the resulting columns and rows.
spark.read.load("tmp/delta_table1", format="delta").show()

+----------+----+--------------+-------+
|first_name| age|favorite_color|country|
+----------+----+--------------+-------+
|   leonard|  51|          null|   null|
|       cat|null|          pink|   null|
|       sal|null|           red|   null|
|       bob|  47|          null|   null|
|      bill|null|          null|    usa|
|        xi|null|          null|  china|
|        li|  23|          null|   null|
+----------+----+--------------+-------+

