# Prerequsites

## Create Catalog and Scehema

In [0]:
%sql
DROP SCHEMA  IF EXISTS quickstart_schema CASCADE;
CREATE SCHEMA quickstart_schema;

## Create Delta Table

In [0]:
%sql
CREATE TABLE IF NOT EXISTS quickstart_schema.users(
  id INT,
  name STRING,
  dob DATE,
  email STRING,
  gender STRING,
  country STRING,
  region STRING,
  city STRING,
  asset INT,
  marital_status STRING
);
DESCRIBE EXTENDED quickstart_schema.users;

col_name,data_type,comment
id,int,
name,string,
dob,date,
email,string,
gender,string,
country,string,
region,string,
city,string,
asset,int,
marital_status,string,


In [0]:
%sql
SELECT * from quickstart_schema.users;

id,name,dob,email,gender,country,region,city,asset,marital_status


## Load Data into Delta Table

In [0]:
spark.read.csv(
    path="dbfs:/FileStore/synechron/user_dataset/users_001.csv",
    header=True,
    inferSchema=True,
).write.mode("overwrite").saveAsTable("quickstart_schema.users")

# Read delta table
spark.read.table("quickstart_schema.users").limit(4).display()

id,name,dob,email,gender,country,region,city,asset,marital_status
1,Heather Gibbs,2024-10-31,heathergibbs6243@gmail.com,Female,United States,Virginia,Virginia Beach,734388,Married
2,Herrod Petersen,2024-02-19,herrodpetersen@yahoomail.com,Male,United States,Arizona,Phoenix,113506,Single
3,Ocean Workman,2024-10-10,oceanworkman2328@ymail.com,Male,United States,Tennessee,Clarksville,139985,Married
4,Xaviera Maxwell,2025-03-09,xavieramaxwell@gmail.com,Transgender,United States,Ohio,Cleveland,511409,Married


# Schema Enforcement - Source with more columns

> Schema enforcement, also known as schema validation, is a safeguard in Delta Lake that ensures data quality by rejecting writes to a table that do not match the table's schema.

**Reference**

https://www.databricks.com/blog/2019/09/24/diving-into-delta-lake-schema-enforcement-evolution.html



In [0]:
spark.read.csv(
    path="dbfs:/FileStore/synechron/user_dataset/users_006_new_column_education.csv",
    header=True,
    inferSchema=True,
).write.mode("append").saveAsTable("quickstart_schema.users")
spark.read.table("quickstart_schema.users").limit(4).display()

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-2692471806036561>:1[0m
[0;32m----> 1[0m spark[38;5;241m.[39mread[38;5;241m.[39mcsv(
[1;32m      2[0m     path[38;5;241m=[39m[38;5;124m"[39m[38;5;124mdbfs:/FileStore/synechron/user_dataset/users_006_new_column_education.csv[39m[38;5;124m"[39m,
[1;32m      3[0m     header[38;5;241m=[39m[38;5;28;01mTrue[39;00m,
[1;32m      4[0m     inferSchema[38;5;241m=[39m[38;5;28;01mTrue[39;00m,
[1;32m      5[0m )[38;5;241m.[39mwrite[38;5;241m.[39mmode([38;5;124m"[39m[38;5;124mappend[39m[38;5;124m"[39m)[38;5;241m.[39msaveAsTable([38;5;124m"[39m[38;5;124mquickstart_schema.users[39m[38;5;124m"[39m)
[1;32m      6[0m spark[38;5;241m.[39mread[38;5;241m.[39mtable([38;5;124m"[39m[38;5;124mquickstart_schema.users[39m[38;5;124m"[39m)[38;5;241m.[39mlimit([38

# Schema Evolution -  Merge Schema

In [0]:
spark.read.csv(
    path="dbfs:/FileStore/synechron/user_dataset/users_006_new_column_education.csv",
    header=True,
    inferSchema=True,
).write.option("mergeSchema", "true").mode("append").saveAsTable(
    "quickstart_schema.users"
)

# Read Delta table
spark.read.table("quickstart_schema.users").limit(4).display()

id,name,dob,email,gender,country,region,city,asset,marital_status,education
1,Heather Gibbs,2024-10-31,heathergibbs6243@gmail.com,Female,United States,Virginia,Virginia Beach,734388,Married,
2,Herrod Petersen,2024-02-19,herrodpetersen@yahoomail.com,Male,United States,Arizona,Phoenix,113506,Single,
3,Ocean Workman,2024-10-10,oceanworkman2328@ymail.com,Male,United States,Tennessee,Clarksville,139985,Married,
4,Xaviera Maxwell,2025-03-09,xavieramaxwell@gmail.com,Transgender,United States,Ohio,Cleveland,511409,Married,


In [0]:
%sql
DESCRIBE FORMATTED quickstart_schema.users;

col_name,data_type,comment
id,int,
name,string,
dob,date,
email,string,
gender,string,
country,string,
region,string,
city,string,
asset,int,
marital_status,string,


In [0]:
from pyspark.sql.functions import col
spark.read.table("quickstart_schema.users").filter(col("id")<5).limit(4).display()

id,name,dob,email,gender,country,region,city,asset,marital_status,education
1,Heather Gibbs,2024-10-31,heathergibbs6243@gmail.com,Female,United States,Virginia,Virginia Beach,734388,Married,
2,Herrod Petersen,2024-02-19,herrodpetersen@yahoomail.com,Male,United States,Arizona,Phoenix,113506,Single,
3,Ocean Workman,2024-10-10,oceanworkman2328@ymail.com,Male,United States,Tennessee,Clarksville,139985,Married,
4,Xaviera Maxwell,2025-03-09,xavieramaxwell@gmail.com,Transgender,United States,Ohio,Cleveland,511409,Married,


# Source with less columns

In [0]:
spark.read.csv(
    path="dbfs:/FileStore/synechron/user_dataset/users_012_less_columns.csv",
    header=True,
    inferSchema=True,
).write.mode("append").saveAsTable("quickstart_schema.users")

spark.read.table("quickstart_schema.users").filter(col("id") > 5501).limit(4).display()

id,name,dob,email,gender,country,region,city,asset,marital_status,education
5502,Thaddeus Bradley,,,,,,,,,
5503,Brandon Randall,,,,,,,,,
5504,Duncan Valdez,,,,,,,,,
5505,Azalia Montgomery,,,,,,,,,


# Source with different column names

In [0]:

spark.read.csv(
    path="dbfs:/FileStore/synechron/user_dataset/users_013_less_columns_with_different_column_names.csv",
    header=True,
    inferSchema=True,
).write.mode("append").saveAsTable("quickstart_schema.users")

spark.read.table("quickstart_schema.users").filter(col("id") >= 6001).limit(4).display()

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-2692471806036570>:1[0m
[0;32m----> 1[0m spark[38;5;241m.[39mread[38;5;241m.[39mcsv(
[1;32m      2[0m     path[38;5;241m=[39m[38;5;124m"[39m[38;5;124mdbfs:/FileStore/synechron/user_dataset/users_013_less_columns_with_different_column_names.csv[39m[38;5;124m"[39m,
[1;32m      3[0m     header[38;5;241m=[39m[38;5;28;01mTrue[39;00m,
[1;32m      4[0m     inferSchema[38;5;241m=[39m[38;5;28;01mTrue[39;00m,
[1;32m      5[0m )[38;5;241m.[39mwrite[38;5;241m.[39mmode([38;5;124m"[39m[38;5;124mappend[39m[38;5;124m"[39m)[38;5;241m.[39msaveAsTable([38;5;124m"[39m[38;5;124mquickstart_schema.users[39m[38;5;124m"[39m)
[1;32m      7[0m spark[38;5;241m.[39mread[38;5;241m.[39mtable([38;5;124m"[39m[38;5;124mquickstart_schema.users[39m[38;5;124m"[39m)[38;5;

# Source with different data types

**Note**
Cannot Merge Incompatible  Schema

In [0]:

spark.read.csv(
    path="dbfs:/FileStore/synechron/user_dataset/users_012_datatype_mismatch_dob.csv",
    header=True,
    inferSchema=True,
).write.mode("append").saveAsTable("quickstart_schema.users")

spark.read.table("quickstart_schema.users").filter(col("id") == 5501).display()

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-13052135826420>:1[0m
[0;32m----> 1[0m spark[38;5;241m.[39mread[38;5;241m.[39mcsv(
[1;32m      2[0m     path[38;5;241m=[39m[38;5;124m"[39m[38;5;124mdbfs:/FileStore/synechron/user_dataset/users_012_datatype_mismatch_dob.csv[39m[38;5;124m"[39m,
[1;32m      3[0m     header[38;5;241m=[39m[38;5;28;01mTrue[39;00m,
[1;32m      4[0m     inferSchema[38;5;241m=[39m[38;5;28;01mTrue[39;00m,
[1;32m      5[0m )[38;5;241m.[39mwrite[38;5;241m.[39mmode([38;5;124m"[39m[38;5;124mappend[39m[38;5;124m"[39m)[38;5;241m.[39msaveAsTable([38;5;124m"[39m[38;5;124mquickstart_schema.users[39m[38;5;124m"[39m)
[1;32m      7[0m spark[38;5;241m.[39mread[38;5;241m.[39mtable([38;5;124m"[39m[38;5;124mquickstart_schema.users[39m[38;5;124m"[39m)[38;5;241m.[39mfilter(col(