What is Delta Lake?

Key Concepts to Teach:

- Built on top of Parquet
- Supports ACID transactions, schema enforcement, and versioning
- Works best on Databricks and supports MERGE, UPDATE, DELETE

In [0]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise create one with this app name.
spark = (
    SparkSession.builder
    .appName('Spark DataFrames')
    .getOrCreate()
)

In [0]:

# Two sample orders; every field except the amount is given as a string.
data = [
    ("101", "Alice", "2023-11-01", 1000),
    ("102", "Bob", "2023-11-01", 2000),
]
cols = ["order_id", "customer", "order_date", "amount"]

# Column names come from `cols`; Spark infers each column's type from the values.
df = spark.createDataFrame(data, schema=cols)
display(df)

In [0]:
%sql
-- Create the managed volume that the rest of the notebook writes into.
-- IF NOT EXISTS keeps this cell re-runnable: without it, a second run fails
-- because the volume already exists.
CREATE VOLUME IF NOT EXISTS workspace.default.tmp
COMMENT 'This is my example managed volume';

In [0]:
# Make sure the volume's root directory is present before anything is written under it.
dbutils.fs.mkdirs('/Volumes/workspace/default/tmp')

In [0]:
# Save the sample orders in Delta format, replacing whatever is already at this path.
(
    df.write
    .format('delta')
    .mode('overwrite')
    .save('/Volumes/workspace/default/tmp/bronze_sales')
)

In [0]:
# List the contents of the table's _delta_log directory (the Delta transaction log).
delta_log_files = dbutils.fs.ls("/Volumes/workspace/default/tmp/bronze_sales/_delta_log")
display(delta_log_files)


In [0]:
# Show the beginning of the first commit file (JSON) in the transaction log.
dbutils.fs.head(
    '/Volumes/workspace/default/tmp/bronze_sales/_delta_log/00000000000000000000.json'
)

### Layers in the Medallion Architecture:
- Bronze ➝ Raw data
- Silver ➝ Cleaned + enriched data
- Gold ➝ Aggregated insights

#### Build Medallion Architecture (Practical)

🔶 Bronze – Ingest Raw CSV

In [0]:
# Bronze layer: land the raw customer CSV files as a Delta table, unmodified.
# header=True takes the column names from the first row of each file.
df_raw = (
    spark.read
    .option("header", True)
    .csv("/databricks-datasets/retail-org/customers/")
)

# Overwrite mode replaces any bronze data left by a previous run.
(
    df_raw.write
    .format("delta")
    .mode("overwrite")
    .save("/Volumes/workspace/default/tmp/lakehouse/bronze/customers")
)

display(df_raw)

🔷 Silver – Clean and Standardize

In [0]:
# Silver layer: clean and standardize the bronze customers table.
from pyspark.sql.functions import col, lower, trim

df_bronze = (
    spark.read
    .format("delta")
    .load("/Volumes/workspace/default/tmp/lakehouse/bronze/customers")
)

# Print the schema to confirm the column names used below.
df_bronze.printSchema()

# Normalize customer_name (strip surrounding whitespace, then lowercase) and
# drop rows where customer_id or customer_name is null.
normalized_name = lower(trim(col("customer_name")))
df_silver = (
    df_bronze
    .withColumn("customer_name", normalized_name)
    .dropna(subset=["customer_id", "customer_name"])
)

# Overwrite mode replaces any silver data left by a previous run.
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .save("/Volumes/workspace/default/tmp/lakehouse/silver/customers")
)

display(df_silver)

🟡 Gold – Aggregated Insights

In [0]:
# Gold layer: count customers per state and save the result as the customer_summary table.
df_silver = (
    spark.read
    .format("delta")
    .load("/Volumes/workspace/default/tmp/lakehouse/silver/customers")
)

# One row per distinct state, with the row count in a column named "count".
df_gold = df_silver.groupBy("state").count()

(
    df_gold.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("customer_summary")
)

In [0]:
%sql
-- Show every row of the customer_summary table.
SELECT *
FROM customer_summary

In [0]:
%sql
-- List the commit history recorded for the customer_summary table.
DESCRIBE HISTORY customer_summary