# Delta Lake 2.3.0 Release

In [1]:
import datetime
import pathlib

import pyspark
from delta import *
from pyspark.sql import functions as F

# Start from the shared session builder and register Delta Lake's SQL extension
# and catalog implementation on it, one setting at a time.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

# Let the delta helper finish configuring the builder, then create (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-332-delta-230/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e88cc9e9-26ae-411b-a03c-cbeb51015784;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.3.0 in central
	found io.delta#delta-storage;2.3.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
:: resolution report :: resolve 97ms :: artifacts dl 4ms
	:: modules in use:
	io.delta#delta-core_2.12;2.3.0 from central in [default]
	io.delta#delta-storage;2.3.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   

23/04/12 05:30:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# Absolute path of the working directory; interpolated into the SHALLOW CLONE
# statement, which addresses tables by full path.
cwd = pathlib.Path.cwd().resolve()

## Convert Iceberg to Delta Lake

**TODO:** Figure out how to create an Iceberg table so the conversion to Delta Lake can be demonstrated here.

## Delta Lake Shallow Clone

In [13]:
# Seed rows for the source table: (id, name, age).
data = [
    (0, "Bob", 23),
    (1, "Sue", 25),
    (2, "Jim", 27),
]

In [14]:
# Build the DataFrame with its column names supplied up front; types are inferred from the rows.
df = spark.createDataFrame(data, ["id", "name", "age"])

In [15]:
# Persist the rows as the Delta table that will serve as the clone source.
df.write.save("tmp/my_people", format="delta")

In [16]:
# Create tmp/my_cloned_people as a shallow clone of tmp/my_people.
# Both tables are addressed by absolute path, hence the cwd prefix.
spark.sql(
    f"CREATE TABLE delta.`{cwd}/tmp/my_cloned_people` "
    f"SHALLOW CLONE delta.`{cwd}/tmp/my_people`"
)

DataFrame[source_table_size: bigint, source_num_of_files: bigint, num_removed_files: bigint, num_copied_files: bigint, removed_files_size: bigint, copied_files_size: bigint]

In [17]:
# Read the clone back to confirm it exposes the source table's rows.
spark.read.load("tmp/my_cloned_people", format="delta").show()

+---+----+---+
| id|name|age|
+---+----+---+
|  0| Bob| 23|
|  2| Jim| 27|
|  1| Sue| 25|
+---+----+---+



In [18]:
!tree tmp/my_cloned_people

[01;34mtmp/my_cloned_people[0m
└── [01;34m_delta_log[0m
    ├── [00m00000000000000000000.checkpoint.parquet[0m
    ├── [00m00000000000000000000.json[0m
    └── [00m_last_checkpoint[0m

1 directory, 3 files


In [19]:
!tree tmp/my_people

[01;34mtmp/my_people[0m
├── [01;34m_delta_log[0m
│   └── [00m00000000000000000000.json[0m
├── [00mpart-00000-b33f79ee-473d-4b98-ad79-a8b087ae5c50-c000.snappy.parquet[0m
├── [00mpart-00003-3de8e2cd-bb73-4ed6-af3c-2d61911670d0-c000.snappy.parquet[0m
├── [00mpart-00006-206bd2a6-bc3f-47d4-9b6c-e9fb8d317732-c000.snappy.parquet[0m
└── [00mpart-00009-1c687cd5-4515-4966-9aeb-27e03bc00818-c000.snappy.parquet[0m

1 directory, 5 files


In [20]:
# A single extra row to append to the source table.
data = [(0, "Clare", 5)]

In [21]:
# Same three-column layout as the existing table so the append lines up.
df = spark.createDataFrame(data, ["id", "name", "age"])

In [22]:
# Append the new row to the source table (not to the clone).
df.write.save("tmp/my_people", format="delta", mode="append")

In [23]:
# The source table should now include the appended row.
spark.read.load("tmp/my_people", format="delta").show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  0|Clare|  5|
|  2|  Jim| 27|
|  0|  Bob| 23|
|  1|  Sue| 25|
+---+-----+---+



In [24]:
# Check the clone after appending to the source table.
spark.read.load("tmp/my_cloned_people", format="delta").show()

+---+----+---+
| id|name|age|
+---+----+---+
|  0| Bob| 23|
|  2| Jim| 27|
|  1| Sue| 25|
+---+----+---+



In [25]:
# A single row that will be appended to the clone only.
data = [(0, "Linda", 32)]

In [26]:
# Same three-column layout as the cloned table so the append lines up.
df = spark.createDataFrame(data, ["id", "name", "age"])

In [27]:
# Append the new row to the clone (the source table is left untouched).
df.write.save("tmp/my_cloned_people", format="delta", mode="append")

In [28]:
# The clone should now show the row appended to it alongside the rows it was cloned with.
spark.read.load("tmp/my_cloned_people", format="delta").show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  0|Linda| 32|
|  0|  Bob| 23|
|  2|  Jim| 27|
|  1|  Sue| 25|
+---+-----+---+



In [29]:
!tree tmp/my_cloned_people

[01;34mtmp/my_cloned_people[0m
├── [01;34m_delta_log[0m
│   ├── [00m00000000000000000000.checkpoint.parquet[0m
│   ├── [00m00000000000000000000.json[0m
│   ├── [00m00000000000000000001.json[0m
│   └── [00m_last_checkpoint[0m
├── [00mpart-00000-3c099586-316a-4e8d-b7d8-57827215ceaa-c000.snappy.parquet[0m
└── [00mpart-00009-2ff4b5d0-995f-4e86-b9ab-68fc54a2847a-c000.snappy.parquet[0m

1 directory, 6 files


## When not matched by source

* Imagine this analysis is run on April 12, 2023.
* "Active" customers are those whose `last_seen` date falls within the previous 30 days.

In [43]:
# Existing customers: (id, name, age, last_seen, status).
data = [
    # inactive
    (0, "Bob", 23, datetime.date(2022, 1, 2), "inactive"),
    # active
    (1, "Sue", 25, datetime.date(2023, 4, 5), "active"),
    # marked as active, but should not be active anymore
    (2, "Jim", 27, datetime.date(2023, 2, 7), "active"),
]

In [44]:
# Customer DataFrame with its five column names supplied up front.
df = spark.createDataFrame(data, ["id", "name", "age", "last_seen", "status"])

In [45]:
# Preview the customer rows before they are written out.
df.show()

+---+----+---+----------+--------+
| id|name|age| last_seen|  status|
+---+----+---+----------+--------+
|  0| Bob| 23|2022-01-02|inactive|
|  1| Sue| 25|2023-04-05|  active|
|  2| Jim| 27|2023-02-07|  active|
+---+----+---+----------+--------+



In [46]:
# Collapse to a single partition, then write the customers as a Delta table.
df.repartition(1).write.save("tmp/customers", format="delta")

In [47]:
# The walkthrough assumes it is run on April 12, 2023. Pin that date instead of
# calling datetime.date.today() so the incoming batch is the same on every re-run.
analysis_date = datetime.date(2023, 4, 12)

# Incoming batch: (id, name, age, current_date).
new_data = [
    (0, "Bob", 23, analysis_date),  # existing customer
    (3, "Sally", 30, analysis_date),  # new customer
]

In [48]:
# Source side of the merge; note it carries current_date rather than last_seen/status.
new_df = spark.createDataFrame(new_data, ["id", "name", "age", "current_date"])

In [49]:
# Preview the incoming batch that will be merged into the customers table.
new_df.show()

+---+-----+---+------------+
| id| name|age|current_date|
+---+-----+---+------------+
|  0|  Bob| 23|  2023-04-12|
|  3|Sally| 30|  2023-04-12|
+---+-----+---+------------+



In [50]:
from delta.tables import DeltaTable

In [51]:
# Open the customers Delta table by path; this handle is the merge target below.
customers_table = DeltaTable.forPath(spark, "tmp/customers")

In [52]:
# Show the table contents before the merge, for comparison with the result afterwards.
customers_table.toDF().show()

+---+----+---+----------+--------+
| id|name|age| last_seen|  status|
+---+----+---+----------+--------+
|  0| Bob| 23|2022-01-02|inactive|
|  1| Sue| 25|2023-04-05|  active|
|  2| Jim| 27|2023-02-07|  active|
+---+----+---+----------+--------+



In [53]:
# Merge the incoming batch (source) into the customers table (target), matching on id.
# The inactivity cutoff is computed from the fixed analysis date this walkthrough
# assumes (April 12, 2023) rather than from current_date(), so the merge result
# does not change depending on the day the notebook is re-run.
(
    customers_table.alias("target")
    .merge(new_df.alias("source"), "target.id = source.id")
    # Ids present on both sides: refresh last_seen from the batch and mark active.
    .whenMatchedUpdate(
        set={"target.last_seen": "source.current_date", "target.status": "'active'"}
    )
    # Ids only in the batch: insert them as new, active customers.
    .whenNotMatchedInsert(
        values={
            "target.id": "source.id",
            "target.name": "source.name",
            "target.age": "source.age",
            "target.last_seen": "source.current_date",
            "target.status": "'active'",
        }
    )
    # Ids only in the target: mark inactive when last_seen is 30 or more days
    # before the analysis date; other unmatched rows are left as they are.
    .whenNotMatchedBySourceUpdate(
        condition="target.last_seen <= (DATE '2023-04-12' - INTERVAL '30' DAY)",
        set={"target.status": "'inactive'"},
    )
    .execute()
)

In [54]:
# Re-open the table by path and show its contents after the merge.
DeltaTable.forPath(spark, "tmp/customers").toDF().show()

+---+-----+---+----------+--------+
| id| name|age| last_seen|  status|
+---+-----+---+----------+--------+
|  0|  Bob| 23|2023-04-12|  active|
|  1|  Sue| 25|2023-04-05|  active|
|  2|  Jim| 27|2023-02-07|inactive|
|  3|Sally| 30|2023-04-12|  active|
+---+-----+---+----------+--------+



## CREATE TABLE LIKE

In [55]:
# Sample rows for the CREATE TABLE LIKE demo; one row deliberately has no age.
df = spark.createDataFrame(
    [
        (0, "Bob", "Loblaw", 23),
        (1, "Sue", "Grafton", None),
        (2, "Jim", "Carrey", 61),
    ],
    ["id", "first_name", "last_name", "age"],
)

In [56]:
# Register the rows as a Delta table named famous_people in the default database.
df.write.saveAsTable("default.famous_people", format="delta")

In [58]:
# Query the saved table, ordered by id so the displayed row order is stable.
spark.sql("select * from famous_people order by id asc").show()

+---+----------+---------+----+
| id|first_name|last_name| age|
+---+----------+---------+----+
|  0|       Bob|   Loblaw|  23|
|  1|       Sue|  Grafton|null|
|  2|       Jim|   Carrey|  61|
+---+----------+---------+----+



In [59]:
# Create famous_people_duplicate using famous_people as the template table.
spark.sql("CREATE TABLE famous_people_duplicate like famous_people")

DataFrame[]

In [60]:
# Query the new table; the recorded output shows the same four columns and no rows.
spark.sql("select * from famous_people_duplicate").show()

+---+----------+---------+---+
| id|first_name|last_name|age|
+---+----------+---------+---+
+---+----------+---------+---+



## Reading change data feed in SQL queries

## Cleanup