In [0]:
# https://github.com/delta-io/delta-examples/blob/master/notebooks/pyspark/delta-merge.ipynb

# Start from an empty location so this cell can be re-run: the write below uses
# the default save mode, which fails when the path already holds a table, and the
# later cells inspect commit files 0 and 1 of a freshly created transaction log.
dbutils.fs.rm("/FileStore/tables/delta-example/merge-example", True)

# Seed data: three people keyed by id.
df = spark.createDataFrame([(0, "Bob", 23), (1, "Sue", 25), (2, "Jim", 27)], ["id", "name", "age"])
df.show()

# Collapse to one partition before writing the initial version of the Delta table.
df.repartition(1).write.format("delta").save("/FileStore/tables/delta-example/merge-example")

+---+----+---+
| id|name|age|
+---+----+---+
|  0| Bob| 23|
|  1| Sue| 25|
|  2| Jim| 27|
+---+----+---+



In [0]:
# Incoming batch for the insert-only merge.
new_data = [
    (0, "Bob", 23),    # id 0 is already present in the original dataset above
    (3, "Sally", 30),  # id 3 is new
    (4, "Henry", 33),  # id 4 is new
]

# Name the columns at creation time and keep the batch in a single partition.
new_df = (
    spark.createDataFrame(new_data, ["id", "name", "age"])
    .repartition(1)
)

# Delta Lake merge with whenNotMatchedInsert


In [0]:
from delta.tables import DeltaTable

# Open the Delta table written earlier; it is the merge target from here on.
people_table = DeltaTable.forPath(spark, "/FileStore/tables/delta-example/merge-example")

# Insert-only merge keyed on id: source rows without a matching target id are
# inserted with the listed column mapping. No matched clause is given.
(
    people_table.alias("trgt")
    .merge(new_df.alias("src"), "trgt.id = src.id")
    .whenNotMatchedInsert(
        values={"id": "src.id", "name": "src.name", "age": "src.age"}
    )
    .execute()
)

In [0]:
# Display the table contents after the insert-only merge.
people_table.toDF().show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  0|  Bob| 23|
|  1|  Sue| 25|
|  2|  Jim| 27|
|  3|Sally| 30|
|  4|Henry| 33|
+---+-----+---+



In [0]:
# List the table's _delta_log directory; the numbered .json commit files read in
# the following cells live here.
dbutils.fs.ls("/FileStore/tables/delta-example/merge-example/_delta_log/")

Out[10]: [FileInfo(path='dbfs:/FileStore/tables/delta-example/merge-example/_delta_log/.s3-optimization-0', name='.s3-optimization-0', size=0, modificationTime=1744388775000),
 FileInfo(path='dbfs:/FileStore/tables/delta-example/merge-example/_delta_log/.s3-optimization-1', name='.s3-optimization-1', size=0, modificationTime=1744388775000),
 FileInfo(path='dbfs:/FileStore/tables/delta-example/merge-example/_delta_log/.s3-optimization-2', name='.s3-optimization-2', size=0, modificationTime=1744388775000),
 FileInfo(path='dbfs:/FileStore/tables/delta-example/merge-example/_delta_log/00000000000000000000.crc', name='00000000000000000000.crc', size=2530, modificationTime=1744388778000),
 FileInfo(path='dbfs:/FileStore/tables/delta-example/merge-example/_delta_log/00000000000000000000.json', name='00000000000000000000.json', size=1523, modificationTime=1744388775000),
 FileInfo(path='dbfs:/FileStore/tables/delta-example/merge-example/_delta_log/00000000000000000001.crc', name='0000000000000

In [0]:
# Commit 0 of the log: the initial write of the table.
display(
    spark.read.json(
        "/FileStore/tables/delta-example/merge-example/_delta_log/00000000000000000000.json"
    )
)

add,commitInfo,metaData,protocol
,"List(0411-144935-4ypuffbf, Databricks-Runtime/12.2.x-scala2.12, true, WriteSerializable, List(971687235761228), WRITE, List(1, 1102, 3), List(ErrorIfExists, []), 1744388774772, 1918a507-1ab7-4dfd-a1bd-a78ad2c57adc, 7249506876114102, nawatheynupoor1990@gmail.com)",,
,,,"List(1, 2)"
,,"List(1744388773284, List(parquet), a91b194b-c799-437f-a697-cc2f7a5e4cb5, List(), {""type"":""struct"",""fields"":[{""name"":""id"",""type"":""long"",""nullable"":true,""metadata"":{}},{""name"":""name"",""type"":""string"",""nullable"":true,""metadata"":{}},{""name"":""age"",""type"":""long"",""nullable"":true,""metadata"":{}}]})",
"List(true, 1744388775000, part-00000-037d7ea4-98ef-4044-bd2b-3d23f1e8421a-c000.snappy.parquet, 1102, {""numRecords"":3,""minValues"":{""id"":0,""name"":""Bob"",""age"":23},""maxValues"":{""id"":2,""name"":""Sue"",""age"":27},""nullCount"":{""id"":0,""name"":0,""age"":0}}, List(1744388775000000, 1744388775000000, 1744388775000000, 268435456))",,,


In [0]:
# Commit 1 of the log: the insert-only merge.
display(
    spark.read.json(
        "/FileStore/tables/delta-example/merge-example/_delta_log/00000000000000000001.json"
    )
)

add,commitInfo
,"List(0411-144935-4ypuffbf, Databricks-Runtime/12.2.x-scala2.12, false, WriteSerializable, List(971687235761228), MERGE, List(2597, 3, 2, 3, 1097, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2495, 0), List([], [], [{""actionType"":""insert""}], [""(id#5290L = id#5278L)""]), 0, 1744389030463, d58e6fe0-3b60-488b-9dd9-8d7b0454424c, 7249506876114102, nawatheynupoor1990@gmail.com)"
"List(true, 1744389031000, part-00000-afad9dff-468d-4454-af97-bea904bc13b1-c000.snappy.parquet, 1097, {""numRecords"":2,""minValues"":{""id"":3,""name"":""Henry"",""age"":30},""maxValues"":{""id"":4,""name"":""Sally"",""age"":33},""nullCount"":{""id"":0,""name"":0,""age"":0}}, List(1744389031000000, 1744389031000000, 1744389031000000, 268435456))",


# Delta Lake merge with whenMatchedUpdate


In [0]:
# Incoming batch for the upsert.
new_data = [
    (4, "Henry", 34),  # id 4 is already in the table with age 33
    (5, "Allie", 22),  # id 5 is new
]

# Name the columns at creation time and keep the batch in a single partition.
new_df = (
    spark.createDataFrame(new_data, ["id", "name", "age"])
    .repartition(1)
)

In [0]:
# Upsert keyed on id: a matching target row only has its age overwritten from
# the source, while an unmatched source row is inserted with all of its columns.
(
    people_table.alias("trgt")
    .merge(new_df.alias("src"), "trgt.id = src.id")
    .whenMatchedUpdate(set={"age": "src.age"})
    .whenNotMatchedInsertAll()
    .execute()
)

In [0]:
# Display the table contents after the upsert; no ordering is applied, so rows
# are not necessarily sorted by id.
people_table.toDF().show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  0|  Bob| 23|
|  1|  Sue| 25|
|  2|  Jim| 27|
|  4|Henry| 34|
|  5|Allie| 22|
|  3|Sally| 30|
+---+-----+---+



# JDBC/EMR SCD1 Implementation

In [0]:
# The original in-memory DataFrame (not the Delta table), shown for reference.
df.show()

# Rebuild the same two-row batch that was used for the previous upsert.
new_data = [
    (4, "Henry", 34),
    (5, "Allie", 22),
]
new_df = (
    spark.createDataFrame(new_data, ["id", "name", "age"])
    .repartition(1)
)

new_df.show()

+---+----+---+
| id|name|age|
+---+----+---+
|  0| Bob| 23|
|  1| Sue| 25|
|  2| Jim| 27|
+---+----+---+

+---+-----+---+
| id| name|age|
+---+-----+---+
|  4|Henry| 34|
|  5|Allie| 22|
+---+-----+---+



In [0]:
# SCD1-style upsert keyed on id: matched rows have every column overwritten from
# the source, and unmatched source rows are inserted in full.
(
    people_table.alias("trgt")
    .merge(new_df.alias("src"), "trgt.id = src.id")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)


In [0]:
# Display the table contents after the update-all / insert-all merge.
people_table.toDF().show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  0|  Bob| 23|
|  1|  Sue| 25|
|  2|  Jim| 27|
|  4|Henry| 34|
|  5|Allie| 22|
|  3|Sally| 30|
+---+-----+---+

