In [0]:
# Delta Lake automatically records a new table version every time the data changes.


**Creating new data for the incremental update**

In [0]:
from pyspark.sql.functions import col, current_timestamp
# Build a small batch of "changed" rows for the MERGE demo: take five rows of
# the Delta table, raise their price by 10 and stamp them with the current time.
# NOTE(review): no ordering is applied before limit(5), so which five rows are
# picked is presumably not stable across runs - confirm if that matters here.
# NOTE(review): the new column name contains a space ("updated at"); verify
# that this is intended before writing this frame to a table.
update_df = (
    spark.table("workspace.default.combined_data_as_delta_table")
    .limit(5)
    .withColumn("price", col("price") + 10)
    .withColumn("updated at", current_timestamp())
)

In [0]:
# Preview the update batch. The arguments are show()'s defaults written out:
# at most 20 rows, with long cell values truncated.
update_df.show(n=20, truncate=True)

+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand| price|  user_id|        user_session|          updated at|
+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+--------------------+
|2019-11-17 08:43:14|      view|  19300100|2053013566033757167|appliances.ironin...|    nika| 27.99|536820725|de103474-7fda-4d4...|2026-01-13 09:32:...|
|2019-11-17 08:43:23|      view|   3701134|2053013565983425517|appliances.enviro...|   bosch|100.07|541580147|aea52352-a464-448...|2026-01-13 09:32:...|
|2019-11-17 08:43:28|      view|   3800499|2053013566176363511|     appliances.iron|scarlett|100.07|542213615|fa70d9e8-ea39-f22...|2026-01-13 09:32:...|
|2019-11-17 08:43:33|      view|  12500527|2053013556277805513|                NUL

**Performing MERGE (UPSERT)**\
**If record exists → UPDATE \
  If record doesn’t exist → INSERT**

In [0]:
from delta.tables import DeltaTable

# Upsert the update batch into the Delta table.
# The target is addressed by the same fully qualified name that update_df was
# read from, so source and target cannot resolve to different tables when the
# session's current catalog or schema is something other than workspace.default.
deltaTable = DeltaTable.forName(
    spark, "workspace.default.combined_data_as_delta_table"
)

# A target row ("t") and a source row ("s") are treated as the same event when
# product, user and event time all agree. Matched rows are updated from the
# source; source rows without a match are inserted.
# The chain is left as the cell's last expression so its result still renders.
(
    deltaTable.alias("t")
    .merge(
        update_df.alias("s"),
        "t.product_id = s.product_id AND "
        "t.user_id = s.user_id AND "
        "t.event_time = s.event_time",
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

    

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

**Version**

In [0]:
%sql
-- List the table's commit history: one row per version, newest first.
-- Fully qualified so the result does not depend on the current catalog/schema.
DESCRIBE HISTORY workspace.default.combined_data_as_delta_table;


version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
4,2026-01-13T09:51:18.000Z,75364571398345,mdaatifraza5657@gmail.com,MERGE,"Map(predicate -> [""(((product_id#13259 = product_id#13181) AND (user_id#13264 = user_id#13186)) AND (event_time#13257 = event_time#13179))""], clusterBy -> [], matchedPredicates -> [{""actionType"":""update""}], statsOnLoad -> false, notMatchedBySourcePredicates -> [], notMatchedPredicates -> [{""actionType"":""insert""}])",,List(96192164494631),0113-091026-l8glu7dz-v2n,3.0,WriteSerializable,False,"Map(numTargetRowsCopied -> 0, numTargetRowsDeleted -> 0, numTargetFilesAdded -> 1, numTargetBytesAdded -> 2980, numTargetBytesRemoved -> 0, numTargetDeletionVectorsAdded -> 1, numTargetRowsMatchedUpdated -> 5, executionTimeMs -> 14807, materializeSourceTimeMs -> 472, numTargetRowsInserted -> 0, numTargetRowsMatchedDeleted -> 0, numTargetDeletionVectorsUpdated -> 0, scanTimeMs -> 7514, numTargetRowsUpdated -> 5, numOutputRows -> 5, numTargetDeletionVectorsRemoved -> 0, numTargetRowsNotMatchedBySourceUpdated -> 0, numTargetChangeFilesAdded -> 0, numSourceRows -> 5, numTargetFilesRemoved -> 0, numTargetRowsNotMatchedBySourceDeleted -> 0, rewriteTimeMs -> 6613)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
3,2026-01-12T11:48:00.000Z,75364571398345,mdaatifraza5657@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(1043567443557165),0112-114150-88szcmvy-v2n,2.0,WriteSerializable,False,"Map(numFiles -> 128, numRemovedFiles -> 111, numRemovedBytes -> 3856979937, numDeletionVectorsRemoved -> 0, numOutputRows -> 109759528, numOutputBytes -> 5997731751)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
2,2026-01-12T11:46:19.000Z,75364571398345,mdaatifraza5657@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(1043567443557165),0112-114150-88szcmvy-v2n,1.0,WriteSerializable,False,"Map(numFiles -> 111, numRemovedFiles -> 128, numRemovedBytes -> 5997731751, numDeletionVectorsRemoved -> 0, numOutputRows -> 109950743, numOutputBytes -> 3856979937)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
1,2026-01-12T07:06:21.000Z,75364571398345,mdaatifraza5657@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(1043567443557165),0112-050410-1k9tars3-v2n,0.0,WriteSerializable,False,"Map(numFiles -> 128, numRemovedFiles -> 111, numRemovedBytes -> 3856979937, numDeletionVectorsRemoved -> 0, numOutputRows -> 109759528, numOutputBytes -> 5997731751)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
0,2026-01-12T06:28:50.000Z,75364571398345,mdaatifraza5657@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(1043567443557165),0112-050410-1k9tars3-v2n,,WriteSerializable,False,"Map(numFiles -> 111, numRemovedFiles -> 0, numRemovedBytes -> 0, numDeletionVectorsRemoved -> 0, numOutputRows -> 109950743, numOutputBytes -> 3856979937)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13


**Reading an older version (the original data, before any operations)**

In [0]:
%sql
-- Time travel: read five rows of the table as it was at version 0.
-- Fully qualified so the result does not depend on the current catalog/schema.
SELECT *
FROM workspace.default.combined_data_as_delta_table
VERSION AS OF 0
LIMIT 5;


event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-10-01T00:00:00.000Z,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
2019-10-01T00:00:00.000Z,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2019-10-01T00:00:01.000Z,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
2019-10-01T00:00:01.000Z,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
2019-10-01T00:00:04.000Z,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d


**OPTIMIZE combines many small files into fewer large files.**


**OPTIMIZE------------	Merge small files \
ZORDER--------------	Group related data \
Benefit--------------	Faster filtered queries**

In [0]:
%sql
-- Compact the table's small data files into fewer, larger files.
-- Fully qualified so the command does not depend on the current catalog/schema.
OPTIMIZE workspace.default.combined_data_as_delta_table;

path,metrics
,"List(0, 0, List(null, null, 0.0, 0, 0), List(null, null, 0.0, 0, 0), 0, null, null, 0, 0, 129, 129, true, 0, 0, 1768298479836, 1768298482137, 8, 0, null, List(0, 0), null, 9, 9, 0, 0, null)"


In [0]:
%sql
-- Compact again, this time grouping rows with similar product_id values into
-- the same files.
-- Fully qualified so the command does not depend on the current catalog/schema.
OPTIMIZE workspace.default.combined_data_as_delta_table
ZORDER BY (product_id);

path,metrics
,"List(89, 129, List(51956934, 69879312, 6.146334683146068E7, 89, 5470237868), List(2980, 47016469, 4.649406768217054E7, 129, 5997734731), 0, List(minCubeSize(107374182400), List(0, 0), List(129, 5997734731), 0, List(129, 5997734731), 1, null), null, 0, 1, 129, 0, false, 0, 0, 1768299277134, 1768299336072, 8, 1, null, List(1, 5), null, 9, 9, 201738, 0, null)"


**By default\
Retention = 7 days\
Files newer than 7 days ---> KEPT\
Files older than 7 days that the table no longer references ---> DELETED**

In [0]:
# Clean up old data files of the table. No RETAIN clause is given, so the
# table's default retention threshold applies.
# Fully qualified so the command does not depend on the current catalog/schema.
spark.sql("VACUUM workspace.default.combined_data_as_delta_table")

DataFrame[path: string]