In [1]:
from pyspark.sql import SparkSession
from delta import *

import os

# Location of the demo table; this cell writes it, lists its log, and reads it back.
DELTA_TABLE_PATH = "/tmp/delta/employees"

# Initialize a local Spark session with the Delta SQL extension and catalog enabled
spark = (
    SparkSession.builder
    .appName("DeltaACID")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .master("local[*]")
    .getOrCreate()
)

# Create a simple Delta table (overwrite so the write itself can be re-run)
data = [(1, "Alice", 1000), (2, "Bob", 1500)]
columns = ["id", "name", "salary"]

df = spark.createDataFrame(data, columns)
df.write.format("delta").mode("overwrite").save(DELTA_TABLE_PATH)

# Inspect the delta log. os.listdir returns entries in arbitrary order,
# so sort them to keep the printed listing stable across runs.
print("Delta log files:")
print(sorted(os.listdir(os.path.join(DELTA_TABLE_PATH, "_delta_log"))))

# Read the table back
df_read = spark.read.format("delta").load(DELTA_TABLE_PATH)
df_read.show()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/19 04:04:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/12/19 04:04:35 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Delta log files:
['.00000000000000000000.json.crc', '00000000000000000000.json']
+---+-----+------+
| id| name|salary|
+---+-----+------+
|  1|Alice|  1000|
|  2|  Bob|  1500|
+---+-----+------+



In [2]:
from delta.tables import DeltaTable

# Handle on the Delta table stored at the employees path
delta_table = DeltaTable.forPath(spark, "/tmp/delta/employees")

# Incoming change set: id 2 carries a new salary, id 3 is not in the table yet
updates = [(2, "Bob", 1800), (3, "Charlie", 1200)]
updates_df = spark.createDataFrame(updates, columns)

# Upsert keyed on id: matched rows get only their salary updated,
# unmatched source rows are inserted with all three columns.
(
    delta_table.alias("t")
    .merge(updates_df.alias("u"), "t.id = u.id")
    .whenMatchedUpdate(set={"salary": "u.salary"})
    .whenNotMatchedInsert(
        values={"id": "u.id", "name": "u.name", "salary": "u.salary"}
    )
    .execute()
)

delta_table.toDF().show()

                                                                                

+---+-------+------+
| id|   name|salary|
+---+-------+------+
|  2|    Bob|  1800|
|  3|Charlie|  1200|
|  1|  Alice|  1000|
+---+-------+------+



In [3]:
# Time travel: read the table as it was at version 0
v0_reader = spark.read.format("delta").option("versionAsOf", 0)
df_v0 = v0_reader.load("/tmp/delta/employees")
df_v0.show()

# Roll the live table back to version 0, then display its current contents
delta_table.restoreToVersion(0)
delta_table.toDF().show()

+---+-----+------+
| id| name|salary|
+---+-----+------+
|  1|Alice|  1000|
|  2|  Bob|  1500|
+---+-----+------+



25/12/19 04:22:12 WARN DAGScheduler: Broadcasting large task binary with size 1071.3 KiB
                                                                                

+---+-----+------+
| id| name|salary|
+---+-----+------+
|  1|Alice|  1000|
|  2|  Bob|  1500|
+---+-----+------+

