In [1]:
import pyspark
from delta import *

In [4]:
# Build the session: register the Delta SQL extension and point the default
# catalog (spark_catalog) at the Delta catalog implementation.
build = (
    pyspark.sql.SparkSession.builder
    .appName("DeltaLake")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# configure_spark_with_delta_pip comes from the `delta` star import; it takes
# the builder and returns one we can call getOrCreate() on.
spark = configure_spark_with_delta_pip(build).getOrCreate()
print(f"Hello, Spark version {spark.version}")

Hello, Spark version 3.5.0


In [10]:
# Smoke test for the Delta Lake extension: materialise the ids 0..99 and
# write them out in Delta format, overwriting anything already at the path.

df = spark.range(100)
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save("data/04_output.delta")
)

23/11/27 22:12:53 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
23/11/27 22:12:53 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers


In [9]:
# Report how many partitions the range DataFrame's underlying RDD has.
num_partitions = df.rdd.getNumPartitions()
print(f"The number of partitions is: {num_partitions}")

The number of partitions is: 16


In [17]:
# Issue ten separate single-row appends so that the table accumulates enough
# transactions for a checkpoint file to be generated.

for i in range(10):
    # One-row DataFrame whose `id` is the current loop index
    df_row = spark.createDataFrame([pyspark.sql.Row(id=i)])

    # Append that row to the Delta Lake table written earlier
    delta_appender = df_row.write.format("delta").mode("append")
    delta_appender.save("data/04_output.delta")


23/11/27 22:41:34 ERROR NonFateSharingFuture: Failed to get result from future  
scala.runtime.NonLocalReturnControl


In [18]:
# Read the checkpoint file numbered 10 from the table's _delta_log directory
# directly as Parquet (not through the Delta reader) so its rows can be inspected.
df_checkpoint = spark.read.parquet(
    "data/04_output.delta/_delta_log/00000000000000000010.checkpoint.parquet"
)

In [19]:
# Preview the checkpoint rows; the arguments are show()'s defaults spelled out
# (first 20 rows, long cell values truncated).
df_checkpoint.show(n=20, truncate=True)

+----+--------------------+--------------------+--------+--------+--------------+
| txn|                 add|              remove|metaData|protocol|domainMetadata|
+----+--------------------+--------------------+--------+--------+--------------+
|NULL|{part-00003-c510e...|                NULL|    NULL|    NULL|          NULL|
|NULL|{part-00015-a9edd...|                NULL|    NULL|    NULL|          NULL|
|NULL|{part-00015-2a4c0...|                NULL|    NULL|    NULL|          NULL|
|NULL|{part-00000-c8a98...|                NULL|    NULL|    NULL|          NULL|
|NULL|{part-00006-d38fb...|                NULL|    NULL|    NULL|          NULL|
|NULL|{part-00015-3fce3...|                NULL|    NULL|    NULL|          NULL|
|NULL|{part-00007-26778...|                NULL|    NULL|    NULL|          NULL|
|NULL|                NULL|{part-00015-ad35d...|    NULL|    NULL|          NULL|
|NULL|{part-00010-b1802...|                NULL|    NULL|    NULL|          NULL|
|NULL|{part-0001