In [0]:
"""
https://delta.io/blog/2023-01-25-delta-lake-small-file-compaction-optimize/
Small files are problematic because they cause data queries to run slowly. They require a lot of I/O overhead, which is computationally expensive. They also create large metadata transaction logs which cause planning time slowness. Delta Lake provides an OPTIMIZE command that lets users compact the small files into larger files, so their queries are not burdened by the small file overhead.

Syntax:
delta_table = delta.DeltaTable.forPath(spark, "path/to/table")
delta_table.optimize().executeCompaction()
"""

# Example: compacting small files in a Delta Lake table with OPTIMIZE


In [0]:
# Location of the example Delta table this cell creates.
optimize_example_path = "/FileStore/tables/delta-example/optimize-example"

# Make the cell re-runnable: the write below uses the default save mode
# (ErrorIfExists), which raises if the path already holds data. Clearing the
# directory also resets the Delta log, so commit 0 is always this WRITE.
# NOTE(review): this recursively deletes the example directory - confirm
# nothing else is stored under this path.
dbutils.fs.rm(optimize_example_path, True)

# Five rows (ids 0-4) repartitioned into 5 partitions, so the write is spread
# across several very small parquet files.
df = spark.range(0, 5)
df.repartition(5).write.format("delta").save(optimize_example_path)

In [0]:
# List the first commit file (version 0) in the table's _delta_log directory;
# the returned listing is the cell's output.
first_commit_path = "/FileStore/tables/delta-example/optimize-example/_delta_log/00000000000000000000.json"
dbutils.fs.ls(first_commit_path)

Out[5]: [FileInfo(path='dbfs:/FileStore/tables/delta-example/optimize-example/_delta_log/00000000000000000000.json', name='00000000000000000000.json', size=2168, modificationTime=1744384708000)]

In [0]:
# Load the version-0 commit file as JSON and render its actions
# (add / commitInfo / metaData / protocol) as a table.
first_commit_path = "/FileStore/tables/delta-example/optimize-example/_delta_log/00000000000000000000.json"
first_commit_actions = spark.read.format("json").load(first_commit_path)
display(first_commit_actions)

add,commitInfo,metaData,protocol
,"List(0411-144935-4ypuffbf, Databricks-Runtime/12.2.x-scala2.12, true, WriteSerializable, List(971687235761214), WRITE, List(3, 1816, 5), List(ErrorIfExists, []), 1744384707837, 56b4d522-5383-4845-bec9-1b4917594ad5, 7249506876114102, nawatheynupoor1990@gmail.com)",,
,,,"List(1, 2)"
,,"List(1744384706251, List(parquet), 822182fc-39e7-48c6-bc15-1721899ea0a2, List(), {""type"":""struct"",""fields"":[{""name"":""id"",""type"":""long"",""nullable"":true,""metadata"":{}}]})",
"List(true, 1744384708000, part-00001-813d2289-2e55-4c02-af21-71a39f6cb9f6-c000.snappy.parquet, 608, {""numRecords"":2,""minValues"":{""id"":3},""maxValues"":{""id"":4},""nullCount"":{""id"":0}}, List(1744384708000000, 1744384708000000, 1744384708000000, 268435456))",,,
"List(true, 1744384708000, part-00003-da729f55-638f-449a-841f-c6a2201e0b93-c000.snappy.parquet, 600, {""numRecords"":1,""minValues"":{""id"":1},""maxValues"":{""id"":1},""nullCount"":{""id"":0}}, List(1744384708000001, 1744384708000001, 1744384708000001, 268435456))",,,
"List(true, 1744384708000, part-00004-8003476e-e9f9-473d-90b1-1cbbc69ff1cb-c000.snappy.parquet, 608, {""numRecords"":2,""minValues"":{""id"":0},""maxValues"":{""id"":2},""nullCount"":{""id"":0}}, List(1744384708000002, 1744384708000002, 1744384708000002, 268435456))",,,


In [0]:
from delta.tables import DeltaTable

# Bind a DeltaTable handle to the example table by its storage path.
optimize_example_path = "/FileStore/tables/delta-example/optimize-example"
delta_table = DeltaTable.forPath(spark, optimize_example_path)

# Build the OPTIMIZE operation, then run file compaction. The call's return
# value (a DataFrame with `path` and `metrics` columns) is left as the cell's
# last expression so it is echoed as output.
optimize_builder = delta_table.optimize()
optimize_builder.executeCompaction()

Out[9]: DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,totalClusterParallelism:bigint,totalScheduledTasks:bigint,autoCompactParallelismStats:struct<maxClusterActiveParallelism:bigint,minClusterActiveParallelism:bigint,maxSessionActiveParallelism:bigint,minSessionActiveParallelism:bi

In [0]:
# Re-read commit 0 after the OPTIMIZE run so it can be compared with commit 1.
initial_commit_path = "/FileStore/tables/delta-example/optimize-example/_delta_log/00000000000000000000.json"
initial_commit_actions = spark.read.format("json").load(initial_commit_path)
display(initial_commit_actions)

"""
This is when the data was written for the first time
Notice dataChange: true here
"""

add,commitInfo,metaData,protocol
,"List(0411-144935-4ypuffbf, Databricks-Runtime/12.2.x-scala2.12, true, WriteSerializable, List(971687235761214), WRITE, List(3, 1816, 5), List(ErrorIfExists, []), 1744384707837, 56b4d522-5383-4845-bec9-1b4917594ad5, 7249506876114102, nawatheynupoor1990@gmail.com)",,
,,,"List(1, 2)"
,,"List(1744384706251, List(parquet), 822182fc-39e7-48c6-bc15-1721899ea0a2, List(), {""type"":""struct"",""fields"":[{""name"":""id"",""type"":""long"",""nullable"":true,""metadata"":{}}]})",
"List(true, 1744384708000, part-00001-813d2289-2e55-4c02-af21-71a39f6cb9f6-c000.snappy.parquet, 608, {""numRecords"":2,""minValues"":{""id"":3},""maxValues"":{""id"":4},""nullCount"":{""id"":0}}, List(1744384708000000, 1744384708000000, 1744384708000000, 268435456))",,,
"List(true, 1744384708000, part-00003-da729f55-638f-449a-841f-c6a2201e0b93-c000.snappy.parquet, 600, {""numRecords"":1,""minValues"":{""id"":1},""maxValues"":{""id"":1},""nullCount"":{""id"":0}}, List(1744384708000001, 1744384708000001, 1744384708000001, 268435456))",,,
"List(true, 1744384708000, part-00004-8003476e-e9f9-473d-90b1-1cbbc69ff1cb-c000.snappy.parquet, 608, {""numRecords"":2,""minValues"":{""id"":0},""maxValues"":{""id"":2},""nullCount"":{""id"":0}}, List(1744384708000002, 1744384708000002, 1744384708000002, 268435456))",,,


In [0]:
# Read commit 1, the log entry written by the OPTIMIZE run, and render its
# actions (add / commitInfo / remove) as a table.
optimize_commit_path = "/FileStore/tables/delta-example/optimize-example/_delta_log/00000000000000000001.json"
optimize_commit_actions = spark.read.format("json").load(optimize_commit_path)
display(optimize_commit_actions)
"""
This is after executing optimize / compaction
Notice dataChange: false here
"""

add,commitInfo,remove
,"List(0411-144935-4ypuffbf, Databricks-Runtime/12.2.x-scala2.12, false, SnapshotIsolation, List(971687235761214), OPTIMIZE, List(618, 618, 618, 1, 0, 1816, 3, 618, 618, 618), List(false, 0, [], []), 0, 1744384949381, 091863b9-6061-416b-b332-f5f23a8da4a4, 7249506876114102, nawatheynupoor1990@gmail.com)",
,,"List(false, 1744384946805, true, part-00001-813d2289-2e55-4c02-af21-71a39f6cb9f6-c000.snappy.parquet, 608, List(1744384708000000, 1744384708000000, 1744384708000000, 268435456))"
,,"List(false, 1744384946805, true, part-00003-da729f55-638f-449a-841f-c6a2201e0b93-c000.snappy.parquet, 600, List(1744384708000001, 1744384708000001, 1744384708000001, 268435456))"
,,"List(false, 1744384946805, true, part-00004-8003476e-e9f9-473d-90b1-1cbbc69ff1cb-c000.snappy.parquet, 608, List(1744384708000002, 1744384708000002, 1744384708000002, 268435456))"
"List(false, 1744384949000, part-00000-b5c6b6da-4592-40f8-9115-1b4e3d0fadd8-c000.snappy.parquet, 618, {""numRecords"":5,""minValues"":{""id"":0},""maxValues"":{""id"":4},""nullCount"":{""id"":0}}, List(1744384708000000, 1744384708000002, 1744384708000000, 268435456))",,


In [0]:
# Read the table back through the Delta reader and show its rows after
# compaction.
optimize_example_dir = "/FileStore/tables/delta-example/optimize-example/"
compacted_rows = spark.read.format("delta").load(optimize_example_dir)
display(compacted_rows)

id
3
4
1
0
2
