In [0]:
"""
https://delta.io/blog/remove-files-delta-lake-vacuum-command/
https://github.com/delta-io/delta-examples/blob/master/notebooks/pyspark/vacuum.ipynb
"""

"""
Delta tables can be saved in a variety of storage systems like HDFS, AWS S3, or Azure Blob Storage. These storage systems usually cost money.

If your Delta table contains a lot of tombstoned files that you don't need because you're not time traveling beyond the retention period, then you can just vacuum the tombstoned files and save on storage costs.

Vacuuming obviously isn't viable in all situations. You may need to keep tombstoned files so you can time travel, for regulatory requirements, or for audit data retention. Vacuuming is a great way to save on storage costs, but it's not always desirable.
"""

# Delta Lake vacuum example


In [0]:
# Start from a clean slate so the notebook survives Restart & Run All:
# saveAsTable defaults to error-if-exists and would fail on a second run.
# Dropping (rather than overwriting) also resets the table history, so the
# version numbers used by the time-travel cells further down stay valid.
spark.sql("DROP TABLE IF EXISTS vacuum_example")

# First write: two seed rows.
df = spark.createDataFrame([("bob", 3), ("sue", 5)]).toDF("first_name", "age")
# repartition(1) collapses the frame to one partition before writing
# (presumably so each write lands as a single data file, which keeps the
# later VACUUM listings short).
df.repartition(1).write.format("delta").saveAsTable("vacuum_example")

In [0]:
# Second write: append two more rows to the existing Delta table.
df = spark.createDataFrame(
    [("ingrid", 58), ("luisa", 87)],
    schema=["first_name", "age"],
)
(
    df.repartition(1)
    .write.format("delta")
    .mode("append")
    .saveAsTable("vacuum_example")
)

In [0]:
# Sanity check: the table should now hold the rows from both writes.
spark.table("vacuum_example").show()

+----------+---+
|first_name|age|
+----------+---+
|    ingrid| 58|
|     luisa| 87|
|       bob|  3|
|       sue|  5|
+----------+---+



# First pass at vacuum

In [0]:
# DRY RUN only reports the files VACUUM would delete; it removes nothing.
# So far the table has only been created and appended to, so no data file
# has been tombstoned yet. Note the result is not .show()n here, so only
# the DataFrame repr is displayed.
spark.sql("VACUUM vacuum_example DRY RUN")

Out[10]: DataFrame[path: string]

# Overwrite Delta table to tombstone files


In [0]:
# Replace the table contents. The overwrite does not delete the previous
# data files; it tombstones them, which is what makes them candidates for
# VACUUM later on.
df = spark.createDataFrame(
    [("jordana", 26), ("fred", 25)],
    schema=["first_name", "age"],
)
(
    df.repartition(1)
    .write.format("delta")
    .mode("overwrite")
    .saveAsTable("vacuum_example")
)
spark.table("vacuum_example").show()


+----------+---+
|first_name|age|
+----------+---+
|   jordana| 26|
|      fred| 25|
+----------+---+



In [0]:
# No RETAIN clause, so the default retention window applies: 7 days
# (24 * 7 = 168 hours) -- the same window the JDBC project used. The files
# tombstoned by the overwrite are newer than that, so this dry run is not
# expected to list them.
spark.sql("VACUUM vacuum_example DRY RUN")

Out[12]: DataFrame[path: string]

In [0]:
# Expected to fail: with the retention-duration check still enabled, asking
# for RETAIN 0 HOURS is rejected with an IllegalArgumentException. The
# error is the point of this cell.
spark.sql("VACUUM vacuum_example RETAIN 0 HOURS DRY RUN")

[0;31m---------------------------------------------------------------------------[0m
[0;31mIllegalArgumentException[0m                  Traceback (most recent call last)
File [0;32m<command-2218377028571736>:1[0m
[0;32m----> 1[0m [43mspark[49m[38;5;241;43m.[39;49m[43msql[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mVACUUM some_people RETAIN 0 HOURS DRY RUN[39;49m[38;5;124;43m"[39;49m[43m)[49m

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m---> 48[0m     res [38;5;241m=[39m [43mfunc[49m[43m([49m[38;5;241;43m*[39;49m[43margs[49m[43m,[49m[43m [49m[38;5;241;43m*[39;49m[38;5;241;43m*[39;49m[43mkwargs[49m[43m)[49m
[1;32m     49[0m     logger[38;5;241m.[39mlog_success(
[1;32m     50[0m         module_name, clas


# DISABLE RETENTION DURATION CHECK (DON'T DO THIS IN PROD)


In [0]:
# Turn off the safety check that rejected RETAIN 0 HOURS, so a retention
# shorter than the default is accepted. Demo only -- do not do this in
# production: it allows deleting files that time travel may still need.
spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")

In [0]:
# Same dry run as before, now accepted. With zero retention every
# tombstoned file is eligible; the two paths listed are presumably the
# data files of the create and append writes that the overwrite replaced.
# truncate=False keeps the full file paths readable.
spark.sql("VACUUM vacuum_example RETAIN 0 HOURS DRY RUN").show(truncate=False)

+------------------------------------------------------------------------------------------------------------+
|path                                                                                                        |
+------------------------------------------------------------------------------------------------------------+
|dbfs:/user/hive/warehouse/vacuum_example/part-00000-4bd594ac-6716-4ede-9801-5bc1783f3ce1-c000.snappy.parquet|
|dbfs:/user/hive/warehouse/vacuum_example/part-00000-93326e8e-b633-4016-9c0e-82107ec35d96-c000.snappy.parquet|
+------------------------------------------------------------------------------------------------------------+



In [0]:
# The real thing (no DRY RUN): physically deletes the tombstoned files
# from storage. This cannot be undone.
spark.sql("VACUUM vacuum_example RETAIN 0 HOURS")

Out[17]: DataFrame[path: string]

# Vacuuming limits ability to time travel


In [0]:
# The current version is unaffected by VACUUM: its data file is still
# referenced by the table, so it was never a deletion candidate.
spark.sql("SELECT * FROM vacuum_example").show()

+----------+---+
|first_name|age|
+----------+---+
|   jordana| 26|
|      fred| 25|
+----------+---+



In [0]:
# Version 2 is the overwrite commit (0 = create, 1 = append, 2 = overwrite),
# i.e. the current version, so time travel to it still works.
spark.sql("SELECT * FROM vacuum_example VERSION AS OF 2").show()

+----------+---+
|first_name|age|
+----------+---+
|   jordana| 26|
|      fred| 25|
+----------+---+



In [0]:
# Expected to fail: the data files behind version 1 were physically removed
# by the VACUUM command, so time travel to that version is no longer
# possible. The error is the point of this cell.
spark.sql("SELECT * FROM vacuum_example VERSION AS OF 1").show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
File [0;32m<command-2218377028571744>:1[0m
[0;32m----> 1[0m [43mspark[49m[38;5;241;43m.[39;49m[43msql[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mSELECT * FROM vacuum_example VERSION AS OF 1[39;49m[38;5;124;43m"[39;49m[43m)[49m[38;5;241;43m.[39;49m[43mshow[49m[43m([49m[43m)[49m

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m---> 48[0m     res [38;5;241m=[39m [43mfunc[49m[43m([49m[38;5;241;43m*[39;49m[43margs[49m[43m,[49m[43m [49m[38;5;241;43m*[39;49m[38;5;241;43m*[39;49m[43mkwargs[49m[43m)[49m
[1;32m     49[0m     logger[38;5;241m.