In [None]:
# run first. then have fun.
from pyspark.sql.functions import col
from delta.tables import DeltaTable

# set the session's parquet compression codec to zstd
spark.conf.set("spark.sql.parquet.compression.codec", "zstd")

# dataset root, and the delta directory that sits directly beneath it
dataset_dir = "/opt/spark/work-dir/hitchhikers_guide/datasets/ecomm_behavior_data"
delta_path = dataset_dir + "/delta"

# bare table name, plus the same name qualified with the `default` database
dl_table_name = "ecomm"
dl_managed_table = "default." + dl_table_name

# Intro to Delta Lake Streaming
The following section will reuse the **Delta Lake** `default.ecomm` table created during [Streaming First Steps](./streaming-first-steps.ipynb).

> Note: Run the following cell to check that you have the local table. You should see `Table(name='ecomm', database='default', description=None, tableType='MANAGED', isTemporary=False)` somewhere in the list (you may have more than one table from earlier work in the Guide).

In [None]:
# switch the session to the `default` database, then show the tables it holds
# (the final bare expression is what the notebook renders as cell output)
_catalog = spark.catalog
_catalog.setCurrentDatabase("default")
_catalog.listTables()

> Note: If you see `java.sql.SQLException: Failed to start database 'metastore_db' with class loader jdk.internal.loader.ClassLoaders$AppClassLoader...` then you need to detach the `kernel` from the other notebook you have open. You can only have one notebook running with the local Metastore.

# Cleaning up with Vacuum.
We are done with the introduction to Streaming. The First Steps notebook covered creating tables and modifying table properties, as well as understanding a little more about the structure of a Delta Lake table. During normal processing, you most likely overwrote or deleted some data. Each transaction that affects the data in a given Delta Lake table can leave behind artifacts (call them orphaned data or files) that are no longer needed for the *CURRENT* version of the Delta Lake table. The cell below removes them by running `vacuum` with a retention of zero hours, which also discards the history needed to go back to earlier versions. We will learn more about using `vacuum` while preserving enough history to `undo`, `rewind`, or `time-travel` to a particular point in Table Time later in the Guide.

In [None]:
# Vacuum the table with a retention of 0 hours, i.e. drop every file that the
# current table version no longer references.
#
# Delta Lake's retention-duration safety check rejects such a short retention,
# so the check is switched off only for the duration of the vacuum call.
_retention_check_key = "spark.databricks.delta.retentionDurationCheck.enabled"

spark.conf.set(_retention_check_key, "false")
try:
    DeltaTable.forPath(spark, f"{delta_path}/{dl_table_name}").vacuum(retentionHours=0)
finally:
    # re-enable the safety check even when vacuum fails, so a later vacuum in
    # this session cannot silently run with the guard turned off
    spark.conf.set(_retention_check_key, "true")