In [1]:
from delta.tables import DeltaTable

# Filesystem location of the Delta table used throughout this notebook.
table_path = "/opt/spark/work-dir/hitchhikers_guide/datasets/ecomm_behavior_data/delta/ecomm/"

# Open a handle on the table that already exists at table_path.
# Nothing is created here, so treat the underlying data with care.
dt = DeltaTable.forPath(sparkSession=spark, path=table_path)

# Governance 101
> We will walk through a simple method to automatically retain Delta tables for N-days
> It is also worth pointing out that in this example the default state is "off". This is a safety precaution: not all tables will share the same retention interval, so it is best to test-drive the policy on a table before flipping any of these table-level feature flags.

In [2]:
from pyspark.sql.functions import lit, col, make_interval, make_dt_interval

> Note: Because a local hive-metastore is running, the first run may take a while as the required jars are fetched on demand.

In [None]:
# Record the retention settings (enabled flag, date column, policy interval)
# as table properties on the Delta table at table_path.
retention_ddl = f"""
ALTER TABLE delta.`{table_path}`
SET TBLPROPERTIES (
  'catalog.table.gov.retention.enabled'='true',
  'catalog.table.gov.retention.date_col'='event_date',
  'catalog.table.gov.retention.policy'='interval 28 days'
)
"""
spark.sql(retention_ddl)

## How to Fetch the Governance Properties
1. We need to understand if "governance" is enabled on the table
2. If it is enabled, then we need to fetch the retention policy, or default to a sane retention (90 days in the example)

In [None]:
# Table property values come back as strings, so the flag has to be parsed
# explicitly: bool('false') is True because any non-empty string is truthy.
props = dt.detail().first()['properties'] or {}
table_retention_enabled = (
    str(props.get('catalog.table.gov.retention.enabled', 'false')).strip().lower() == 'true'
)
# Fall back to a 90 day policy when the table does not declare one.
table_retention_policy = props.get('catalog.table.gov.retention.policy', 'interval 90 days')

## Generate a Function to parse the `interval` string
> The following `convert_to_interval` method is simple for the example. This could easily be a PySpark UDF that takes a Column of `StringType` to extract the interval and convert to the catalyst `IntervalType`.

In [6]:
import re
from pyspark.sql.functions import lit, col, make_interval, make_dt_interval
def convert_to_interval(interval: str):
    """
    Turn a retention-policy string into a day-time interval via make_dt_interval.

    Accepted form: "[interval] <amount> <unit>", for example "interval 28 days"
    or "12 hours". Matching is case-insensitive and leading/trailing whitespace
    is ignored.

    Args:
        interval: the policy string. <amount> must be a non-negative whole
            number. <unit> is one of days, hours, mins or secs; the singular
            forms and minute(s)/second(s) are accepted as aliases.

    Returns:
        The value of make_dt_interval(days, hours, mins, secs) with only the
        slot that matches <unit> populated (the other slots are passed as None).

    Raises:
        ValueError: if the string is not "<amount> <unit>", the amount is not
            an integer or is negative, or the unit is not recognised.
    """
    target = interval.strip().lower()
    # Drop only the leading keyword; str.replace would strip every occurrence.
    if target.startswith("interval"):
        target = target[len("interval"):].strip()

    parts = re.split(r"\s+", target)
    if len(parts) != 2:
        raise ValueError(
            f"Expected '[interval] <amount> <unit>' but got {interval!r}"
        )
    number, interval_type = parts
    amount = int(number)  # raises ValueError for non-integer amounts
    if amount < 0:
        # A negative retention would put the cutoff in the future.
        raise ValueError(f"Retention amount must be non-negative, got {amount}")

    unit_aliases = {
        "day": "days", "days": "days",
        "hour": "hours", "hours": "hours",
        "min": "mins", "mins": "mins", "minute": "mins", "minutes": "mins",
        "sec": "secs", "secs": "secs", "second": "secs", "seconds": "secs",
    }
    unit = unit_aliases.get(interval_type)
    if unit is None:
        raise ValueError(f"Unknown interval_type {interval_type}")

    # NOTE(review): the caps below are kept exactly as originally written.
    # They silently shorten large policies and are not monotonic (365 days
    # passes through unchanged while 366 becomes 364) - confirm the intent.
    dt_interval = [None, None, None, None]
    if unit == "days":
        dt_interval[0] = lit(364 if amount > 365 else amount)
    elif unit == "hours":
        dt_interval[1] = lit(23 if amount > 24 else amount)
    elif unit == "mins":
        dt_interval[2] = lit(59 if amount > 60 else amount)
    else:  # unit == "secs"
        dt_interval[3] = lit(59 if amount > 60 else amount)

    return make_dt_interval(
        days=dt_interval[0],
        hours=dt_interval[1],
        mins=dt_interval[2],
        secs=dt_interval[3]
    )

In [9]:
from pyspark.sql.functions import to_date

# Build the interval expression from the policy string read off the table.
interval = convert_to_interval(table_retention_policy)

# Single-row frame holding the current timestamp, then derive the cutoff:
# retain_after is the date obtained by subtracting the interval from "now".
now_df = spark.sql("select current_timestamp() as now")
with_interval_df = now_df.withColumn("retention_interval", interval)
cutoff_date = to_date(col("now") - col("retention_interval"))
rules = with_interval_df.withColumn("retain_after", cutoff_date)

rules.show(truncate=False)

+--------------------------+------------------------------------+------------+
|now                       |retention_interval                  |retain_after|
+--------------------------+------------------------------------+------------+
|2024-06-03 19:19:33.554395|INTERVAL '28 00:00:00' DAY TO SECOND|2024-05-06  |
+--------------------------+------------------------------------+------------+



### Where to Go Next
Now that we've created the rules engine to read reusable "intervals" from our Delta table properties, we can use conditional deletes to ensure we retain only the correct amount of data in our tables.

> Note: This pattern works for timeseries tables. In the case where there is "timeless" data, this pattern won't do much good :) 