In [None]:
# Upgrade pip itself before installing the notebook's packages.
%pip install --upgrade pip

In [None]:
# Install the third-party packages this notebook imports (Faker, fsspec).
# NOTE(review): versions are unpinned — consider pinning them for reproducible runs.
%pip install Faker fsspec

In [None]:
from datetime import datetime, date, timezone
import json
import random

import fsspec
from faker import Faker

# Root of the Lakehouse "Files" area. Replace with your actual Lakehouse path.
LAKEHOUSE_PATH = "abfss://DP700@onelake.dfs.fabric.microsoft.com/deltademo.Lakehouse/Files/"
# JSON file that stores the last issued ID for each id key.
DB_PATH = f"{LAKEHOUSE_PATH}db/last_id.json"

db = {}

# Create the id db only when it does not exist yet. Overwriting it on every
# run would reset the ID counters while previously appended rows are still in
# the table, so a re-run of the notebook would generate duplicate IDs.
_fs, _db_file = fsspec.core.url_to_fs(DB_PATH)
if _fs.exists(_db_file):
    print(f"id db already exists at: {DB_PATH}")
else:
    with fsspec.open(DB_PATH, 'w') as f:
        json.dump(db, f)
    print(f"id db saved to: {DB_PATH}")

In [None]:
def seed_db_key(db:dict, id_key:str)-> dict:
    """Start the counter stored under ``id_key`` at zero.

    The mapping is modified in place and the very same object is returned.
    """
    db.update({id_key: 0})
    return db


def open_id_db(id_key:str)-> dict:
    """Load the id db from DB_PATH and make sure ``id_key`` is present.

    Args:
        id_key: name of the counter that must exist in the returned mapping.

    Returns:
        The mapping read from the JSON file. When ``id_key`` was absent (or
        stored as null) it is seeded with 0 in the returned mapping; the file
        itself is not rewritten here.
    """
    with fsspec.open(DB_PATH, 'r') as f:
        data = json.load(f)
    # Seed only when the key is really missing. A stored 0 is a valid counter
    # value and must not be reported as a fresh seed.
    if data.get(id_key) is None:
        data = seed_db_key(data, id_key)
        print("db seeded")
    return data

def update_last_id(last_id:int, id_key:str)-> None:
    """Persist ``last_id`` under ``id_key`` in the id db at DB_PATH.

    The current mapping is loaded through ``open_id_db``, the entry for
    ``id_key`` is overwritten, and the whole mapping is written back as JSON.
    """
    id_db = open_id_db(id_key)
    id_db.update({id_key: last_id})
    with fsspec.open(DB_PATH, 'w') as out_file:
        json.dump(id_db, out_file)

In [None]:


def create_records_sales(number:int):
    """Generate ``number`` fake sales records with consecutive IDs.

    IDs continue from the value stored under "last_id_sales" in the id db;
    afterwards the stored value is advanced by ``number``.

    Args:
        number: how many records to create; must be >= 0.

    Returns:
        A list of dicts with the keys ID, FirstName, LastName, ProductName,
        Quantity, UnitPrice, TotalPrice and TransactionDate. The list is empty
        when ``number`` is 0.

    Raises:
        ValueError: if ``number`` is negative.
    """
    if number < 0:
        raise ValueError(f"number must be >= 0, got {number}")

    fake = Faker()
    records = []
    last_ids = open_id_db("last_id_sales")
    last_id = last_ids["last_id_sales"]
    # Computed up front so it does not depend on the loop variable, which is
    # never bound when ``number`` is 0.
    next_id = last_id + number
    for _id in range(last_id, next_id):
        first_name = fake.first_name()
        last_name = fake.last_name()
        product_name = fake.word().capitalize()
        quantity = random.randint(1, 10)
        unit_price = round(random.uniform(10, 100), 2)
        # unit_price = -1
        # Rounded to cents: the raw product can carry binary float noise
        # (e.g. 99.99000000000001).
        total_price = round(quantity * unit_price, 2)
        _date = fake.date_between_dates(date(2020,1,1), date.today())

        records.append({
            'ID': _id,
            'FirstName': first_name,
            'LastName': last_name,
            'ProductName': product_name,
            'Quantity': quantity,
            'UnitPrice': unit_price,
            'TotalPrice': total_price,
            'TransactionDate': _date
        })
    update_last_id(next_id, "last_id_sales")

    return records

In [None]:
# Create the `sales` Delta table; IF NOT EXISTS makes this a no-op when the
# table is already there, so the cell can be re-run.
# The statement is a plain string: it has nothing to interpolate.

spark.sql(
    """
CREATE TABLE IF NOT EXISTS sales (
    ID INT,
    FirstName STRING,
    LastName STRING,
    ProductName STRING,
    Quantity INT,
    UnitPrice DOUBLE,
    TotalPrice DOUBLE,
    TransactionDate DATE
)
USING DELTA
"""
)


In [None]:
from pyspark.sql import functions as F

# Insert records: append NUM_BATCHES batches of BATCH_SIZE generated rows.
# Each iteration performs its own append to the `sales` table.
NUM_BATCHES = 11
BATCH_SIZE = 20

for _ in range(NUM_BATCHES):
    new_data = create_records_sales(BATCH_SIZE)

    data_to_append = spark.createDataFrame(new_data)

    # Put the columns in the table's column order and cast the integer
    # columns to INT, matching the types declared in CREATE TABLE.
    data_to_append = data_to_append.select(
        F.col("ID").cast("int"),
        "FirstName",
        "LastName",
        "ProductName",
        F.col("Quantity").cast("int"),
        "UnitPrice",
        "TotalPrice",
        "TransactionDate",
    )

    data_to_append.write.format("delta").mode("append").saveAsTable("sales")
    print(f"added {BATCH_SIZE} records to sales table")


# Table History

In [None]:
from delta.tables import DeltaTable

# Handle to the `sales` table, looked up by name through the Delta Lake API.
sales_delta = DeltaTable.forName(spark, "sales")

# Render the table's history as returned by DeltaTable.history().
display(sales_delta.history())

# Time Travel

`version_1 = spark.read.format("delta").option("versionAsOf", 1).table("sales")`

`version_2 = spark.read.format("delta").option("versionAsOf", 2).table("sales")`

In [None]:
# Read the `sales` table with the "versionAsOf" reader option set to 1 and 2,
# i.e. two snapshots of the table at those versions.
version_1 = spark.read.format("delta").option("versionAsOf", 1).table("sales")
version_2 = spark.read.format("delta").option("versionAsOf", 2).table("sales")

In [None]:
# Print the row count of each snapshot so the two versions can be compared.
print(version_1.count())
print(version_2.count())

In [None]:
# Time travel by timestamp. The timestamp is taken from the table's own
# history (commit time of version 1) instead of being hard-coded, so the cell
# works on any run of the notebook. The cast to string happens inside Spark so
# the value is rendered and parsed with the same session time zone.
version_1_commit_ts = (
    DeltaTable.forName(spark, "sales")
    .history()
    .where("version = 1")
    .selectExpr("CAST(timestamp AS STRING) AS ts")
    .first()["ts"]
)
version_ts = spark.read.format("delta").option("timestampAsOf", version_1_commit_ts).table("sales")


In [None]:
# Row count of the snapshot selected by timestamp (shown as the cell output).
version_ts.count()

# Table retention (Table Properties)

## File Retention

`delta.deletedFileRetentionDuration`

## Log Retention

`delta.logRetentionDuration`



In [None]:
# Look the `sales` table up again and render the result of DeltaTable.detail().
sales_delta = DeltaTable.forName(spark, "sales")

display(sales_delta.detail())

# File Compaction (reduction)



In [None]:
# Number of partitions the table data is repartitioned into before it is written back.
numFiles = 1

# Read the whole `sales` table, repartition it to `numFiles` partitions and
# overwrite the same table with the result.
# NOTE(review): the "DataChange" = "false" option looks like Delta's dataChange
# flag for layout-only rewrites — confirm against the Delta Lake docs.
(spark.read
 .format("delta")
 .table("sales")
 .repartition(numFiles)
 .write
 .option("DataChange", "false")
 .format("delta")
 .mode("overwrite")
 .saveAsTable("sales")
 )

In [None]:
# Switch the Delta retention-duration check off for this Spark session.
# NOTE(review): presumably required so a vacuum with 0 retention hours is not rejected — confirm.
spark.sql("SET spark.databricks.delta.retentionDurationCheck.enabled = false")

# Vacuum

In [None]:
sales_delta = DeltaTable.forName(spark, "sales")

# Vacuum the table with a retention of 0 hours.
# NOTE(review): presumably this deletes every data file the current version no
# longer references, which would break time travel to older versions — confirm.
sales_delta.vacuum(retentionHours = 0)

In [None]:

# Table whose retention properties are changed below.
table = "sales"

# Turn the session-level retention-duration check back on, then set both
# retention table properties (deleted files and log) to zero hours.
spark.sql("SET spark.databricks.delta.retentionDurationCheck.enabled = true")
spark.sql(f"ALTER TABLE {table} SET TBLPROPERTIES ('delta.deletedFileRetentionDuration' = '0 hours');")
spark.sql(f"ALTER TABLE {table} SET TBLPROPERTIES ('delta.logRetentionDuration' = 'interval 0 hours');")



In [None]:
# Render DeltaTable.detail() again, now that the table properties have been altered.
sales_delta = DeltaTable.forName(spark, "sales")

display(sales_delta.detail())

In [None]:
%%sql

-- List the table properties currently set on the sales table.
SHOW TBLPROPERTIES sales

In [None]:
sales_delta = DeltaTable.forName(spark, "sales")

# Vacuum with 0 retention hours again; at this point the session-level
# retention check has been re-enabled and the table's own retention
# properties have been set to zero hours.
sales_delta.vacuum(0)

In [None]:
# reset demo: remove the id db folder and drop the sales table.

# Same folder DB_PATH lives in, so the reset removes what the notebook created.
db_dir = f"{LAKEHOUSE_PATH}db"

try:
    mssparkutils.fs.rm(db_dir, True) # remove files (recursive)
    print("folder removed")
except Exception as exc:
    # Report the real reason instead of assuming the folder was already gone.
    print(f"folder not removed (it may already be gone): {exc}")

# IF EXISTS keeps the reset re-runnable when the table was already dropped.
spark.sql("DROP TABLE IF EXISTS sales")
print("table dropped")