In [None]:
# Register the DeltaStorageInventory module with the Spark context so that the
# later `from DeltaStorageInventory import *` cell can resolve it.
# NOTE(review): the URL tracks the moving `main` branch, so the code that runs
# can change between sessions - consider pinning to a tag or commit.
sc.addPyFile('https://raw.githubusercontent.com/microsoft/FabricBQSync/main/Packages/FabricSync/FabricSync/DeltaStorageInventory.py')

In [None]:
# ---- Blob Inventory source ----
# Corresponds to the blob inventory snapshot; defaults to current day.
# NOTE(review): inventory_dt is not referenced by any cell visible in this
# notebook - confirm whether it should be passed to DeltaStorageInventory.
inventory_dt = None
# Blob Inventory rule name.
inventory_rule = "Demo"
# Scoped container for the Blob Inventory.
container = "demodata"
# Blob Inventory output type.
output_type = "csv"

# ---- Storage mounts and inventory output ----
# Mount to the storage container that holds the delta tables.
storage_prefix = "Files/demodata/"
# Lakehouse or schema the inventory output is stored in.
lakehouse = "DeltaInventory"
# Mount to the storage container that holds the blob inventory files.
inventory_path = "Files/demodata/"
# Forwarded as the `parallelism` argument of DeltaStorageInventory.
parallelism = 5
# Set to True to analyze the delta inventory over time.
track_history = False

In [None]:
from notebookutils import mssparkutils
from DeltaStorageInventory import *

In [None]:
# Make sure the target Lakehouse exists before the inventory is written to it.
try:
    mssparkutils.lakehouse.get(lakehouse)
except Exception:
    # Best-effort: any failure of the lookup is treated as "Lakehouse missing"
    # (presumably get() raises when it does not exist - TODO confirm).
    # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
    # SystemExit propagate instead of triggering a create when the cell is
    # interrupted.
    print(f"Creating {lakehouse} Lakehouse...")
    mssparkutils.lakehouse.create(lakehouse)

In [None]:
# Build the inventory processor from the notebook parameters.
# Line continuations are implicit inside the parentheses, so no backslashes
# are needed.
delta_inventory = DeltaStorageInventory(
    session=spark,
    storage_prefix=storage_prefix,
    container=container,
    target_lakehouse=lakehouse,
    parallelism=parallelism,
    track_history=track_history,
)

# Run the inventory against the Blob Inventory output of the configured rule.
delta_inventory.run_from_storage_inventory(
    rule=inventory_rule,
    inventory_data_path=inventory_path,
    inventory_output_type=output_type,
)

# Example Delta Inventory Queries

In [None]:
%%sql
-- Lowest and highest entry of the delta_versions array for each table.
SELECT
  t.delta_table,
  ARRAY_MIN(t.delta_versions) AS min_version,
  ARRAY_MAX(t.delta_versions) AS max_version
FROM DeltaInventory.delta_tables AS t

In [None]:
%%sql
-- Every column and row of the delta_table_snapshot inventory table.
SELECT * FROM DeltaInventory.delta_table_snapshot

In [None]:
%%sql
-- Per-table rollup of the partition-level inventory: distinct partitions plus
-- summed file counts and sizes (current and removed).
SELECT
  p.delta_table,
  COUNT(DISTINCT p.delta_partition) AS partition_count,
  SUM(p.files_count)                AS file_count,
  SUM(p.file_size)                  AS file_size,
  SUM(p.removed_files_count)        AS removed_files_count,
  SUM(p.removed_file_size)          AS removed_file_size
FROM DeltaInventory.delta_table_partitions AS p
GROUP BY p.delta_table

In [None]:
%%sql
-- Per table and Delta version: total size of files whose operation is "ADD"
-- versus "REMOVE", with the version timestamp taken from the history table.
SELECT
  f.delta_table,
  f.file_info["delta_version"] AS delta_version,
  h.`timestamp` AS version_dt,
  SUM(
    CASE WHEN f.file_info["operation"] = "ADD" THEN f.file_info["file_size"] ELSE 0 END
  ) AS active_size,
  SUM(
    CASE WHEN f.file_info["operation"] = "REMOVE" THEN f.file_info["file_size"] ELSE 0 END
  ) AS out_of_scope_size
FROM DeltaInventory.delta_table_files AS f
INNER JOIN DeltaInventory.delta_table_history AS h
  -- Match file rows to the history row of the same table, inventory run and version.
  ON f.delta_table_id = h.delta_table_id
  AND f.inventory_date = h.inventory_date
  AND f.file_info["delta_version"] = h.version
GROUP BY
  f.delta_table,
  delta_version,
  version_dt

In [None]:
%%sql
-- Count of history entries per table for selected operations, pivoted into
-- one column per operation. Table-creation operations are excluded up front.
-- The schema is DeltaInventory (the value of the `lakehouse` parameter); the
-- previous `delta_inventory` name did not match it.
SELECT * FROM (
  SELECT
    delta_table,
    operation
  FROM DeltaInventory.delta_table_history
  WHERE operation NOT IN (
    'CREATE OR REPLACE TABLE AS SELECT',
    'CREATE TABLE AS SELECT')
)
PIVOT (
  COUNT(*)
  FOR operation IN (
    'MERGE',
    'WRITE',
    'VACUUM END',
    'VACUUM START',
    'OPTIMIZE'
  )
)