In [1]:
# Parameters
# --- Source: workspace / lakehouse to scan and the comma-separated tables to process
source_workspace_id = "6b887591-99a7-4544-a8c4-80d4a784d7b1"
source_lakehouse_name = "Contoso_D365_LH"
tables_list = "salestable,salesline"
# --- Window: look-back in hours, and how many versions are added per CDF read
hours_back = 2
versions_bucket_size = 5
# --- Target: table receiving the results
target_table_name = "cdf_staleness_table"
# Leave target_table_base_path empty to use the standard tables' path
target_table_base_path = ""
# --- Output: set to True to display one sample row per written batch
display_sample = False

In [2]:
from deltalake import DeltaTable
from pyspark.sql.functions import col
from datetime import datetime, timedelta
import json
import pyarrow as pa
import pandas as pd
import numpy as np

import re



# ---------------- Helpers ----------------

def deduplicate_lakehouse_table_python(
    input_path: str,

):
    """
    Remove duplicate rows from the Delta table at ``input_path`` and rewrite it in place.

    Rows are sorted by ``first_SinkCreatedOn`` then ``commit_version`` (ascending) and,
    for every ``(Table_Name, recid, versionnumber)`` key, only the first row is kept.
    The table is then overwritten (data and schema), partitioned by ``Table_Name``.

    Parameters
    ----------
    input_path : str
        Location of the Delta table to read and to overwrite.

    Returns
    -------
    int
        Number of rows remaining after deduplication.
    """
    from deltalake import write_deltalake

    order_cols = ["first_SinkCreatedOn", "commit_version"]
    # Table_Name is part of the key: the table stores rows of several source tables,
    # so rows of different tables must never be treated as duplicates of each other.
    key_cols = ["Table_Name", "recid", "versionnumber"]

    # Load the whole table into a single DataFrame
    print("Reading DeltaTable files...")
    df = DeltaTable(input_path).to_pandas()

    print(f"Loaded {len(df):,} rows.")

    # Deduplicate: earliest row per key wins
    print("Deduplicating...")
    dedup_df = (
        df.sort_values(order_cols, ascending=[True] * len(order_cols))
        .drop_duplicates(subset=key_cols, keep="first")
        .reset_index(drop=True)
    )

    print(f"Deduplicated: {len(dedup_df):,} rows remain (removed {len(df)-len(dedup_df):,}).")

    # Overwrite the table that was read. The write must target input_path, not a
    # notebook-level variable, so the function works for any path it is given.
    print("Writing back...")
    write_deltalake(
        input_path,
        dedup_df,
        mode="overwrite",
        schema_mode="overwrite",
        partition_by=["Table_Name"],
    )

    print(f"Deduplication complete. Table updated at: {input_path}")
    return len(dedup_df)

def getTargetPath(target_table, target_table_base_path = ""):
    """
    Build the storage path of ``target_table``.

    Parameters
    ----------
    target_table : str
        Name of the table whose path is wanted.
    target_table_base_path : str, optional
        Folder that contains the table. When empty, the folder is derived from the
        location of the first table returned by
        ``notebookutils.lakehouse.listTables(source_lakehouse_name, source_workspace_id)``.

    Returns
    -------
    str
        ``<base>/<target_table>``.

    Raises
    ------
    RuntimeError
        If no base path is given and the lakehouse listing is empty.
    """
    if target_table_base_path:
        # Tolerate a trailing slash so the result never contains "//"
        return f"{target_table_base_path.rstrip('/')}/{target_table}"

    # Derive the Lakehouse "Tables" base path from any existing table location
    # Example existing location:
    #   abfss://<workspace-id>@onelake.dfs.fabric.microsoft.com/<lakehouse-id>/Tables/<SomeTable>
    tables_now = notebookutils.lakehouse.listTables(source_lakehouse_name, source_workspace_id)
    if not tables_now:
        raise RuntimeError("Cannot infer Lakehouse Tables path (no tables found).")

    any_loc = tables_now[0]["location"]
    tables_base = any_loc.rstrip("/").rsplit("/", 1)[0]   # strip the trailing table name
    return f"{tables_base}/{target_table}"                # .../Tables/<target_table>

def writeinthelake(partial_result, target_table, target_path, mode="overwrite",):
    """
    Write ``partial_result`` to the Delta table at ``target_path``.

    Parameters
    ----------
    partial_result : tabular data accepted by ``write_deltalake``
        Rows to write; must contain a ``Table_Name`` column (partition key).
    target_table : str
        Not used by the function; kept so existing callers keep working.
    target_path : str
        Location of the Delta table to write to.
    mode : str, optional
        Write mode forwarded to ``write_deltalake`` (default ``"overwrite"``).

    Returns
    -------
    int
        Number of rows in ``partial_result``.
    """
    from deltalake import write_deltalake

    # The existing table is deliberately NOT read here: loading it was never used,
    # cost a full table scan per call, and failed when the table did not exist yet.
    # NOTE(review): schema_mode="overwrite" is passed for every mode, including
    # "append" - confirm the installed deltalake version accepts that combination.
    write_deltalake(
        target_path,
        partial_result,
        mode=mode,
        schema_mode="overwrite",
        partition_by=["Table_Name"],
    )
    return len(partial_result)
 


def to_datetime_ms_or_parse(series):
    """
    Convert epoch-ms ints/strings or ISO strings to UTC datetimes. Treat 0 as missing.
    Works with Arrow-backed dtypes too.

    Parameters
    ----------
    series : pandas.Series
        Values to convert. Datetime / Arrow-timestamp input is only normalised to UTC.
        Integer, float and object input is read as epoch milliseconds; in an object
        Series, entries that are not numeric are parsed as date strings instead.
        Any other dtype is handed to ``pd.to_datetime`` for parsing.

    Returns
    -------
    pandas.Series
        Timezone-aware (UTC) datetimes; unparseable values and 0 become ``NaT``.
    """
    s = series
    if pd.api.types.is_datetime64_any_dtype(s):
        return pd.to_datetime(s, errors="coerce", utc=True)

    pa_dtype = getattr(s.dtype, "pyarrow_dtype", None)
    is_arrow_int = pa_dtype is not None and pa.types.is_integer(pa_dtype)
    is_arrow_ts = pa_dtype is not None and pa.types.is_timestamp(pa_dtype)

    if is_arrow_ts:
        return pd.to_datetime(s, errors="coerce", utc=True)

    is_object = s.dtype == object
    if is_arrow_int or pd.api.types.is_integer_dtype(s) or pd.api.types.is_float_dtype(s) or is_object:
        numeric = pd.to_numeric(s, errors="coerce")
        nonzero = numeric.where(numeric != 0, pd.NA)  # treat 0 as missing
        from_epoch = pd.to_datetime(nonzero, unit="ms", errors="coerce", utc=True)
        if not is_object:
            return from_epoch

        # Object columns may hold date strings next to epoch values. Entries that are
        # present but not numeric are parsed as dates instead of being dropped as NaT.
        needs_parse = numeric.isna() & s.notna()
        if not needs_parse.any():
            return from_epoch
        parsed = pd.to_datetime(s.where(needs_parse), errors="coerce", utc=True)
        # Merge through object dtype so differing datetime resolutions cannot clash.
        merged = from_epoch.astype(object).where(~needs_parse, parsed.astype(object))
        return pd.to_datetime(merged, errors="coerce", utc=True)

    return pd.to_datetime(s, errors="coerce", utc=True)


# ---------------- Main function ----------------

def _empty_staleness_frame():
    """Return an empty DataFrame that carries the staleness output columns."""
    return pd.DataFrame(columns=[
        "Table_Name", "Id", "recid", "versionnumber", "Version_Staleness",
        "commit_version",
        "modifiedon", "createdon",
        "first_SinkCreatedOn", "last_SinkCreatedOn",
    ])


def _format_staleness(delta):
    """Render a timedelta Series as ``[-]dd.HH:mm:ss`` strings; missing values become ``pd.NA``."""
    sign = np.where(delta.notna() & (delta < pd.Timedelta(0)), "-", "")
    abs_delta = delta.abs()

    # Days, hours, minutes, seconds (hours/minutes/seconds are within the day)
    days = abs_delta.dt.days
    hours = abs_delta.dt.seconds // 3600
    minutes = (abs_delta.dt.seconds % 3600) // 60
    seconds = abs_delta.dt.seconds % 60

    return np.where(
        delta.notna(),
        sign
        + days.astype("Int64").astype(str).str.zfill(2) + "."
        + hours.astype("Int64").astype(str).str.zfill(2) + ":"
        + minutes.astype("Int64").astype(str).str.zfill(2) + ":"
        + seconds.astype("Int64").astype(str).str.zfill(2),
        pd.NA
    )


def compute_cdf_staleness(table_uri, starting_version=0, ending_version = 0, storage_options=None):
    """
    Compute, per ``(Id, versionnumber)``, how long a row version took to reach the table.

    The Change Data Feed of the Delta table at ``table_uri`` is read between
    ``starting_version`` and ``ending_version``. Among ``insert`` /
    ``update_postimage`` changes, the earliest and the latest change of every
    ``(Id, versionnumber)`` are located; staleness is the ``SinkCreatedOn`` of the
    earliest change minus the row's modification timestamp.

    Parameters
    ----------
    table_uri : str
        Location of the Delta table. Its last path segment becomes ``Table_Name``.
    starting_version, ending_version : int, optional
        Version bounds forwarded to ``DeltaTable.load_cdf``.
    storage_options : dict, optional
        Forwarded to ``DeltaTable``; ``None`` is replaced by an empty dict.

    Returns
    -------
    pandas.DataFrame
        Columns ``Table_Name, Id, recid, versionnumber, Version_Staleness,
        commit_version, modifiedon, createdon, first_SinkCreatedOn,
        last_SinkCreatedOn``, sorted by ``Table_Name, Id, versionnumber``.
        ``Version_Staleness`` is a ``[-]dd.HH:mm:ss`` string, forced to zero when the
        modification timestamp is missing, epoch zero or 1900-01-01, and ``pd.NA``
        when it cannot be computed. The frame is empty when the feed yields nothing.

    Raises
    ------
    TypeError
        If the feed yields something other than a pyarrow RecordBatch or Table.
    ValueError
        If the CDF metadata columns or the required domain columns are missing.
    """
    dt = DeltaTable(table_uri, storage_options=storage_options or {})
    cdf_iter = dt.load_cdf(starting_version=starting_version, ending_version=ending_version)

    # No CDF iterator: return empty schema
    if cdf_iter is None:
        print("CDF is null")
        return _empty_staleness_frame()

    # Collect Arrow chunks
    tables = []
    for chunk in cdf_iter:
        if isinstance(chunk, pa.RecordBatch):
            tables.append(pa.Table.from_batches([chunk]))
        elif isinstance(chunk, pa.Table):
            tables.append(chunk)
        else:
            raise TypeError(f"Unexpected CDF chunk type: {type(chunk)}")

    if not tables:
        print("No tables")
        return _empty_staleness_frame()

    # Concat tables (new API first, fallback to old)
    try:
        cdf_tbl = pa.concat_tables(tables, promote_options="default")
    except TypeError:
        cdf_tbl = pa.concat_tables(tables, promote=True)

    df = cdf_tbl.to_pandas(types_mapper=pd.ArrowDtype)

    # Pick/normalize core CDF metadata cols
    def pick_col(opts, default=None):
        """Return the first name in ``opts`` that is a column of the current ``df``."""
        for c in opts:
            if c in df.columns:
                return c
        return default

    change_col = pick_col(["_change_type", "change_type"])
    cver_col = pick_col(["_commit_version", "commit_version"])
    cts_col = pick_col(["_commit_timestamp", "commit_timestamp"])
    if not change_col or not cver_col or not cts_col:
        missing = [x for x, v in {
            "change_type/_change_type": change_col,
            "commit_version/_commit_version": cver_col,
            "commit_timestamp/_commit_timestamp": cts_col,
        }.items() if not v]
        raise ValueError(f"CDF columns not found: {missing}. Ensure CDF is enabled/retained.")

    df = df.rename(columns={
        change_col: "change_type",
        cver_col: "commit_version",
        cts_col: "commit_timestamp",
    })

    # Domain columns
    if "Id" not in df.columns and "id" in df.columns:
        df = df.rename(columns={"id": "Id"})
    if "versionnumber" not in df.columns and "VersionNumber" in df.columns:
        df = df.rename(columns={"VersionNumber": "versionnumber"})

    if "recid" not in df.columns:
        for alt in ["recId", "RecId", "RECID"]:
            if alt in df.columns:
                df = df.rename(columns={alt: "recid"})
                break
    if "recid" not in df.columns:
        df["recid"] = 0

    sink_col = pick_col(["SinkCreatedOn", "SyncCreatedOn", "synccreatedon", "sinkcreatedon"])
    if sink_col and sink_col != "SinkCreatedOn":
        df = df.rename(columns={sink_col: "SinkCreatedOn"})
    if "SinkCreatedOn" not in df.columns:
        df["SinkCreatedOn"] = pd.NaT

    # Source columns of the business timestamps: prefer modifieddatetime /
    # createddatetime, fall back to modifiedon / createdon, and use an all-NaT
    # column when neither exists (instead of failing on a missing column later).
    modified_src = pick_col(["modifieddatetime", "modifiedon"])
    if modified_src is None:
        df["modifiedon"] = pd.NaT
        modified_src = "modifiedon"

    created_src = pick_col(["createddatetime", "createdon"])
    if created_src is None:
        df["createdon"] = pd.NaT
        created_src = "createdon"

    # Required
    required = ["Id", "versionnumber", "commit_version",  "recid"]
    miss = [c for c in required if c not in df.columns]
    if miss:
        raise ValueError(f"Missing required columns: {miss}")

    # Dtypes & timestamps
    df["commit_version"] = pd.to_numeric(df["commit_version"], errors="coerce")

    # Stable order
    df = df.sort_values(["Id", "versionnumber", "commit_version", "commit_timestamp"])

    # Only rows describing the state after a change take part in the analysis
    post_rows = df[df["change_type"].isin(["insert", "update_postimage"])]

    # Version birth per (Id, versionnumber): earliest change
    v_birth = (
        post_rows
        .sort_values(["commit_version", "commit_timestamp"])
        .groupby(["Id", "versionnumber"], as_index=False)
        .head(1)
        .loc[:, ["Id", "versionnumber", "recid", "commit_version", "SinkCreatedOn", modified_src, created_src]]
        .rename(columns={
            "SinkCreatedOn": "first_SinkCreatedOn",
            modified_src: "modifiedon",
            created_src: "createdon",
        })
    )

    # Latest change per (Id, versionnumber)
    v_last = (
        post_rows
        .sort_values(["commit_version", "commit_timestamp"], ascending=False)
        .groupby(["Id", "versionnumber"], as_index=False)
        .head(1)
        .loc[:, ["Id", "versionnumber", "recid", "SinkCreatedOn"]]
        .rename(columns={
            "SinkCreatedOn": "last_SinkCreatedOn",
        })
    )

    out = (
        v_birth
        .merge(v_last, on=["Id", "versionnumber", "recid"], how="left")
        .loc[:, [
            "Id", "versionnumber", "recid",
            "commit_version",
            "modifiedon", "createdon",
            "first_SinkCreatedOn", "last_SinkCreatedOn",
        ]]
        .sort_values(["Id", "versionnumber"])
        .reset_index(drop=True)
    )

    # Enforce datetimes; treat epoch 0 as missing
    date_cols = [
        "modifiedon",
        "createdon",
        "first_SinkCreatedOn",
        "last_SinkCreatedOn",
    ]
    for c in date_cols:
        out[c] = to_datetime_ms_or_parse(out[c])
        out[c] = out[c].where(out[c] != pd.Timestamp(0, tz="UTC"), pd.NaT)

    # Last path segment of the location is the table name (trailing slash tolerated)
    out = out.assign(Table_Name=table_uri.rstrip("/").rsplit("/", 1)[-1])

    # Compute timedelta
    fs_col = "first_SinkCreatedOn"
    lm_col = "modifiedon"

    # Treat epoch-zero (1970-01-01) and 1900-01-01 as missing; build a mask for
    # missing/zero last_modified
    zero_dt = pd.Timestamp(0, tz="UTC")
    sent_1900 = pd.Timestamp("1900-01-01T00:00:00Z")
    mask_missing_last = out[lm_col].isna() | out[lm_col].eq(zero_dt) | out[lm_col].eq(sent_1900)

    # Force staleness to 0 where last_modified is a "null date"
    delta = out[fs_col] - out[lm_col]
    delta = delta.where(~mask_missing_last, pd.Timedelta(0))

    # Format dd.HH:mm:ss (zero-padded), keep NaN/NaT as <NA>
    out["Version_Staleness"] = _format_staleness(delta)

    # Reorder cols
    cols = [
        "Table_Name", "Id", "recid", "versionnumber", "Version_Staleness",
        "commit_version",
        "modifiedon", "createdon",
        "first_SinkCreatedOn", "last_SinkCreatedOn",
    ]
    out = out[cols]
    # sort_values returns a new frame, so the result has to be kept
    out = out.sort_values(["Table_Name", "Id", "versionnumber"]).reset_index(drop=True)
    return out


# ---------------- Driver  ----------------

# Accept "a,b" as well as "a, b"; blank entries are ignored
allowed_tables = [name.strip() for name in tables_list.split(",") if name.strip()]

artifacts_tables_list = notebookutils.lakehouse.listTables(source_lakehouse_name, source_workspace_id)
filtered_tables = [t for t in artifacts_tables_list if t['name'] in allowed_tables]

#filtered_tables = artifacts_tables_list.copy() #all tables
first = True         # stays True until at least one batch has been written
NumOfLines = 0       # rows appended during this run
sample_frames = []   # one-row samples, collected only when display_sample is set

# Columns you want to show as human-readable strings
date_cols = [
    "modifiedon", "createdon",
    "first_SinkCreatedOn", "last_SinkCreatedOn",
]

target_path = getTargetPath(target_table_name, target_table_base_path )

# Calculate timestamp
# NOTE(review): datetime.now() is naive local time and is compared as-is with the
# parsed LastVacuumTimestamp - confirm both are expressed in the same timezone.
target_time = datetime.now() - timedelta(hours=hours_back)
print(f"Process start - target changes after {target_time}")
for table in filtered_tables:
    tablename = table['name']
    location = table['location']
    print("Running " , tablename)
    dt = DeltaTable(location)
    history = dt.history()

    last_version = history[0]["version"]
    print(f"Last version: {last_version}")

    matching_versions = []
    #There is no timestamp so we must use the LastVacuumTimestamp
    for entry in history:
        commit_info = entry.get("additionalCommitInfo")
        if not commit_info:
            continue

        # Parse JSON string to dict
        try:
            info_dict = json.loads(commit_info)
            vacuum_ts_str = info_dict.get("LastVacuumTimestamp")
            if vacuum_ts_str:
                # Convert string to datetime
                vacuum_ts = datetime.strptime(vacuum_ts_str, "%m/%d/%Y %I:%M:%S %p")
                if vacuum_ts >= target_time:
                    matching_versions.append((entry["version"], vacuum_ts))
        except Exception as e:
            print(f"Error parsing entry: {e}")

    # Start from the lowest version whose LastVacuumTimestamp is at/after target_time
    if not matching_versions:
        print(f"No matching version found after the target time {target_time}.")
        print("Table not processed: ", tablename)
        continue

    start_version, start_ts = min(matching_versions, key=lambda x: x[0])
    print(f"First version from target_time based on LastVacuumTimestamp: {start_version} {start_ts}")

    print(f"Start version {start_version} - Last Version {last_version}")
    # Read the change feed in buckets so a long version range is never loaded at once
    ending_version = min(last_version, start_version + versions_bucket_size)
    table_completed = False
    while not table_completed:
        if ending_version >= last_version:
            ending_version = last_version
            table_completed = True
        result = compute_cdf_staleness(table_uri=location, starting_version=start_version, ending_version = ending_version)
        # Advance the window before any early exit so the loop always makes progress
        start_version = ending_version + 1
        ending_version = ending_version + versions_bucket_size

        if len(result) == 0:
            continue

        pretty = result.copy()

        # Show in UTC (ISO-ish)
        for c in date_cols:
            if c in pretty.columns and pd.api.types.is_datetime64_any_dtype(pretty[c]):
                pretty[c] = pretty[c].dt.strftime("%Y-%m-%d %H:%M:%S%z")

        #Write in the target table
        NumOfLines += writeinthelake(pretty.copy(), target_table_name, target_path, "append")
        if display_sample:
            sample_frames.append(pretty.head(1))
        first = False
        print(f"Table {tablename} processed up to version {start_version - 1} (completed={table_completed})")

if not first:
    dedup_len = deduplicate_lakehouse_table_python(input_path=target_path)
    print(f"{NumOfLines} rows created in {target_table_name}. Current tables records: {dedup_len}")
else:
    print ("No rows creted")

if display_sample and NumOfLines > 0:
    if sample_frames:
        # Build the sample once instead of growing a DataFrame inside the loop
        totalResult = pd.concat(sample_frames).reset_index(drop=True)
        display(totalResult)
    else:
        print("totalResult is empty, no rows creted")






Process start - target changes after 2025-10-18 12:11:30.051644
Running  salestable
Last version: 312
First version from target_time based on LastVacuumTimestamp: 312 2025-10-21 09:21:35
Start version 312 - Last Version 312
not tables
Running  salesline
Last version: 371
First version from target_time based on LastVacuumTimestamp: 370 2025-10-21 09:21:33
Start version 370 - Last Version 371
Table salesline processed up to version 371 (completed=True)
Reading DeltaTable files...
Loaded 10 rows.
Deduplicating...
Deduplicated: 5 rows remain (removed 5).
Writing back...
Deduplication complete. Table updated at: abfss://6b887591-99a7-4544-a8c4-80d4a784d7b1@onelake.dfs.fabric.microsoft.com/bca1ccdf-fa53-48c9-ac82-31536fe18832/Tables/cdf_staleness_table
5 rows created in cdf_staleness_table. Current tables records: 5
