In [8]:
# Step 0: Setup
from pyspark.sql.functions import coalesce, lit, col, when, trim
from datetime import datetime

# Timestamped version string
now = datetime.now()
version_str = now.strftime('%Y%m%d_%H%M')  # e.g. '20250822_0841'
timestamp_str = now.strftime('%Y-%m-%d %H:%M:%S')

# Define tables to track# 
# ✅ Manual list for testing
tables = ["account", "salestable", "salesline"]

# 🔄 Uncomment this line to use all managed tables in the Lakehouse
# tables = [table.name for table in spark.catalog.listTables() if table.tableType == 'MANAGED']


# Step 1: Capture current schema snapshot

all_schemas = []

for table_name in tables:
    try:
        df = spark.read.table(table_name)
        schema_info = [(table_name, field.name, field.dataType.simpleString(), field.nullable) for field in df.schema.fields]
        all_schemas.extend(schema_info)
    except Exception as e:
        print(f"⚠️ Skipping {table_name}: {e}")

schema_df = spark.createDataFrame(all_schemas, ["table_name", "column_name", "data_type", "is_nullable"])
schema_df = schema_df.withColumn("snapshot_version", lit(version_str))
schema_df = schema_df.withColumn("snapshot_timestamp", lit(timestamp_str))


# Step 2: Save snapshot to lakehouse
schema_df.write.mode("append").saveAsTable("lakehouse_schema_snapshots")
print(f"✅ Snapshot saved with version {version_str} at {timestamp_str}")


# Get distinct versions sorted descending
versions_df = spark.sql("""
    SELECT DISTINCT snapshot_version 
    FROM lakehouse_schema_snapshots
    ORDER BY snapshot_version DESC
""")

# Collect top two versions
versions = [row["snapshot_version"] for row in versions_df.limit(2).collect()]

if len(versions) < 2:
    raise ValueError("❌ Not enough versions to compare. Need at least two snapshots.")

version_b = versions[0]  # Most recent
version_a = versions[1]  # Previous

print(f"✅ Most recent snapshot version {version_b} previous snapshot version {version_a}")


# Step 3: Load two versions for comparison

# version_a = '20250822_0830'  # Replace with earlier version
# version_b = version_str      # Current version

df_old = spark.sql(f"""
    SELECT table_name, column_name, data_type, is_nullable 
    FROM lakehouse_schema_snapshots 
    WHERE snapshot_version = '{version_a}'
""")

df_new = spark.sql(f"""
    SELECT table_name, column_name, data_type, is_nullable 
    FROM lakehouse_schema_snapshots 
    WHERE snapshot_version = '{version_b}'
""")


# Step 4: Detect drift via full outer join

drift_df = df_new.alias("new").join(
    df_old.alias("old"),
    on=["table_name", "column_name"],
    how="full_outer"
).select(
    coalesce(col("new.table_name"), col("old.table_name")).alias("table_name"),
    coalesce(col("new.column_name"), col("old.column_name")).alias("column_name"),
    col("old.data_type").alias("old_type"),
    col("new.data_type").alias("new_type"),
    col("old.is_nullable").alias("old_nullable"),
    col("new.is_nullable").alias("new_nullable")
).withColumn("change_type", when(col("old_type").isNull(), "added")
    .when(col("new_type").isNull(), "removed")
    .when(trim(col("old_type")) != trim(col("new_type")), "type_changed")
    .when(trim(col("old_nullable")) != trim(col("new_nullable")), "nullability_changed")
    .otherwise("unchanged"))


# Step 5: Display or Save drift report
drift_df.write.mode("overwrite").saveAsTable("lakehouse_schema_drift_log")

drift_df_filtered = drift_df.filter(col("change_type") != "unchanged")

summary_df = drift_df_filtered.groupBy("table_name", "change_type").count().orderBy("table_name")
summary_df.show(truncate=False)

drift_df_filtered.show(truncate=False)

drift_df.show(truncate=False)

StatementMeta(, e003e68b-0898-478a-b5bc-25281ef3378f, 10, Finished, Available, Finished)

✅ Snapshot saved with version 20250902_1826 at 2025-09-02 18:26:53
✅ Most recent snapshot version 20250902_1826 previous snapshot version 20250902_1819
+----------+-----------+-----+
|table_name|change_type|count|
+----------+-----------+-----+
+----------+-----------+-----+

+----------+-----------+--------+--------+------------+------------+-----------+
|table_name|column_name|old_type|new_type|old_nullable|new_nullable|change_type|
+----------+-----------+--------+--------+------------+------------+-----------+
+----------+-----------+--------+--------+------------+------------+-----------+

+----------+------------------------------+---------+---------+------------+------------+-----------+
|table_name|column_name                   |old_type |new_type |old_nullable|new_nullable|change_type|
+----------+------------------------------+---------+---------+------------+------------+-----------+
|account   |Id                            |string   |string   |true        |true        |unc