In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim, lower
from collections import defaultdict


StatementMeta(, e9d3e66a-6ab4-409c-a626-97a84874faa7, 3, Finished, Available, Finished)

In [2]:
# Tables whose metadata should be updated, e.g. ["table1", "table2"].
# An empty list disables the filter, so every table in the catalog is processed.
INCLUDE_TABLES         = []
# Lakehouse (ABFSS) path to the data catalog Excel file
CatalogFilePath = "abfss://WS_AutoClaimsPOC@onelake.dfs.fabric.microsoft.com/lh_AutoClaims.Lakehouse/Files/Data Catalog/sample_data_catalog.xlsx"

# Normalise the include list the same way catalog table names are normalised
# before comparison (trimmed + lower-cased), so an entry such as " Claim "
# still matches the catalog table "claim".
include_tables_lower = [t.strip().lower() for t in INCLUDE_TABLES]

StatementMeta(, e9d3e66a-6ab4-409c-a626-97a84874faa7, 4, Finished, Available, Finished)

In [3]:
# Read the data catalog workbook, then collapse line breaks embedded in cell
# text into single spaces so multi-line cells become one-line values.
import pandas as pd

# Matches Windows (\r\n), Unix (\n) and bare carriage-return (\r) line breaks
line_break_pattern = r'\r\n|\n|\r'

df = pd.read_excel(CatalogFilePath)
df_pandas = df.replace({line_break_pattern: ' '}, regex=True)

StatementMeta(, e9d3e66a-6ab4-409c-a626-97a84874faa7, 5, Finished, Available, Finished)

In [4]:
# Catalog columns used by the rest of the notebook
required_columns = [
    "Logical Table Name",
    "Logical Field Name",
    "Database Datatype",
    "Data Team Definition",
]

# pandas -> Spark, keep only the required columns, drop exact duplicate rows
df = (
    spark.createDataFrame(df_pandas)
    .select(*required_columns)
    .dropDuplicates()
)

StatementMeta(, e9d3e66a-6ab4-409c-a626-97a84874faa7, 6, Finished, Available, Finished)

In [5]:
# Group catalog rows by logical table name (trimmed, lower-cased).
# Result: tables[table_name] -> list of {'field_name', 'datatype', 'description'} dicts.
tables = defaultdict(list)

# SQL Server base type -> Spark SQL type. Built once rather than per row; keys
# are lower-case because lookups always use the lower-cased base type.
# NOTE(review): SQL Server tinyint is unsigned (0-255) while Spark TINYINT is a
# signed byte - confirm this mapping if the datatype is ever used to create columns.
datatype_mapping = {
    'nvarchar': 'STRING',
    'varchar': 'STRING',
    'nchar': 'STRING',
    'char': 'STRING',
    'integer': 'INT',
    'int': 'INT',
    'decimal': 'DECIMAL(38,10)',
    'numeric': 'DECIMAL(38,10)',
    'date': 'DATE',
    'datetime': 'TIMESTAMP',
    'tinyint': 'TINYINT',
    'smallint': 'SMALLINT',
    'bigint': 'BIGINT',
    'float': 'DOUBLE',
    'bit': 'BOOLEAN'
}


def _is_missing(value):
    """Return True when a catalog cell carries no usable content.

    Covers None, float NaN (an empty cell that was not converted to null on the
    way from pandas to Spark) and values that are blank once stringified.
    """
    if value is None:
        return True
    if isinstance(value, float) and value != value:  # NaN is the only value != itself
        return True
    return not str(value).strip()


def _to_spark_datatype(datatype):
    """Translate a SQL Server type string into a Spark SQL type string.

    Args:
        datatype: Source type, optionally with arguments, e.g. 'nvarchar(50)',
            'decimal(10,2)' or 'int'.

    Returns:
        'DECIMAL(<args>)' for decimal/numeric types that declare
        precision/scale; otherwise the datatype_mapping entry for the base
        type, falling back to 'STRING' for unknown types.
    """
    base_type, has_args, type_args = datatype.partition('(')
    base_type = base_type.strip().lower()
    if has_args and base_type in ('decimal', 'numeric'):
        # Keep the declared precision/scale; type_args still ends with ')'.
        return f"DECIMAL({type_args.strip()}"
    # Length arguments on other types (e.g. varchar(50)) have no Spark equivalent.
    return datatype_mapping.get(base_type, 'STRING')


# Collect data to driver for processing
rows = df.select("Logical Table Name", "Logical Field Name", "Database Datatype", "Data Team Definition").collect()

for row in rows:
    table_name = row['Logical Table Name']
    field_name = row['Logical Field Name']
    datatype = row['Database Datatype']
    description = row['Data Team Definition']

    # Skip rows with missing critical data (null, NaN or blank)
    if _is_missing(table_name) or _is_missing(field_name) or _is_missing(datatype):
        print(f"⚠ Table: {table_name} has one or more content missing")
        continue

    # Clean the values; table names are compared case-insensitively
    table_name = str(table_name).strip().lower()
    field_name = str(field_name).strip()
    datatype = str(datatype).strip()
    # A missing description is stored as "" rather than the text 'None'/'nan'
    description = "" if _is_missing(description) else str(description).strip()

    # Skip tables not in the INCLUDE list (an empty list means "no filter")
    if include_tables_lower and table_name not in include_tables_lower:
        continue

    tables[table_name].append({
        'field_name': field_name,
        'datatype': _to_spark_datatype(datatype),
        'description': description
    })

print(f"Filtered to {len(tables)} tables from INCLUDE list")

StatementMeta(, e9d3e66a-6ab4-409c-a626-97a84874faa7, 7, Finished, Available, Finished)

Filtered to 8 tables from INCLUDE list


In [6]:
def _quote_identifier(name):
    """Backtick-quote a table/column name for Spark SQL, doubling embedded backticks."""
    return "`" + name.replace("`", "``") + "`"


def _escape_sql_literal(text):
    """Escape text for use inside a single-quoted Spark SQL string literal."""
    # Backslashes first, so the backslash added in front of quotes is not doubled again.
    return text.replace("\\", "\\\\").replace("'", "\\'")


def _describe_columns(table_name):
    """Return {lower-cased column name: actual column name} for an existing table."""
    found = {}
    for row in spark.sql(f"DESCRIBE TABLE {_quote_identifier(table_name)}").collect():
        name = row.col_name
        # DESCRIBE TABLE lists the columns first; a blank or '#'-prefixed row
        # starts the metadata sections (e.g. partition information).
        if not name or name.startswith('#'):
            break
        found[name.lower()] = name
    return found


# Discover tables in database
existing_tables = [row.tableName.lower() for row in spark.sql("SHOW TABLES").collect()]
print(f"Found {len(existing_tables)} existing tables in lakehouse\n")

# Track missing and updated items
missing_tables = []
missing_columns = {}
updated_tables = []
updated_columns = []

print(f"Processing {len(tables)} tables from Excel...\n")

for table_name, columns in sorted(tables.items()):
    # Check if table exists
    if table_name not in existing_tables:
        missing_tables.append(table_name)
        print(f"⚠ Table not found: {table_name}")
        continue

    quoted_table = _quote_identifier(table_name)

    try:
        # Get existing columns in the table
        existing_columns = _describe_columns(table_name)

        # Update table-level description
        table_comment = _escape_sql_literal(f"Table: {table_name}")
        try:
            spark.sql(f"ALTER TABLE {quoted_table} SET TBLPROPERTIES ('comment' = '{table_comment}')")
            updated_tables.append(table_name)
            print(f"✓ Updated table description: {table_name}")
        except Exception as e:
            print(f"  ✗ Failed to update table description for {table_name}: {str(e)}")

        # Update column descriptions.
        # The loop variable is deliberately not called `col`, which would
        # shadow pyspark.sql.functions.col imported at the top of the notebook.
        table_missing_columns = []
        updated_count = 0
        for column in columns:
            col_name_lower = column['field_name'].lower()

            # Check if column exists (case-insensitive)
            if col_name_lower not in existing_columns:
                table_missing_columns.append(column['field_name'])
                continue

            # Columns without a description are left untouched
            col_desc = column['description']
            if not col_desc:
                continue

            # Use the actual column name from the table
            actual_col_name = existing_columns[col_name_lower]
            try:
                spark.sql(
                    f"ALTER TABLE {quoted_table} "
                    f"ALTER COLUMN {_quote_identifier(actual_col_name)} "
                    f"COMMENT '{_escape_sql_literal(col_desc)}'"
                )
                updated_columns.append(f"{table_name}.{actual_col_name}")
                updated_count += 1
            except Exception as e:
                print(f"  ✗ Failed to update column {table_name}.{actual_col_name}: {str(e)}")

        if table_missing_columns:
            missing_columns[table_name] = table_missing_columns
            print(f"  ⚠ Missing columns in {table_name}: {', '.join(table_missing_columns)}")

        # Report only comments that were actually written (not skipped or failed)
        if updated_count > 0:
            print(f"  ✓ Updated {updated_count} column descriptions")

    except Exception as e:
        print(f"  ✗ Error processing table {table_name}: {str(e)}")

    print()


StatementMeta(, e9d3e66a-6ab4-409c-a626-97a84874faa7, 8, Finished, Available, Finished)

Found 7 existing tables in lakehouse

Processing 8 tables from Excel...

✓ Updated table description: accident
  ✓ Updated 7 column descriptions

✓ Updated table description: adjuster
  ✓ Updated 4 column descriptions

✓ Updated table description: claim
  ✓ Updated 8 column descriptions

✓ Updated table description: driver_telemetry_data
  ✓ Updated 22 column descriptions

⚠ Table not found: payment
✓ Updated table description: policy
  ✓ Updated 7 column descriptions

✓ Updated table description: policyholder
  ✓ Updated 8 column descriptions

✓ Updated table description: vehicle
  ✓ Updated 4 column descriptions



In [7]:
# Final report: totals first, then everything the run could not match.
banner = "=" * 80
print(banner)
print("EXECUTION SUMMARY")
print(banner)
print(f"\nTotal tables in Excel: {len(tables)}")
print(f"Tables updated: {len(updated_tables)}")
print(f"Column descriptions updated: {len(updated_columns)}")

# Catalog tables with no matching lakehouse table
if missing_tables:
    print(f"\n⚠ TABLES NOT FOUND IN LAKEHOUSE ({len(missing_tables)}):")
    for missing_table in sorted(missing_tables):
        print(f"  - {missing_table}")

# Catalog fields with no matching column, grouped by table
if missing_columns:
    missing_column_total = sum(map(len, missing_columns.values()))
    print(f"\n⚠ COLUMNS NOT FOUND IN TABLES ({missing_column_total} total):")
    for owner_table in sorted(missing_columns):
        print(f"  Table: {owner_table}")
        for absent_column in missing_columns[owner_table]:
            print(f"    - {absent_column}")

if missing_tables or missing_columns:
    print("\n⚠ Please review missing tables/columns listed above")
else:
    print("\n✓ All tables and columns from Excel or in the INLCUDE list were found and updated!")

print("\n✓ Description update completed!")



StatementMeta(, e9d3e66a-6ab4-409c-a626-97a84874faa7, 9, Finished, Available, Finished)

EXECUTION SUMMARY

Total tables in Excel: 8
Tables updated: 7
Column descriptions updated: 60

⚠ TABLES NOT FOUND IN LAKEHOUSE (1):
  - payment

⚠ Please review missing tables/columns listed above

✓ Description update completed!
