In [1]:
%pip install semantic-link-labs --quiet

StatementMeta(, 0bdfba23-0cec-47ef-90aa-50fdb43b8841, 7, Finished, Available, Finished)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.



In [2]:
#imports & config
from pyspark.sql import functions as F
from pyspark.sql import types as T
import json
import pandas as pd
from collections import defaultdict

from sempy_labs.tom import connect_semantic_model

# -----------------------------
# User configuration (edit these)
# Fabric workspace and semantic model that receive the descriptions;
# both values are passed to connect_semantic_model() in the update cell.
WORKSPACE_NAME        = "WS_AutoClaimsPOC"
SEMANTIC_MODEL_NAME   = "sm_AutoClaims"
# OneLake (abfss) path of the data-catalog workbook, read with pd.read_excel().
# NOTE(review): the workspace name is repeated inside this path — keep it in sync with WORKSPACE_NAME.
EXCEL_FILE_PATH       = "abfss://WS_AutoClaimsPOC@onelake.dfs.fabric.microsoft.com/lh_AutoClaims.Lakehouse/Files/Data Catalog/sample_data_catalog.xlsx"

# Optional: Filter specific tables (empty list = all tables from Excel).
# Entries are compared case-insensitively with the "Logical Table Name" column.
INCLUDE_TABLES = []  # e.g., ["businessunitreference", "portfolio"]



StatementMeta(, 0bdfba23-0cec-47ef-90aa-50fdb43b8841, 9, Finished, Available, Finished)

In [3]:
# -----------------------------
# Load and process Excel file
#
# Reads the data-catalog workbook, flattens embedded line breaks, and exposes
# the three catalog columns as a de-duplicated Spark DataFrame (`df_spark`).

# Columns the rest of the notebook reads from the catalog.
CATALOG_COLUMNS = ["Logical Table Name", "Logical Field Name", "Data Team Definition"]

print("Loading Excel file...")
df_excel = pd.read_excel(EXCEL_FILE_PATH)
df_excel = df_excel.replace({r'\r\n|\n|\r': ' '}, regex=True)

# Fail early with a readable message instead of an opaque Spark analysis error.
absent_columns = [name for name in CATALOG_COLUMNS if name not in df_excel.columns]
if absent_columns:
    raise ValueError(
        f"Excel file is missing required column(s): {absent_columns}. "
        f"Found columns: {list(df_excel.columns)}"
    )

# Convert to Spark DataFrame.
# Only the required columns are converted, so unrelated mixed-type columns in
# the workbook cannot break schema inference. Every value becomes a string (or
# None for an empty cell), which keeps NaN from surfacing later as the text "nan".
catalog_records = [
    tuple(None if pd.isna(value) else str(value) for value in record)
    for record in df_excel[CATALOG_COLUMNS].itertuples(index=False, name=None)
]
catalog_schema = T.StructType(
    [T.StructField(name, T.StringType(), True) for name in CATALOG_COLUMNS]
)
df_spark = spark.createDataFrame(catalog_records, schema=catalog_schema).dropDuplicates()

print(f"Loaded {df_spark.count()} rows from Excel")

StatementMeta(, 0bdfba23-0cec-47ef-90aa-50fdb43b8841, 10, Finished, Available, Finished)

Loading Excel file...
Loaded 65 rows from Excel


In [4]:
# Group metadata by table and column
#
# Builds `metadata_map`: {lower-cased table name: {"table_description": str,
# "columns": {field name: description}}} from the rows of `df_spark`.

def _clean_cell(value):
    """Return a catalog cell as a stripped string, or "" when the cell is missing.

    A cell counts as missing when it is None or a float NaN (the usual
    representation of an empty Excel cell). Without the NaN check,
    str(NaN) would produce the literal text "nan".
    """
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value).strip()


# Normalise the table filter once rather than rebuilding it for every row.
include_tables = {str(name).strip().lower() for name in INCLUDE_TABLES}

metadata_map = {}

rows = df_spark.collect()
for row in rows:
    table_name = _clean_cell(row['Logical Table Name']).lower()
    field_name = _clean_cell(row['Logical Field Name'])
    description = _clean_cell(row['Data Team Definition'])

    # A row without a table or field name cannot be matched to the model.
    if not table_name or not field_name:
        continue

    # Filter by INCLUDE_TABLES if specified
    if include_tables and table_name not in include_tables:
        continue

    # Set table description (placeholder text; the catalog columns read here
    # carry no table-level definition)
    table_entry = metadata_map.setdefault(
        table_name,
        {
            "table_description": f"Table: {table_name}",
            "columns": {}
        },
    )

    # Store column description; rows with an empty definition still register
    # the table but add no column entry.
    if description:
        table_entry["columns"][field_name] = description

print(f"Processed metadata for {len(metadata_map)} tables from Excel")
print(metadata_map)

StatementMeta(, 0bdfba23-0cec-47ef-90aa-50fdb43b8841, 11, Finished, Available, Finished)

Processed metadata for 8 tables from Excel
{'accident': {'table_description': 'Table: accident', 'columns': {'accident_date': 'Date when the accident occurred', 'vehicle_vin': 'Vehicle Identification Number of the vehicle involved in the accident', 'policyholder_id': 'Foreign key referencing the policyholder involved in the accident', 'accident_id': 'Unique identifier for each accident record', 'severity': 'Severity level of the accident (Low, Medium, High) indicating the extent of damage or injury', 'accident_type': 'Type or category of accident (e.g., Rear-end collision, Side impact, Single vehicle)', 'location': 'City or location where the accident took place'}}, 'adjuster': {'table_description': 'Table: adjuster', 'columns': {'id': 'Unique identifier for each claims adjuster', 'phone': 'Contact phone number for the adjuster', 'name': 'Full name of the claims adjuster', 'email': 'Email address for the adjuster'}}, 'claim': {'table_description': 'Table: claim', 'columns': {'claim_num

In [5]:
# Apply the catalog descriptions in `metadata_map` to the semantic model and
# print a summary.
#
# Results left in the notebook namespace:
#   updates_applied     - list of (kind, object name, description) tuples,
#                         kind is "TABLE" or "COLUMN"
#   missing_tables      - catalog tables with no match in the semantic model
#   missing_columns     - "table.column" entries with no match in the model
#   tables_with_updates - sorted names of tables that received any description
#   columns_updated     - "Table.Column" names whose description was set
updates_applied = []
missing_tables = []
missing_columns = []
processed_tables = []          # model tables that were found and visited
updated_table_names = set()    # model tables that received at least one description

# Loop through semantic model tables and columns.
# NOTE(review): with readonly=False the TOM wrapper is expected to save the
# model when the `with` block exits — confirm against the semantic-link-labs docs.
with connect_semantic_model(dataset=SEMANTIC_MODEL_NAME, workspace=WORKSPACE_NAME, readonly=False) as tom:

    # Get list of semantic model tables for comparison
    sem_model_tables = {t.Name.lower(): t for t in tom.model.Tables}

    for table_name, table_data in metadata_map.items():

        # Check if table exists in semantic model (case-insensitive)
        table_obj = sem_model_tables.get(table_name)
        if table_obj is None:
            missing_tables.append(table_name)
            print(f"⚠ Table not found in semantic model: {table_name}")
            continue

        processed_tables.append(table_obj.Name)

        # Update table description
        table_description = table_data["table_description"]
        if table_description:
            table_obj.Description = table_description
            updates_applied.append(("TABLE", table_obj.Name, table_description))
            updated_table_names.add(table_obj.Name)

        # Create a map of columns in the semantic model (case-insensitive)
        sem_columns = {c.Name.lower(): c for c in table_obj.Columns}

        # Update column descriptions from Excel
        for col_name, col_description in table_data["columns"].items():
            column_obj = sem_columns.get(col_name.lower())

            if column_obj is None:
                missing_columns.append(f"{table_name}.{col_name}")
                continue

            # Update column description
            if col_description:
                column_obj.Description = col_description
                updates_applied.append(("COLUMN", f"{table_obj.Name}.{column_obj.Name}", col_description))
                updated_table_names.add(table_obj.Name)

        print(f"✓ Processed table: {table_obj.Name}")

# Summary of updates.
# Table names are tracked directly in the loop instead of being recovered by
# splitting "Table.Column" on ".", which truncated table names containing a dot
# and ignored tables that only received a table-level description.
tables_with_updates = sorted(updated_table_names)

columns_updated = [
    item[1]
    for item in updates_applied
    if item[0] == "COLUMN"
]

print("\n" + "=" * 80)
print("UPDATE SUMMARY")
print("=" * 80)
print(f"\nSemantic Model: '{SEMANTIC_MODEL_NAME}'")
print(f"Tables processed: {len(processed_tables)}")
print(f"Column descriptions updated: {len(columns_updated)}")

if missing_tables:
    print(f"\n⚠ TABLES NOT FOUND IN SEMANTIC MODEL ({len(missing_tables)}):")
    for tbl in sorted(missing_tables):
        print(f"  - {tbl}")

if missing_columns:
    print(f"\n⚠ COLUMNS NOT FOUND IN SEMANTIC MODEL ({len(missing_columns)}):")
    for col in sorted(missing_columns):
        print(f"  - {col}")

if tables_with_updates:
    print(f"\n✓ Tables updated: {', '.join(tables_with_updates)}")

print("\n✓ Metadata update completed!")


StatementMeta(, 0bdfba23-0cec-47ef-90aa-50fdb43b8841, 12, Finished, Available, Finished)

✓ Processed table: accident
✓ Processed table: adjuster
✓ Processed table: claim
✓ Processed table: driver_telemetry_data
⚠ Table not found in semantic model: payment
✓ Processed table: policy
✓ Processed table: policyholder
✓ Processed table: vehicle

UPDATE SUMMARY

Semantic Model: 'sm_AutoClaims'
Tables processed: 7
Column descriptions updated: 60

⚠ TABLES NOT FOUND IN SEMANTIC MODEL (1):
  - payment

✓ Tables updated: accident, adjuster, claim, driver_telemetry_data, policy, policyholder, vehicle

✓ Metadata update completed!
