In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim, lower
from collections import defaultdict


StatementMeta(, fbc530e5-000a-40ef-bb71-208ca41550fe, 3, Finished, Available, Finished)

In [2]:
# Lakehouse (OneLake abfss://) path to the data catalog Excel workbook.
# Change this value to point the notebook at a different catalog file; it is
# the path later passed to pd.read_excel.
CatalogFilePath = "abfss://WS_AutoClaimsPOC@onelake.dfs.fabric.microsoft.com/lh_AutoClaims.Lakehouse/Files/Data Catalog/sample_data_catalog.xlsx"

StatementMeta(, fbc530e5-000a-40ef-bb71-208ca41550fe, 4, Finished, Available, Finished)

In [3]:
# Load the catalog workbook, then flatten line breaks embedded in cell text
# (CRLF, LF or CR) into single spaces so every value sits on one line.
import pandas as pd

# regex pattern -> replacement, applied to every column of the frame
newline_to_space = {r'\r\n|\n|\r': ' '}

df = pd.read_excel(CatalogFilePath)
df_pandas = df.replace(newline_to_space, regex=True)

StatementMeta(, fbc530e5-000a-40ef-bb71-208ca41550fe, 5, Finished, Available, Finished)

In [4]:
# Move the cleaned catalog into Spark, keep only the four columns the table
# generator reads, and drop rows that are identical across all of them.
catalog_columns = [
    "Logical Table Name",
    "Logical Field Name",
    "Database Datatype",
    "Data Team Definition",
]

df = (
    spark.createDataFrame(df_pandas)
    .select(*catalog_columns)
    .dropDuplicates()
)

StatementMeta(, fbc530e5-000a-40ef-bb71-208ca41550fe, 6, Finished, Available, Finished)

In [5]:
# Group catalog rows by logical table name into
#   tables[table_name] -> [{'field_name', 'datatype', 'description'}, ...]
# where 'datatype' has already been translated to a Spark SQL type.

# SQL Server base type -> Spark SQL type. Keys are lowercase because every
# lookup lowercases the catalog value first. Built once, outside the row loop.
datatype_mapping = {
    'nvarchar': 'STRING',
    'varchar': 'STRING',
    'nchar': 'STRING',
    'char': 'STRING',
    'integer': 'INT',
    'int': 'INT',
    'decimal': 'DECIMAL(38,10)',
    'numeric': 'DECIMAL(38,10)',
    'date': 'DATE',
    'datetime': 'TIMESTAMP',
    'tinyint': 'TINYINT',
    'smallint': 'SMALLINT',
    'bigint': 'BIGINT',
    'float': 'DOUBLE',
    'bit': 'BOOLEAN'
}

# Base types found in the catalog that have no mapping entry and were
# therefore defaulted to STRING; reported once after the loop.
unmapped_datatypes = set()


def _is_missing(value):
    """Return True for None, float NaN (which pandas uses for empty cells) or blank text."""
    if value is None:
        return True
    if isinstance(value, float) and value != value:  # NaN never equals itself
        return True
    return str(value).strip() == ""


def _to_spark_datatype(datatype):
    """Translate a SQL Server datatype string (e.g. 'nvarchar(50)') into a Spark SQL type.

    decimal/numeric with explicit arguments keep their precision/scale;
    every other type is looked up by base name in ``datatype_mapping`` and
    falls back to STRING (recording the base name in ``unmapped_datatypes``).
    """
    base_type, has_args, args = datatype.partition('(')
    base_type = base_type.strip().lower()

    if has_args and base_type in ('decimal', 'numeric'):
        # Carry the catalog's "(precision,scale)" over to Spark's DECIMAL.
        return 'DECIMAL(' + args.replace(' ', '')

    spark_type = datatype_mapping.get(base_type)
    if spark_type is None:
        unmapped_datatypes.add(base_type)
        return 'STRING'
    return spark_type


tables = defaultdict(list)

# Collect data to driver for processing
rows = df.select("Logical Table Name", "Logical Field Name", "Database Datatype", "Data Team Definition").collect()

for row in rows:
    table_name = row['Logical Table Name']
    field_name = row['Logical Field Name']
    datatype = row['Database Datatype']
    description = row['Data Team Definition']

    # Skip rows with missing critical data (None, NaN or blank)
    if _is_missing(table_name) or _is_missing(field_name) or _is_missing(datatype):
        continue

    # Clean the values; table names are lowercased so grouping is case-insensitive
    table_name = str(table_name).strip().lower()
    field_name = str(field_name).strip()
    datatype = str(datatype).strip()
    description = "" if _is_missing(description) else str(description).strip()

    tables[table_name].append({
        'field_name': field_name,
        'datatype': _to_spark_datatype(datatype),
        'description': description
    })

if unmapped_datatypes:
    print(f"Warning: no Spark mapping for datatype(s) {sorted(unmapped_datatypes)}; defaulted to STRING")

StatementMeta(, fbc530e5-000a-40ef-bb71-208ca41550fe, 7, Finished, Available, Finished)

In [6]:
# Execute CREATE TABLE scripts directly in Fabric lakehouse.
# One "CREATE TABLE IF NOT EXISTS ... USING DELTA" statement is built and run
# per entry in `tables`; a failure is recorded and the loop moves on to the
# next table.


def _quote_identifier(name):
    """Backtick-quote a Spark SQL identifier, doubling any embedded backtick."""
    return "`" + name.replace("`", "``") + "`"


def _escape_sql_string(text):
    """Escape text for use inside a single-quoted Spark SQL string literal."""
    # Backslashes first, so the backslashes added for quotes are not doubled.
    return text.replace("\\", "\\\\").replace("'", "\\'")


print(f"Starting to create {len(tables)} Delta tables in Fabric lakehouse...\n")

created_tables = []
failed_tables = []

for table_name, columns in sorted(tables.items()):
    try:
        # Quote each dot-separated part so a schema-qualified name stays qualified.
        quoted_table = ".".join(_quote_identifier(part) for part in table_name.split("."))

        # Build CREATE TABLE statement
        create_stmt = f"CREATE TABLE IF NOT EXISTS {quoted_table} (\n"

        # Write column definitions.
        # The loop variable is deliberately not called `col`, which would
        # overwrite pyspark.sql.functions.col imported at the top of the notebook.
        col_defs = []
        for column in columns:
            col_name = _quote_identifier(column['field_name'])
            col_type = column['datatype']
            col_desc = _escape_sql_string(column['description'])

            col_def = f"    {col_name} {col_type}"
            if col_desc:
                col_def += f" COMMENT '{col_desc}'"
            col_defs.append(col_def)

        create_stmt += ',\n'.join(col_defs)
        create_stmt += "\n)\n"
        create_stmt += "USING DELTA\n"
        create_stmt += f"COMMENT 'Table: {_escape_sql_string(table_name)}'"

        # Execute the CREATE TABLE statement
        print(f"Creating table: {table_name} ({len(columns)} columns)...")
        spark.sql(create_stmt)
        created_tables.append(table_name)
        print(f"  ✓ Successfully created {table_name}\n")

    except Exception as e:
        # Best effort: keep the error for the summary and continue.
        failed_tables.append((table_name, str(e)))
        print(f"  ✗ Failed to create {table_name}")
        print(f"    Error: {str(e)}\n")




StatementMeta(, fbc530e5-000a-40ef-bb71-208ca41550fe, 8, Finished, Available, Finished)

Starting to create 8 Delta tables in Fabric lakehouse...

Creating table: accident (7 columns)...
  ✓ Successfully created accident

Creating table: adjuster (4 columns)...
  ✓ Successfully created adjuster

Creating table: claim (8 columns)...
  ✓ Successfully created claim

Creating table: driver_telemetry_data (22 columns)...
  ✓ Successfully created driver_telemetry_data

Creating table: payment (5 columns)...
  ✓ Successfully created payment

Creating table: policy (7 columns)...
  ✓ Successfully created policy

Creating table: policyholder (8 columns)...
  ✓ Successfully created policyholder

Creating table: vehicle (4 columns)...
  ✓ Successfully created vehicle



In [7]:
# Report how many tables were processed, created and failed, followed by the
# per-table error text for any failures.
banner = "=" * 80
summary_lines = [
    banner,
    "EXECUTION SUMMARY",
    banner,
    f"\nTotal tables processed: {len(tables)}",
    f"Successfully created: {len(created_tables)}",
    f"Failed: {len(failed_tables)}",
]

if failed_tables:
    summary_lines.append("\nFailed tables:")
    summary_lines.extend(
        f"  - {failed_name}: {failure}" for failed_name, failure in failed_tables
    )

summary_lines.append("\n✓ Delta table creation completed!")
print("\n".join(summary_lines))


StatementMeta(, fbc530e5-000a-40ef-bb71-208ca41550fe, 9, Finished, Available, Finished)

EXECUTION SUMMARY

Total tables processed: 8
Successfully created: 8
Failed: 0

✓ Delta table creation completed!
