In [0]:
# Register the notebook's text widgets: (widget name, default value, label).
for widget_name, default_value, widget_label in (
    ("catalog", "main", "Catalog"),
    ("schema", "default", "Schema"),
):
    dbutils.widgets.text(widget_name, default_value, widget_label)

# COMMAND ----------

# Read the catalog/schema chosen through the widgets.
CATALOG, SCHEMA = (dbutils.widgets.get(widget) for widget in ("catalog", "schema"))

# Unqualified names of the demo tables.
SOURCE_TABLE, TARGET_TABLE = "source_table", "target_table"

# Fully qualified (catalog.schema.table) names used by the cells below.
source_fqn, target_fqn = (
    f"{CATALOG}.{SCHEMA}.{table}" for table in (SOURCE_TABLE, TARGET_TABLE)
)

print(f"Source: {source_fqn}")
print(f"Target: {target_fqn}")

Source: marcin_demo.default.source_table
Target: marcin_demo.default.target_table


In [0]:
# Drop both demo tables (if present) so the cell can be re-run from scratch.
for stale_table in (source_fqn, target_fqn):
    spark.sql(f"DROP TABLE IF EXISTS {stale_table}")

# DDL for the source table; every column and the table itself carry a comment.
create_source_sql = f"""
    CREATE TABLE {source_fqn} (
        id BIGINT COMMENT 'Unique identifier for each customer record',
        customer_name STRING COMMENT 'Full legal name of the customer',
        email STRING COMMENT 'Primary contact email address',
        created_at TIMESTAMP COMMENT 'Record creation timestamp in UTC',
        revenue DECIMAL(18,2) COMMENT 'Total lifetime revenue in USD',
        status STRING COMMENT 'Account status: active, inactive, or churned'
    )
    COMMENT 'Master customer dimension table containing all customer records with revenue metrics'
"""
spark.sql(create_source_sql)

print(f"✓ Created source table: {source_fqn}")

✓ Created source table: marcin_demo.default.source_table


In [0]:
# Seed the source table with three sample customer rows.
seed_rows_sql = f"""
    INSERT INTO {source_fqn} VALUES
        (1, 'Acme Corporation', 'contact@acme.com', current_timestamp(), 150000.00, 'active'),
        (2, 'TechStart Inc', 'info@techstart.io', current_timestamp(), 275000.50, 'active'),
        (3, 'Global Dynamics', 'sales@globaldyn.com', current_timestamp(), 89000.00, 'inactive')
"""
spark.sql(seed_rows_sql)

# Show what was just inserted.
source_rows_df = spark.sql(f"SELECT * FROM {source_fqn}")
display(source_rows_df)

id,customer_name,email,created_at,revenue,status
1,Acme Corporation,contact@acme.com,2025-12-03T05:21:36.022Z,150000.0,active
2,TechStart Inc,info@techstart.io,2025-12-03T05:21:36.022Z,275000.5,active
3,Global Dynamics,sales@globaldyn.com,2025-12-03T05:21:36.022Z,89000.0,inactive


In [0]:
# Look up the source table's comment in the catalog's information_schema.
source_table_comment_sql = f"""
    SELECT table_name, comment
    FROM {CATALOG}.information_schema.tables
    WHERE table_schema = '{SCHEMA}' 
      AND table_name = '{SOURCE_TABLE}'
"""
source_table_comment_df = spark.sql(source_table_comment_sql)
display(source_table_comment_df)

table_name,comment
source_table,Master customer dimension table containing all customer records with revenue metrics


In [0]:
# List the source table's columns with their types and comments, in column order.
source_column_comments_sql = f"""
    SELECT column_name, data_type, comment
    FROM {CATALOG}.information_schema.columns
    WHERE table_catalog = '{CATALOG}'
      AND table_schema = '{SCHEMA}'
      AND table_name = '{SOURCE_TABLE}'
    ORDER BY ordinal_position
"""
source_column_comments_df = spark.sql(source_column_comments_sql)
display(source_column_comments_df)

column_name,data_type,comment
id,LONG,Unique identifier for each customer record
customer_name,STRING,Full legal name of the customer
email,STRING,Primary contact email address
created_at,TIMESTAMP,Record creation timestamp in UTC
revenue,DECIMAL,Total lifetime revenue in USD
status,STRING,"Account status: active, inactive, or churned"


In [0]:
def _sql_string_literal(value: str) -> str:
    """Render ``value`` as a single-quoted Spark SQL string literal."""
    # Backslash is the escape character inside Spark SQL string literals, so it
    # has to be escaped first; otherwise the backslash added for the quote (or
    # one already present in the text) would be reinterpreted by the parser.
    escaped = value.replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


def clone_table_with_metadata(
    source_table: str, 
    target_table: str,
    catalog: str,
    schema: str,
    replace: bool = False
) -> dict:
    """
    Perform DEEP CLONE and copy all metadata (table and column comments) using information_schema.

    Args:
        source_table: Name of the table to clone, as it should appear in SQL.
            Only the last dot-separated part (with surrounding backticks
            removed) is used for the information_schema lookups.
        target_table: Name of the table to create, as it should appear in SQL.
        catalog: Catalog whose ``information_schema`` is queried for the source
            table's comments.
        schema: Schema the source table is looked up under. The function does
            not check that ``catalog``/``schema`` agree with ``source_table``.
        replace: When True, use ``CREATE OR REPLACE TABLE`` instead of
            ``CREATE TABLE``.

    Returns:
        A dict with keys ``source``, ``target``, ``table_comment`` (the copied
        comment, or None when the source has none) and
        ``column_comments_copied`` (number of column comments applied).
    """
    # information_schema holds the bare table name, so drop the qualifiers and
    # any backtick quoting from the last name part.
    src_table_name = source_table.split('.')[-1].strip('`')

    # Clone the table
    create_keyword = "CREATE OR REPLACE TABLE" if replace else "CREATE TABLE"
    spark.sql(f"{create_keyword} {target_table} DEEP CLONE {source_table}")
    print(f"✓ Cloned: {source_table} → {target_table}")

    # Escape the lookup values once; both metadata queries filter on them.
    catalog_lit = _sql_string_literal(catalog)
    schema_lit = _sql_string_literal(schema)
    table_lit = _sql_string_literal(src_table_name)

    # Get and copy table comment
    table_comment_rows = spark.sql(f"""
        SELECT comment
        FROM {catalog}.information_schema.tables
        WHERE table_catalog = {catalog_lit}
          AND table_schema = {schema_lit}
          AND table_name = {table_lit}
    """).collect()

    table_comment = None
    # A missing row, NULL comment or empty comment all mean "nothing to copy".
    if table_comment_rows and table_comment_rows[0]['comment']:
        table_comment = table_comment_rows[0]['comment']
        spark.sql(f"COMMENT ON TABLE {target_table} IS {_sql_string_literal(table_comment)}")
        print(f"✓ Copied table comment")

    # Get and copy column comments
    column_rows = spark.sql(f"""
        SELECT column_name, comment
        FROM {catalog}.information_schema.columns
        WHERE table_catalog = {catalog_lit}
          AND table_schema = {schema_lit}
          AND table_name = {table_lit}
          AND comment IS NOT NULL
        ORDER BY ordinal_position
    """).collect()

    column_count = 0
    for row in column_rows:
        # Double any backtick so the column name stays a single quoted identifier.
        quoted_column = "`" + row['column_name'].replace("`", "``") + "`"
        comment_lit = _sql_string_literal(row['comment'])
        spark.sql(f"ALTER TABLE {target_table} ALTER COLUMN {quoted_column} COMMENT {comment_lit}")
        column_count += 1

    if column_count > 0:
        print(f"✓ Copied {column_count} column comments")

    return {
        "source": source_table,
        "target": target_table,
        "table_comment": table_comment,
        "column_comments_copied": column_count
    }

In [0]:
# Clone source -> target (replacing any existing target) and copy the comments.
result = clone_table_with_metadata(
    source_fqn,
    target_fqn,
    catalog=CATALOG,
    schema=SCHEMA,
    replace=True,
)

print(f"\nResult: {result}")

✓ Cloned: marcin_demo.default.source_table → marcin_demo.default.target_table
✓ Copied table comment
✓ Copied 6 column comments

Result: {'source': 'marcin_demo.default.source_table', 'target': 'marcin_demo.default.target_table', 'table_comment': 'Master customer dimension table containing all customer records with revenue metrics', 'column_comments_copied': 6}


In [0]:
# Show the source and target table comments side by side for comparison.
table_comments_sql = f"""
    SELECT table_name, comment
    FROM {CATALOG}.information_schema.tables
    WHERE table_schema = '{SCHEMA}'
      AND table_name IN ('{SOURCE_TABLE}', '{TARGET_TABLE}')
    ORDER BY table_name
"""
table_comments_df = spark.sql(table_comments_sql)
display(table_comments_df)

table_name,comment
source_table,Master customer dimension table containing all customer records with revenue metrics
target_table,Master customer dimension table containing all customer records with revenue metrics


In [0]:
# List the target table's column comments, in column order.
target_column_comments_sql = f"""
    SELECT column_name, comment
    FROM {CATALOG}.information_schema.columns
    WHERE table_catalog = '{CATALOG}'
      AND table_schema = '{SCHEMA}'
      AND table_name = '{TARGET_TABLE}'
    ORDER BY ordinal_position
"""
target_column_comments_df = spark.sql(target_column_comments_sql)
display(target_column_comments_df)

column_name,comment
id,Unique identifier for each customer record
customer_name,Full legal name of the customer
email,Primary contact email address
created_at,Record creation timestamp in UTC
revenue,Total lifetime revenue in USD
status,"Account status: active, inactive, or churned"


In [0]:
# Full DESCRIBE output for the cloned table.
target_description_df = spark.sql(f"DESCRIBE TABLE EXTENDED {target_fqn}")
display(target_description_df)

col_name,data_type,comment
id,bigint,Unique identifier for each customer record
customer_name,string,Full legal name of the customer
email,string,Primary contact email address
created_at,timestamp,Record creation timestamp in UTC
revenue,"decimal(18,2)",Total lifetime revenue in USD
status,string,"Account status: active, inactive, or churned"
,,
# Delta Statistics Columns,,
Column Names,"customer_name, email, id, status, revenue, created_at",
Column Selection Method,first-32,
