Generate CREATE TABLE statements for specified (or all) tables in the lakehouse

In [None]:
# Step 1
# Configuration and Setup
from pyspark.sql import functions as F
import os
from datetime import datetime

# Configuration - UPDATE THESE VALUES
# Schema name that the generated statements will create the tables under.
TARGET_SCHEMA_NAME = "retail_simple"  # Change this to your desired schema name
# True  -> process every table in the lakehouse
# False -> process only the names listed in SPECIFIED_TABLES
ALL_TABLES = False  # Set to True to process ALL tables, False to use SPECIFIED_TABLES

# Tables to generate CREATE statements for (ignored when ALL_TABLES is True)
SPECIFIED_TABLES = [
    "Ledger",
    "LedgerAccount",
    "LedgerAccountCategory",
    "LedgerAccountType",
    "LedgerType",
]

# Startup banner: summarise the run configuration before any work is done.
if ALL_TABLES:
    _mode_description = 'ALL TABLES'
else:
    _mode_description = f'{len(SPECIFIED_TABLES)} specified tables'
_started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

_banner_lines = (
    "🔍 Generating CREATE TABLE statements for lakehouse tables",
    f"🎯 Target schema name: {TARGET_SCHEMA_NAME}",
    f"🔧 Mode: {_mode_description}",
    f"🕐 Started at: {_started_at}",
    "-" * 70,
)
print(*_banner_lines, sep="\n")

In [None]:
#  Step 2
#  Get tables and generate CREATE statements


def extract_table_columns(describe_rows):
    """Return ``(col_name, data_type)`` pairs for the real columns of a table.

    ``describe_rows`` is the collected output of ``DESCRIBE <table>``; each
    row must support ``row['col_name']`` and ``row['data_type']``.

    Spark lists the table's columns first and may then append metadata
    sections (for example ``# Partition Information`` or ``# Partitioning``),
    introduced by a blank row or a row starting with ``#``.  The rows inside
    those sections repeat partition columns or carry entries such as
    ``Part 0`` / ``Not partitioned``, so reading stops at the first such
    marker instead of merely skipping the marker rows.
    """
    columns = []
    for row in describe_rows:
        col_name = row['col_name']
        marker = (col_name or '').strip()
        if not marker or marker.startswith('#'):
            break  # everything from here on is metadata, not columns
        columns.append((col_name, row['data_type']))
    return columns


def format_data_type(data_type):
    """Return ``data_type`` formatted for the generated CREATE TABLE statement.

    Plain types are upper-cased (``decimal(10,2)`` -> ``DECIMAL(10,2)``).
    Types that contain ``name:type`` pairs (structs) are returned unchanged
    because upper-casing the whole string would also rename nested fields.
    """
    if ':' in data_type:
        return data_type
    return data_type.upper()


def build_create_table_lines(position, table_name, columns):
    """Return the generated-script lines that create ``table_name``.

    ``position`` is the 1-based number shown in the section comment and
    ``columns`` is a sequence of ``(col_name, data_type)`` pairs.  The result
    ends with an empty string so consecutive tables are separated by a blank
    line once joined.
    """
    lines = [
        f"# {position}. Create {table_name} table",
        'create_table_sql = f"""',
        f'CREATE TABLE IF NOT EXISTS {{SCHEMA_NAME}}.{table_name} (',
    ]

    last_index = len(columns) - 1
    for index, (col_name, data_type) in enumerate(columns):
        # Comma after every column definition except the last one
        comma = "," if index < last_index else ""
        lines.append(f'    {col_name} {format_data_type(data_type)}{comma}')

    lines.extend([
        ')',
        'USING DELTA',
        '"""',
        'spark.sql(create_table_sql)',
        f'print(f"✅ {{SCHEMA_NAME}}.{table_name} table created!")',
        "",
    ])
    return lines


try:
    # Get all tables
    tables_df = spark.sql("SHOW TABLES")
    all_tables = [row['tableName'] for row in tables_df.collect()]

    # Determine which tables to process
    if ALL_TABLES:
        tables_to_process = all_tables
        print(f"📋 Processing ALL {len(all_tables)} tables in lakehouse")
    else:
        known_tables = set(all_tables)  # O(1) membership checks
        tables_to_process = [table for table in SPECIFIED_TABLES if table in known_tables]
        missing_tables = [table for table in SPECIFIED_TABLES if table not in known_tables]
        print(f"📋 Found {len(all_tables)} total tables in lakehouse")
        print(f"✅ Found {len(tables_to_process)} specified tables: {', '.join(tables_to_process)}")
        if missing_tables:
            print(f"⚠️  Missing tables: {', '.join(missing_tables)}")

    print("-" * 70)

    # Generate CREATE TABLE statements
    if tables_to_process:
        # Header of the generated script
        output_content = [
            "# Generated CREATE TABLE Statements",
            f"# Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            f"# Target schema: {TARGET_SCHEMA_NAME}",
            f"# Mode: {'ALL TABLES' if ALL_TABLES else 'SPECIFIED TABLES'}",
            "",
            "# Configuration",
            f'SCHEMA_NAME = "{TARGET_SCHEMA_NAME}"',
            'spark.sql(f"CREATE DATABASE IF NOT EXISTS {SCHEMA_NAME}")',
            'print(f"✅ {SCHEMA_NAME} schema ready!")',
            "",
        ]

        successful_tables = 0

        for i, table_name in enumerate(tables_to_process, 1):
            # A failure on one table is reported and must not abort the rest.
            try:
                print(f"🔄 Processing table {i}/{len(tables_to_process)}: {table_name}")

                # Get table schema
                describe_df = spark.sql(f"DESCRIBE {table_name}")
                valid_columns = extract_table_columns(describe_df.collect())

                if not valid_columns:
                    # Report the skip instead of dropping the table silently
                    print(f"⚠️  Skipping {table_name}: DESCRIBE returned no columns")
                    continue

                output_content.extend(build_create_table_lines(i, table_name, valid_columns))
                successful_tables += 1

            except Exception as e:
                print(f"❌ Error processing table {table_name}: {str(e)}")

        print(f"\n📊 Successfully processed {successful_tables}/{len(tables_to_process)} tables")

    else:
        output_content = ["# No tables found to process"]
        print("⚠️  No tables found to process")

except Exception as e:
    # Top-level boundary of the cell: report the error and leave an empty
    # result so the display step has nothing to print.
    print(f"❌ Error: {str(e)}")
    output_content = []

In [None]:
# Step 3
# Display generated CREATE TABLE statements
# Nothing is printed when the generation step has not run (name undefined)
# or when it produced an empty list.
if locals().get('output_content'):
    # One generated line per output line, ready to copy into another notebook
    full_content = "\n".join(output_content)
    print(full_content)