# Copy XLSX Files to Lakehouse Tables 

## Step 1: Environment Setup

In [None]:
# Step 1
# Import required libraries
import pandas as pd
from datetime import datetime
from pyspark.sql.functions import lit, current_timestamp
from pyspark.sql.types import StringType

# Configuration
# These module-level names are read by the later notebook cells.
SOURCE_FILES_PATH = "Files/SalesLT_Raw/"  # Folder containing Excel files
TARGET_SCHEMA = "SalesLT"                 # Target schema for tables

# Take a single clock reading so the load date and the load timestamp can
# never disagree (two independent now() calls may straddle midnight).
_load_started_at = datetime.now()
LOAD_TIMESTAMP = _load_started_at.isoformat()
LOAD_DATE = _load_started_at.strftime("%Y-%m-%d")

# Expected SalesLT tables (matching Excel file names, without extension)
EXPECTED_TABLES = [
    'address', 'customer', 'customeraddress', 'product',
    'productcategory', 'productdescription', 'productmodel',
    'productmodelproductdescription', 'salesorderdetail', 'salesorderheader',
]

# Echo the effective configuration so a run log shows what was used.
print("🚀 Excel Files to SalesLT Schema Tables Pipeline")
print("=" * 60)
print("✅ Libraries imported")
print(f"📅 Load Timestamp {LOAD_TIMESTAMP}")
print(f"📅 Load Date: {LOAD_DATE}")  # Show both for clarity
print(f"📁 Source files path: {SOURCE_FILES_PATH}")
print(f"🎯 Target schema: {TARGET_SCHEMA}")
print(f"📋 Expected tables: {len(EXPECTED_TABLES)}")
print("✅ Microsoft Fabric PySpark environment ready")

## Step 2: Discover Available Tables

In [None]:
# Step 2
# Discover available Excel files
#
# Lists SOURCE_FILES_PATH, keeps the Excel workbooks whose base name matches
# an entry of EXPECTED_TABLES, and publishes them as FILES_TO_PROCESS: a list
# of dicts with the keys 'file_name', 'file_path' and 'table_name'.
# FILES_TO_PROCESS is set to an empty list when nothing is found or when the
# folder cannot be listed.
print("🔍 DISCOVERING EXCEL FILES")
print("=" * 50)

EXCEL_SUFFIX = ".xlsx"

try:
    from notebookutils import mssparkutils

    print(f"📁 Target directory: {SOURCE_FILES_PATH}")

    # Direct approach - try the configured path.
    # The suffix test is case-insensitive so e.g. "Customer.XLSX" is found too.
    file_list = mssparkutils.fs.ls(SOURCE_FILES_PATH)
    excel_files = [f for f in file_list if f.name.lower().endswith(EXCEL_SUFFIX)]

    print(f"✅ Found {len(excel_files)} Excel files in {SOURCE_FILES_PATH}")

    files_to_process = []
    if excel_files:
        print("\n📋 Excel files found:")

        # Build the lookup once instead of re-creating it for every file.
        expected_names = {t.lower() for t in EXPECTED_TABLES}

        for file_info in excel_files:
            file_name = file_info.name
            # Strip only the trailing extension; str.replace would also
            # remove ".xlsx" occurring in the middle of a file name.
            table_name = file_name[:-len(EXCEL_SUFFIX)].lower()

            is_expected = table_name in expected_names
            marker = "🎯" if is_expected else "📋"
            print(f"   {marker} {file_name} → {table_name}")

            if is_expected:
                files_to_process.append({
                    'file_name': file_name,
                    'file_path': file_info.path,
                    'table_name': table_name
                })

        print(f"\n🎉 Ready to process {len(files_to_process)} matching files!")

        skipped = len(excel_files) - len(files_to_process)
        if skipped > 0:
            print(f"⚠️ {skipped} files don't match expected names")
    else:
        print(f"\n❌ No Excel files found in {SOURCE_FILES_PATH}")

    FILES_TO_PROCESS = files_to_process

except Exception as e:
    # Best-effort cell: report the problem and fall through with an empty
    # work list rather than aborting the notebook.
    print(f"❌ Error accessing {SOURCE_FILES_PATH}: {str(e)}")
    print("\n🔧 DIAGNOSTIC: Let's check what's available...")

    # Simple diagnostic if the main path fails: show what Files/ contains.
    # If the notebookutils import itself failed, the lookup below raises and
    # is reported by the inner handler.
    try:
        print("📁 Checking Files/ directory:")
        files_root = mssparkutils.fs.ls("Files/")
        for item in files_root:
            item_type = "📁" if item.isDir else "📄"
            print(f"   {item_type} {item.name}")
    except Exception as diag_error:
        print(f"❌ Cannot access Files/ directory: {str(diag_error)}")

    FILES_TO_PROCESS = []

## Step 3: Copy Files to Tables

In [None]:
# Step 3
# Copy Excel files to SalesLT schema tables
#
# For every entry of FILES_TO_PROCESS: read the workbook with pandas, convert
# it to a Spark DataFrame, add load-metadata columns and overwrite the table
# TARGET_SCHEMA.<table_name>. A failure on one file is recorded and does not
# stop the remaining files. A summary and a quick verification query follow.


def build_local_excel_path(source_dir, file_name):
    """Return the local mount path pandas uses to open a lakehouse file.

    Args:
        source_dir: Lakehouse-relative folder such as "Files/SalesLT_Raw/";
            leading and trailing slashes are ignored.
        file_name: Name of the file inside that folder.

    Returns:
        The path under "/lakehouse/default/" for that file.
    """
    return f"/lakehouse/default/{source_dir.strip('/')}/{file_name}"


print("🚀 PROCESSING EXCEL FILES TO SALESLT TABLES")
print("=" * 60)

# FILES_TO_PROCESS is a notebook-level variable set by the discovery step.
if 'FILES_TO_PROCESS' not in globals() or not FILES_TO_PROCESS:
    print("❌ No files to process. Run previous steps first.")
else:
    print(f"📋 Processing {len(FILES_TO_PROCESS)} Excel files")
    print(f"📁 Source: {SOURCE_FILES_PATH}")
    print(f"🎯 Target schema: {TARGET_SCHEMA}")
    print(f"📅 Load date: {LOAD_DATE}")
    print()

    # Processing results tracking
    results = []
    total_rows_processed = 0

    for i, file_info in enumerate(FILES_TO_PROCESS, 1):
        file_name = file_info['file_name']
        file_path = file_info['file_path']
        table_name = file_info['table_name']
        target_table = f"{TARGET_SCHEMA}.{table_name}"

        print(f"[{i}/{len(FILES_TO_PROCESS)}] Processing {file_name}...")

        try:
            print(f"   📖 Reading Excel file: {file_name}")

            # pandas reads through the local lakehouse mount. The path is
            # derived from SOURCE_FILES_PATH so the read always targets the
            # same folder that the discovery step listed.
            local_file_path = build_local_excel_path(SOURCE_FILES_PATH, file_name)
            pandas_df = pd.read_excel(local_file_path)
            row_count = len(pandas_df)
            column_count = len(pandas_df.columns)

            print(f"   ✅ Excel data loaded: {row_count:,} rows, {column_count} columns")

            # Convert pandas DataFrame to Spark DataFrame
            print("   🔄 Converting to Spark DataFrame...")
            spark_df = spark.createDataFrame(pandas_df)

            # Add metadata columns describing this load
            print("   🏷️ Adding metadata columns...")
            enriched_df = spark_df \
                .withColumn("_load_date", lit(LOAD_DATE)) \
                .withColumn("_load_timestamp", lit(LOAD_TIMESTAMP)) \
                .withColumn("_source_file", lit(file_name)) \
                .withColumn("_source_path", lit(file_path)) \
                .withColumn("_processing_timestamp", current_timestamp()) \
                .withColumn("_load_method", lit("excel_file_import")) \
                .withColumn("_record_source", lit("sample_data_files"))

            # Create/replace table in SalesLT schema
            print(f"   🏢 Creating table: {target_table}")

            enriched_df.write \
                .mode("overwrite") \
                .option("overwriteSchema", "true") \
                .saveAsTable(target_table)

            print(f"   ✅ Table created successfully: {row_count:,} rows")

            # Success tracking
            total_rows_processed += row_count
            results.append({
                "file": file_name,
                "table": target_table,
                "rows": row_count,
                "columns": column_count,
                "status": "success"
            })

            print(f"   🎉 Successfully processed {row_count:,} rows")

        except Exception as e:
            # Keep going with the next file; the message is truncated so the
            # summary stays readable.
            error_msg = str(e)[:100]
            results.append({
                "file": file_name,
                "table": target_table,
                "rows": 0,
                "columns": 0,
                "status": "failed",
                "error": error_msg
            })
            print(f"   ❌ Failed: {error_msg}...")

        print()

    # Processing summary
    successful = [r for r in results if r["status"] == "success"]
    failed = [r for r in results if r["status"] == "failed"]

    print("🎉 PROCESSING SUMMARY")
    print("=" * 60)
    print(f"✅ Successfully processed: {len(successful)} files")
    print(f"❌ Failed processing: {len(failed)} files")
    print(f"📊 Total rows processed: {total_rows_processed:,}")
    print(f"📅 Processing completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    if successful:
        print("\n🏢 Created SalesLT tables:")
        for result in successful:
            print(f"✅ {result['table']}: {result['rows']:,} rows, {result['columns']} columns")

    if failed:
        print("\n⚠️ Processing failures:")
        for result in failed:
            print(f"❌ {result['file']} → {result['table']}: {result.get('error', 'Unknown error')}")

    print("\n🎯 SalesLT schema tables ready!")
    print(f"💡 Tables can be queried using: SELECT * FROM {TARGET_SCHEMA}.[tablename]")

    # Quick verification - check if tables are accessible
    print("\n🔍 QUICK VERIFICATION:")
    try:
        if successful:
            # Query the first table that was written successfully
            test_table = successful[0]['table']
            test_df = spark.sql(f"SELECT COUNT(*) as row_count FROM {test_table}")
            test_count = test_df.collect()[0]['row_count']
            print(f"✅ Verified: {test_table} contains {test_count:,} rows")

            # Show all tables in SalesLT schema
            print(f"\n📋 Tables in {TARGET_SCHEMA} schema:")
            schema_tables = spark.sql(f"SHOW TABLES IN {TARGET_SCHEMA}").collect()
            for table_row in schema_tables:
                print(f"   🏢 {table_row.tableName}")
        else:
            print("❌ No successful tables to verify")

    except Exception as verify_error:
        print(f"⚠️ Verification failed: {str(verify_error)[:80]}...")
        print("💡 Tables might be in default schema - try: SHOW TABLES")