In [0]:

# Root of the S3 bucket that holds the raw CSV files (note the trailing slash:
# file names are appended to it directly).
s3_bucket_path = "s3://finance-beop-fm/"

# Logical table name -> CSV object name under the bucket root.
files = {
    table: f"{table}.csv"
    for table in ("orders_raw", "payments_raw", "expenses_raw", "budgets_raw")
}

# Show the current bucket contents before anything is read.
print("Listing files in S3 bucket:")
display(dbutils.fs.ls(s3_bucket_path))

import traceback

from pyspark.sql.functions import current_timestamp, lit

# DataFrames that loaded successfully, keyed by logical table name.
bronze_dfs = {}

# Read every configured CSV, tag it with ingestion metadata, and expose it as
# a temp view named "bronze_<table_name>". A failure on one table is reported
# and the loop moves on to the next table.
for table_name, filename in files.items():
    s3_file_path = f"{s3_bucket_path}{filename}"
    temp_view_name = f"bronze_{table_name}"

    try:
        print(f"\n{'-'*65}")
        print(f"Reading {table_name} from: {s3_file_path}")
        print('-'*65)

        try:
            df = (
                spark.read
                .option("header", "true")
                .option("inferSchema", "true")
                .option("multiLine", "true")
                .csv(s3_file_path)
            )
        except Exception as read_error:
            # Fallback: retry without schema inference. Without inferSchema
            # Spark reads every CSV column as a string, so downstream code
            # must cast explicitly if this path was taken.
            print(f"First read attempt failed: {read_error}")
            print("Trying alternative read method...")
            df = (
                spark.read
                .option("header", "true")
                .option("multiLine", "true")
                .csv(s3_file_path)
            )

        # Lineage columns. NOTE: current_timestamp() is evaluated when a query
        # runs, not when this line executes, so each action on df (count,
        # show, a later write) gets its own timestamp value.
        df = (
            df.withColumn("_ingestion_timestamp", current_timestamp())
            .withColumn("_source_file", lit(filename))
            .withColumn("_source_path", lit(s3_file_path))
        )

        # Run the first action BEFORE registering the table, so a DataFrame
        # whose scan fails is reported as an error and is not left behind in
        # bronze_dfs or as a temp view.
        row_count = df.count()

        bronze_dfs[table_name] = df
        df.createOrReplaceTempView(temp_view_name)

        print(f" SUCCESS: {table_name} loaded")
        print(f"   Rows: {row_count:,}")
        print(f"   Columns: {len(df.columns)}")
        print(f"   Temporary View: {temp_view_name}")

        print("\n  Sample data:")
        df.show(3, truncate=False)

        print("\n  Schema:")
        df.printSchema()

    except Exception as e:
        # Best-effort ingestion: log the failure with its traceback and
        # continue with the remaining tables.
        print(f" ERROR loading {table_name}: {str(e)}")
        traceback.print_exc()

# Recap: one line per loaded table with its row and column counts.
# Each count() is a Spark action on that table's DataFrame.
print(f"\n{'-' * 65}")
print("BRONZE LAYER SUMMARY")
print("-" * 65)
for table, frame in bronze_dfs.items():
    print(f"{table}: {frame.count():,} rows, {len(frame.columns)} columns")

# Temp view names follow the "bronze_<table name>" pattern.
print(f"\n{'=' * 60}")
print("AVAILABLE TEMPORARY VIEWS")
print("=" * 60)
for table in bronze_dfs:
    print(f"bronze_{table}")
    
# Bronze output root. Derived from s3_bucket_path rather than repeating the
# bucket URI, so the read and write locations cannot drift apart; the
# rstrip avoids a double slash because s3_bucket_path ends with "/".
BRONZE_BASE_PATH = f"{s3_bucket_path.rstrip('/')}/bronze"

# Persist every loaded table as parquet under <BRONZE_BASE_PATH>/<table_name>.
for table_name, df in bronze_dfs.items():
    output_path = f"{BRONZE_BASE_PATH}/{table_name}"

    # "overwrite" replaces whatever already exists at output_path, which
    # makes re-running this cell safe.
    df.write.mode("overwrite").parquet(output_path)

    print(f"✅ BRONZE SAVED → {output_path}")

Listing files in S3 bucket:


path,name,size,modificationTime
s3://finance-beop-fm/budgets_raw.csv,budgets_raw.csv,2974,1767414806000
s3://finance-beop-fm/expenses_raw.csv,expenses_raw.csv,4609,1767414806000
s3://finance-beop-fm/orders_raw.csv,orders_raw.csv,249339,1767414808000
s3://finance-beop-fm/payments_raw.csv,payments_raw.csv,359130,1767414809000
s3://finance-beop-fm/bronze/,bronze/,0,1767958534602



-----------------------------------------------------------------
Reading orders_raw from: s3://finance-beop-fm/orders_raw.csv
-----------------------------------------------------------------
 SUCCESS: orders_raw loaded
   Rows: 5,050
   Columns: 9
   Temporary View: bronze_orders_raw

  Sample data:
+--------+----------+-----------+----------------+------------+------------+--------------------------+--------------+-----------------------------------+
|order_id|order_date|customer_id|product_category|order_amount|order_status|_ingestion_timestamp      |_source_file  |_source_path                       |
+--------+----------+-----------+----------------+------------+------------+--------------------------+--------------+-----------------------------------+
|1       |2024-05-09|C8270      |Furniture       |4758.5      |COMPLETED   |2026-01-09 11:35:39.035791|orders_raw.csv|s3://finance-beop-fm/orders_raw.csv|
|2       |2024-01-12|C6734      |Clothing        |864.37      |COMPLETED   |