In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # 01: Bronze Ingestion
# MAGIC This notebook initializes the Medallion schemas and loads raw Parquet files from the Git repository into the Bronze layer.

In [0]:
# 1. Initialize Schemas
#
# Create one schema per Medallion layer (raw, bronze, silver, gold). The names
# are unqualified, so they are created in whatever catalog is current when this
# cell runs. IF NOT EXISTS keeps the cell safe to re-run.
#
# NOTE(review): the cells below work in the `olist_project` catalog with
# schemas named `raw_data` and `bronze`, not the `olist_*` names created here.
# Confirm which naming scheme is intended before relying on these schemas.

medallion_schemas = ["olist_raw_data", "olist_bronze", "olist_silver", "olist_gold"]

for schema_name in medallion_schemas:
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_name}")

print("✅ Schemas olist_raw_data, olist_bronze, olist_silver, and olist_gold are ready.")

In [0]:
# The raw Parquet files are read from a Unity Catalog volume, and that volume
# has to exist before any data can be moved into it. Create the parent schema
# and the volume up front; IF NOT EXISTS makes both statements no-ops on
# re-runs, so "Run All" does not fail once the volume is already there.
#
# Issued through spark.sql (like the rest of this notebook) rather than a %sql
# magic, because a language magic is expected on the first line of a cell.
spark.sql("CREATE SCHEMA IF NOT EXISTS olist_project.raw_data")
spark.sql("CREATE VOLUME IF NOT EXISTS olist_project.raw_data.raw_parquet_files")

In [0]:
# Configuration
# Names shared with the ingestion cell: the target catalog, the Bronze schema,
# and the volume directory that contains the raw Parquet files.
catalog = "olist_project"
bronze_schema = "bronze"  # created just below if it does not exist yet
volume_path = "/Volumes/olist_project/raw_data/raw_parquet_files"

# Set context: switch to the catalog first, then make sure the Bronze schema
# exists inside it (the schema name is resolved against the current catalog).
for statement in (
    f"USE CATALOG {catalog}",
    f"CREATE SCHEMA IF NOT EXISTS {bronze_schema}",
):
    spark.sql(statement)

In [0]:
# 2. Ingestion Loop
#
# Register every Parquet file found in the volume as a Delta table in the
# Bronze schema, one table per file.
import os


def bronze_table_name(file_name):
    """Derive the Bronze table name from a raw Parquet file name.

    The file extension is dropped, a trailing ``_dataset`` is removed, and a
    leading ``olist`` is swapped for ``bronze``. Only the suffix/prefix are
    touched, so the same text appearing in the middle of a name is preserved.

    Example: ``olist_orders_dataset.parquet`` -> ``bronze_orders``.

    Args:
        file_name: Base name of the Parquet file (no directory part).

    Returns:
        The table name (without schema qualifier) to register the file under.
    """
    stem, _extension = os.path.splitext(file_name)
    if stem.endswith("_dataset"):
        stem = stem[: -len("_dataset")]
    if stem.startswith("olist"):
        stem = "bronze" + stem[len("olist"):]
    return stem


# Get list of files from the Volume. os.listdir returns entries in arbitrary
# order, so sort them to make the run (and its log output) deterministic.
files = sorted(f for f in os.listdir(volume_path) if f.endswith(".parquet"))

# Fail loudly instead of reporting success when there is nothing to ingest.
if not files:
    raise FileNotFoundError(f"No .parquet files found in {volume_path}; nothing to ingest.")

for file in files:
    table_name = bronze_table_name(file)

    print(f"📦 Registering {table_name}...")

    # Read the parquet from Volume
    df = spark.read.parquet(f"{volume_path}/{file}")

    # Write as a Delta Table in the Bronze schema.
    # 'overwrite' replaces the table contents, so this cell can be re-run
    # (for example after adding new files to the volume).
    df.write.format("delta").mode("overwrite").saveAsTable(f"{bronze_schema}.{table_name}")

print(f"\n✅ All tables are now available in {catalog}.{bronze_schema}!")

In [0]:
# DROPPED ALL TABLES FROM SCHEMA (one-off cleanup, kept commented out for reference)
# tables = [row.tableName for row in spark.sql(f"SHOW TABLES IN {bronze_schema}").collect()]
# for table in tables:
#     spark.sql(f"DROP TABLE IF EXISTS {bronze_schema}.{table}")