## Bronze Layer Ingestion Notebook
#### Goal: ingest data from the raw data volume into a bronze table as-is while ensuring:
###### * Idempotency
###### * Incremental load
###### * No duplicate loading of the same file

In [0]:
# import column functions used to add the audit metadata columns
from pyspark.sql.functions import current_timestamp, col

# import urllib to decode file paths
from urllib.parse import unquote


#### Run Config Notebook

In [0]:
%run ../../configs/config_notebook

In [0]:
# Bind the source volume path and the target bronze table name from the shared
# configuration. VOLUMES and TABLES are not defined in this notebook; presumably
# they are created by the config notebook executed via %run - confirm there.
VOLUME_PATH = VOLUMES["raw_data"]
TARGET_TABLE = TABLES["orders_bronze"]

In [0]:
# List every entry found directly under the raw data volume
raw_files = dbutils.fs.ls(VOLUME_PATH)

# Collect the entry paths, then keep only those ending in ".csv"
# (the suffix match is case-sensitive)
all_paths = [entry.path for entry in raw_files]
raw_csv_files = [path for path in all_paths if path.endswith(".csv")]

# Report how many CSV files were found and show their paths
csv_file_count = len(raw_csv_files)
print(f"Found {csv_file_count} CSV files in the {VOLUME_PATH} volume")
display(raw_csv_files)


#### Idempotency
###### * Manually tracking which source files have already been processed
###### * In a production scenario, it is recommended to use Auto Loader with checkpoints instead

In [0]:
# Paths of source files already ingested into the bronze table.
# Stays empty when the table does not exist yet (e.g. on the first run).
processed_files = []

if not spark.catalog.tableExists(TARGET_TABLE):
    print("No processed files found, bronze table does not exist")
else:
    # one row per distinct source file recorded in the bronze table
    bronze_table = spark.table(TARGET_TABLE)
    processed_files_df = bronze_table.select("source_file").distinct()

    # bring the paths to the driver and URL-decode them into a python list
    source_file_rows = processed_files_df.collect()
    processed_files = [unquote(row["source_file"]) for row in source_file_rows]

    # report how many files were already processed and show their paths
    processed_count = len(processed_files)
    print(f"Found {processed_count} processed files in the bronze table")
    display(processed_files)

In [0]:
# identify new files to process: CSV files in the volume whose path is not
# already recorded in the bronze table.
# A set gives constant-time membership checks, so the comparison stays linear
# even when many files have already been processed.
processed_lookup = set(processed_files)
new_files = [file for file in raw_csv_files if file not in processed_lookup]

# display list of new files
print(f"Found {len(new_files)} new files to process")
display(new_files)

### Incremental Loading
###### * Only new files are loaded to bronze table
###### * Using append to add new data

In [0]:
# process only new files: read them, normalise column names, add audit
# columns and append the result to the bronze table
if new_files:
    print("Processing new files")
    # Read all new CSV files in a single load. The quote and escape characters
    # are both '"', and multiLine allows a record to span several lines.
    df = (
        spark.read
        .format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .option("quote", '"')
        .option("escape", '"')
        .option("multiLine", "true")
        .load(new_files)
    )

    # clean column names: spaces and hyphens become underscores, then lowercase.
    # toDF renames every column in one pass instead of chaining one
    # withColumnRenamed call per column.
    cleaned_columns = [
        name.replace(" ", "_").replace("-", "_").lower()
        for name in df.columns
    ]
    df = df.toDF(*cleaned_columns)

    # add metadata columns for auditing. This helps with files observability in the pipeline.
    # source_file is also what the idempotency check reads back on later runs.
    df_bronze = (df
        .withColumn("ingestion_timestamp", current_timestamp())
        .withColumn("source_file", col("_metadata.file_path"))
    )

    # write dataframe as delta table in the bronze table using append mode;
    # mergeSchema lets columns that are new in this batch be added to the table
    (df_bronze.write
        .format("delta")
        .mode("append")
        .option("mergeSchema", "true")
        .saveAsTable(TARGET_TABLE)
    )

    # NOTE: count() is a separate action, so the source files are read again here
    print(f"✅ Loaded {df_bronze.count()} new records to {TARGET_TABLE}")
    # preview a few rows as a DataFrame (head() would return a single Row)
    display(df_bronze.limit(5))
else:
    print("No new files to process")