### Auto Loader Pipeline: Dynamic Schema Ingestion by Entity

In [None]:
# =============================================
# 📁 CONFIGURATION
# =============================================

# Pipeline settings shared by the loader, enrichment and writer steps.
config = {
    "s3_bucket": "your-s3-bucket-name",
    "landing_prefix": "bronze/landing/",
    "archive_prefix": "bronze/archive/",
    "bronze_schema": "bronze",
    # Capture group 1 of this pattern is extracted from the file path and
    # used as the entity (and target table) name.
    "entity_regex": r"edm_(entity[a-zA-Z0-9]*)",
    # `cloudFiles.format` needs a concrete file format; Auto Loader has no
    # "auto" detection mode. "csv" is used because `file_header` below only
    # applies to CSV input.
    # NOTE(review): confirm this matches the files that land in the bucket.
    "file_format": "csv",
    "file_header": "true",
    "checkpoint_path": "dbfs:/mnt/bronze/checkpoints/edm_entity",
}

# Derived S3 locations: where new files arrive and where they are archived.
config["source_path"] = f"s3a://{config['s3_bucket']}/{config['landing_prefix']}"
config["archive_path"] = f"s3a://{config['s3_bucket']}/{config['archive_prefix']}"

In [None]:
# ⚙️ IMPORTS

from pyspark.sql.functions import input_file_name, regexp_extract
from pyspark.sql.utils import AnalysisException

In [None]:
# ⚙️ LOAD STREAM FROM S3 USING AUTO LOADER

def load_stream(config):
    """Build the Auto Loader streaming reader over the landing path.

    Args:
        config: Pipeline settings dict. Reads ``file_format``,
            ``file_header``, ``source_path``, ``archive_path`` and
            ``checkpoint_path``. An optional ``schema_location`` key
            overrides where the inferred schema is tracked; it falls back
            to ``checkpoint_path`` when absent.

    Returns:
        The streaming DataFrame returned by ``spark.readStream...load()``.
    """
    # Schema inference needs a location in which Auto Loader can persist and
    # evolve the inferred schema across restarts.
    schema_location = config.get("schema_location", config["checkpoint_path"])

    return (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", config["file_format"])
        .option("cloudFiles.schemaLocation", schema_location)
        .option("cloudFiles.inferColumnTypes", "true")
        .option("header", config["file_header"])
        .option("cloudFiles.includeExistingFiles", "true")
        # `cloudFiles.archiveDir` is not a documented Auto Loader option; the
        # documented way to relocate processed files is `cleanSource`.
        # NOTE(review): confirm the cluster's runtime supports cleanSource.
        .option("cloudFiles.cleanSource", "MOVE")
        .option("cloudFiles.cleanSource.moveDestination", config["archive_path"])
        .load(config["source_path"])
    )

In [None]:
# ⚙️ ENRICH DATA WITH ENTITY FROM FILENAME

def enrich_with_entity(df, config):
    """Tag every row with its source file and the entity parsed from it.

    Args:
        df: DataFrame to enrich.
        config: Pipeline settings dict; ``entity_regex`` supplies the
            pattern whose first capture group becomes the entity value.

    Returns:
        ``df`` with two extra columns: ``filename`` (from
        ``input_file_name()``) and ``entity`` (capture group 1 of the
        pattern applied to ``filename``).
    """
    entity_pattern = config["entity_regex"]
    return (
        df
        .withColumn("filename", input_file_name())
        .withColumn("entity", regexp_extract("filename", entity_pattern, 1))
    )

In [None]:
# ⚙️ WRITE DATA TO ENTITY-SPECIFIC DELTA TABLES

def write_entity_tables(bronze_schema):
    """Create a ``foreachBatch`` handler that splits a micro-batch by entity.

    Args:
        bronze_schema: Schema name used as the prefix of every target table
            (tables are named ``<bronze_schema>.<entity>``).

    Returns:
        A ``writer(batch_df, batch_id)`` callable to pass to
        ``DataStreamWriter.foreachBatch``.
    """
    def writer(batch_df, batch_id):
        """Append each entity's rows of ``batch_df`` to its own Delta table.

        Rows whose ``entity`` is empty or null are skipped. The helper
        columns ``filename`` and ``entity`` are dropped before writing.

        Raises:
            AnalysisException: Re-raised after being logged, so the
                micro-batch is not recorded as successfully processed when
                a table write fails.
        """
        # The batch is scanned once for the distinct entities and once more
        # per entity; persisting avoids recomputing it for every scan.
        cached_df = batch_df.persist()
        try:
            entities = [
                row["entity"]
                for row in cached_df.select("entity").distinct().collect()
                if row["entity"]
            ]

            for entity in entities:
                table_name = f"{bronze_schema}.{entity}"
                entity_df = (
                    cached_df
                    .filter(cached_df["entity"] == entity)
                    .drop("filename", "entity")
                )

                try:
                    # Append mode creates the table when it does not exist
                    # yet, and mergeSchema lets later batches add columns
                    # instead of failing on a schema mismatch.
                    (
                        entity_df.write
                        .format("delta")
                        .mode("append")
                        .option("mergeSchema", "true")
                        .saveAsTable(table_name)
                    )
                except AnalysisException as e:
                    print(f"⚠️ Error writing {table_name}: {str(e)}")
                    # Swallowing the error would let the stream mark this
                    # batch as done and silently lose this entity's rows.
                    # NOTE(review): a retried batch may re-append rows for
                    # entities that were already written before the failure.
                    raise
        finally:
            cached_df.unpersist()
    return writer

In [None]:
# 🚀 START STREAMING INGESTION PIPELINE

# Per-batch handler that routes rows to <bronze_schema>.<entity> tables.
entity_batch_writer = write_entity_tables(config["bronze_schema"])

# Source stream, then the same stream tagged with filename/entity columns.
df_stream = load_stream(config)
df_enriched = enrich_with_entity(df_stream, config)

# Configure the sink: every micro-batch is handed to the entity writer and
# progress is tracked at the configured checkpoint path.
stream_writer = df_enriched.writeStream.foreachBatch(entity_batch_writer)
stream_writer = stream_writer.option("checkpointLocation", config["checkpoint_path"])

query = stream_writer.start()

# Block this cell until the streaming query stops or fails.
query.awaitTermination()