# In [None]:
from pyspark.sql.functions import input_file_name
import re
import boto3
from urllib.parse import urlparse

# -------------------------
# Configurations
# -------------------------
# Prefix that the Auto Loader stream below reads incoming files from.
source_path = "s3://your-bucket/raw/"
# Intended archive prefix for processed files — keep it aligned with the
# "/raw/" -> "/archive/" path rewrite used when files are archived.
archive_path = "s3://your-bucket/archive/"
# Base prefix for streaming state; "schema/" and "stream/" are appended to it below.
checkpoint_path = "s3://your-bucket/checkpoints/"
# Database name used to qualify the bronze table names.
bronze_db = "bronze"

# -------------------------
# Extract clean table name from file path
# edm_entity1.txt → edm_entity
# -------------------------
def extract_table_name(file_path):
    """Derive the target table name from a ``.txt`` file path.

    The name is the final path component without its ``.txt`` extension and
    without any trailing digits, e.g. ``.../edm_entity1.txt`` -> ``edm_entity``.

    Args:
        file_path: Path or URL of the source file; the file name must be
            preceded by a ``/``.

    Returns:
        The table name, or ``"unknown_table"`` when the path does not end in
        ``/<name>.txt`` or when nothing is left after removing the digits
        (e.g. ``.../123.txt``).
    """
    match = re.search(r'/([^/]+)\.txt$', file_path)
    if match is None:
        return "unknown_table"

    # Trailing digits are treated as a per-file sequence number, not part of the name.
    table_name = re.sub(r'\d+$', '', match.group(1))

    # A purely numeric file name would otherwise produce an empty identifier,
    # which callers would turn into an invalid table name and path.
    return table_name or "unknown_table"

# -------------------------
# Move file from raw → archive
# -------------------------
def _split_s3_url(url):
    """Split ``<scheme>://<bucket>/<key>`` into ``(bucket, key)``.

    Plain string splitting is used instead of a URL parser because S3 keys
    may legally contain ``?`` and ``#``; a URL parser would treat those as
    query/fragment delimiters and silently truncate the key.

    Raises:
        ValueError: If the URL has no scheme separator, bucket, or key.
    """
    _, separator, remainder = url.partition("://")
    bucket, _, key = remainder.partition("/")
    key = key.lstrip("/")
    if not separator or not bucket or not key:
        raise ValueError(f"Not a valid S3 object URL: {url!r}")
    return bucket, key


def move_to_archive_s3(source_url, archive_url):
    """Move one S3 object by copying it to ``archive_url`` and deleting the source.

    Args:
        source_url: URL of the object to move, e.g. ``s3://bucket/raw/a.txt``.
        archive_url: URL the object should end up at.

    Raises:
        ValueError: If either URL is malformed, or both point at the same
            object (deleting the source would then delete the only copy).

    The source is deleted only after the copy call returns; if the copy
    raises, the source object is left in place.
    """
    source_bucket, source_key = _split_s3_url(source_url)
    archive_bucket, archive_key = _split_s3_url(archive_url)

    if (source_bucket, source_key) == (archive_bucket, archive_key):
        raise ValueError(
            f"Source and archive locations are identical: {source_url!r}"
        )

    s3 = boto3.client("s3")

    # NOTE(review): copy_object is a single-request copy, which S3 limits to
    # objects of at most 5 GB; larger files need a multipart/managed copy.
    s3.copy_object(
        Bucket=archive_bucket,
        CopySource={'Bucket': source_bucket, 'Key': source_key},
        Key=archive_key,
    )
    s3.delete_object(Bucket=source_bucket, Key=source_key)

# -------------------------
# Step 1: Read with Auto Loader
# -------------------------
# Stream every file under source_path as plain text lines via Auto Loader;
# the schema-tracking location lives under the shared checkpoint prefix.
raw_df = spark.readStream.format("cloudFiles").options(
    **{
        "cloudFiles.format": "text",
        "cloudFiles.schemaLocation": checkpoint_path + "schema/",
    }
).load(source_path)

# Tag each row with the file it came from so the batch handler can route it.
df_with_file = raw_df.withColumn("input_file", input_file_name())

# -------------------------
# Step 2: Route to table and archive
# -------------------------
def _archive_url_for(file_path):
    """Return the archive URL that mirrors ``file_path``.

    Paths under ``source_path`` are re-rooted under ``archive_path``. Any other
    path (for example one reported with a different URL scheme than the one
    configured) falls back to rewriting its first ``/raw/`` segment.

    Raises:
        ValueError: If no archive location distinct from the source can be
            derived; moving a file onto itself must never be attempted.
    """
    if file_path.startswith(source_path):
        destination = archive_path + file_path[len(source_path):]
    else:
        # Only the first occurrence: nested folders named "raw" must survive.
        destination = file_path.replace("/raw/", "/archive/", 1)

    if destination == file_path:
        raise ValueError(f"Cannot derive an archive location for {file_path!r}")
    return destination


def process_and_archive(batch_df, batch_id):
    """Append one micro-batch to its bronze Delta tables, then archive its files.

    A micro-batch may contain rows from several source files, so rows are
    routed per file: files are grouped by the table name derived from their
    path, each group is appended to ``/mnt/bronze/<table>`` and registered as
    ``<bronze_db>.<table>``, and finally every source file is moved to the
    archive location.

    Args:
        batch_df: Micro-batch DataFrame carrying an ``input_file`` column.
        batch_id: Micro-batch identifier supplied by ``foreachBatch`` (unused).

    NOTE(review): foreachBatch is at-least-once; a retried batch would append
    its rows again, and its files may already have been archived.
    NOTE(review): confirm whether ``input_file`` reports percent-encoded paths
    (e.g. spaces as %20); if so, keys need decoding before they reach S3.
    """
    file_paths = [
        row["input_file"]
        for row in batch_df.select("input_file").distinct().collect()
    ]
    if not file_paths:
        return

    # Group the batch's files by destination table.
    files_by_table = {}
    for file_path in file_paths:
        files_by_table.setdefault(extract_table_name(file_path), []).append(file_path)

    # Finish every write before any file is moved: batch_df is lazy, so
    # archiving a file early could make a later re-scan of the batch fail.
    for base_table, table_files in files_by_table.items():
        full_table = f"{bronze_db}.{base_table}"
        bronze_table_path = f"/mnt/bronze/{base_table}"

        table_df = batch_df.filter(batch_df["input_file"].isin(table_files))
        (
            table_df.drop("input_file")
            .write.format("delta")
            .mode("append")
            .option("mergeSchema", "true")
            .save(bronze_table_path)
        )

        spark.sql(f"""
            CREATE TABLE IF NOT EXISTS {full_table}
            USING DELTA
            LOCATION '{bronze_table_path}'
        """)

    for file_path in file_paths:
        move_to_archive_s3(file_path, _archive_url_for(file_path))

# -------------------------
# Step 3: Start Auto Loader
# -------------------------
# Launch the streaming query: each micro-batch is handed to
# process_and_archive, with progress tracked under the shared checkpoint prefix.
df_with_file.writeStream.option(
    "checkpointLocation", checkpoint_path + "stream/"
).foreachBatch(process_and_archive).start()