# 📥 Databricks Auto Loader Pipeline with Archiving to S3


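This notebook implements a complete Databricks Auto Loader pipeline using PySpark.
It ingests JSON files from an S3 bucket into a Delta Lake Bronze table, then moves the source files
to an archive location within the same bucket. Archived files are grouped into sub-folders named after
the part of the filename before the first underscore (files without an underscore go to `misc/`).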


In [None]:

# Pipeline settings — edit these values to point the notebook at your own data.

# Format of the incoming files, passed to Auto Loader as cloudFiles.format.
file_format = "json"

# S3 locations: where new files land and where they are moved once archived.
source_path = "s3://your-bucket-name/data/"
archive_path = "s3://your-bucket-name/archive/"

# Bronze Delta table location and the streaming checkpoint that tracks its progress.
bronze_table_path = "/mnt/bronze/events/"
checkpoint_path = "/mnt/bronze/checkpoints/events/"


## 🔁 Step 1: Read streaming data from S3 using Auto Loader

In [None]:

from pyspark.sql.functions import input_file_name
from pyspark.sql.types import StructType, StringType

# Explicit schema for the incoming records: two string columns, "id" and "value".
# StructType.add appends the field in place, so the schema is built up step by step.
schema = StructType()
schema.add("id", StringType())
schema.add("value", StringType())

# Auto Loader ("cloudFiles") stream over the source path. Files already present
# under the path are included, and every row is tagged with the file it came from.
df = (
    spark.readStream.format("cloudFiles")
    .schema(schema)
    .option("cloudFiles.includeExistingFiles", "true")
    .option("cloudFiles.format", file_format)
    .load(source_path)
    .withColumn("source_file", input_file_name())
)

# Preview the streaming DataFrame in the notebook.
display(df)


## 💾 Step 2: Write to Bronze Delta Table

In [None]:

# Start an append-only streaming write of `df` into the Bronze Delta table.
# The checkpoint location is where the stream records its progress.
(
    df.writeStream
    .option("checkpointLocation", checkpoint_path)
    .outputMode("append")
    .format("delta")
    .start(bronze_table_path)
)


## 🗂️ Step 3: Archive processed files by prefix using Boto3

In [None]:

import boto3
import os
from urllib.parse import urlparse

# Function to archive files by prefix
def archive_processed_files(bucket: str, source_prefix: str, archive_prefix: str, s3_client=None):
    """Move every object under ``source_prefix`` into ``archive_prefix``.

    Each object is copied to ``{archive_prefix}{group}/{filename}`` and then
    deleted from its original key. ``group`` is the part of the filename before
    the first underscore; filenames without an underscore (or with nothing
    before it, e.g. ``_SUCCESS``) are grouped under ``misc``.

    Args:
        bucket: Name of the S3 bucket holding both the source and archive keys.
        source_prefix: Key prefix to scan for objects to archive.
        archive_prefix: Key prefix that archived objects are written under.
        s3_client: Optional S3 client to use. When omitted, a client is created
            with ``boto3.client("s3")``.

    Returns:
        None. Progress is reported with ``print``.
    """
    s3 = s3_client if s3_client is not None else boto3.client("s3")

    # A single list_objects_v2 call returns at most 1,000 keys, so walk every
    # page. Keys are collected up front so the listing is complete before any
    # object is deleted.
    paginator = s3.get_paginator("list_objects_v2")
    keys_to_move = []
    for page in paginator.paginate(Bucket=bucket, Prefix=source_prefix):
        for obj in page.get("Contents", []):
            key = obj["Key"]
            # Zero-byte "folder" placeholders have no filename to archive.
            if key.endswith("/"):
                continue
            # Never re-archive objects that already live under the archive
            # prefix (possible when it is nested inside source_prefix).
            if archive_prefix and key.startswith(archive_prefix):
                continue
            keys_to_move.append(key)

    if not keys_to_move:
        print("No files to archive.")
        return

    for key in keys_to_move:
        filename = os.path.basename(key)
        group, separator, _ = filename.partition("_")
        if not separator or not group:
            group = "misc"
        dest_key = f"{archive_prefix}{group}/{filename}"

        # S3 has no rename: copy to the destination first, then delete the
        # original only after the copy call has returned without raising.
        s3.copy_object(Bucket=bucket, CopySource={"Bucket": bucket, "Key": key}, Key=dest_key)
        s3.delete_object(Bucket=bucket, Key=key)
        print(f"Moved {key} → {dest_key}")


## ✅ Step 4: Call the archive function

In [None]:

# Derive the bucket and key prefixes from the configured S3 URIs so this step
# cannot drift from source_path / archive_path in the configuration cell.
bucket = urlparse(source_path).netloc
source_prefix = urlparse(source_path).path.lstrip("/")
archive_prefix = urlparse(archive_path).path.lstrip("/")

# archive_processed_files copies within a single bucket only.
if urlparse(archive_path).netloc != bucket:
    raise ValueError("archive_path must be in the same bucket as source_path")

# The streaming write is started without blocking, so wait until every active
# streaming query has processed the data currently available. Otherwise files
# can be moved out of the source prefix before Auto Loader has ingested them.
# NOTE: this waits on all active queries in this Spark session, and can block
# for a long time if new files keep arriving at the source.
for active_query in spark.streams.active:
    active_query.processAllAvailable()

# Archive processed files
archive_processed_files(bucket, source_prefix, archive_prefix)


## 🧪 (Optional) Step 5: Simulate sample ingestion

In [None]:
# You can use dbutils.fs.cp or upload a file manually to your source_path to test the flow.