# 🚀 Databricks Auto Loader Pipeline with Temporary AWS Credentials and S3 Archiving


This notebook securely retrieves temporary AWS credentials from a custom identity service,
injects them into Spark for Auto Loader access, streams JSON files from S3 to a Bronze Delta table,
and archives the processed files using `boto3`.


## 🔐 Step 1: Retrieve AWS Credentials Securely

In [None]:

import boto3
import requests
import traceback
from datetime import datetime
from typing import Tuple
from tenacity import retry, wait_random_exponential, stop_after_attempt

@retry(wait=wait_random_exponential(multiplier=1, max=10), stop=stop_after_attempt(3))
def get_aws_token_info(
    aws_username: str,
    aws_account_id: str,
    aws_role_name: str,
    aws_password: str,
    ca_bundle_file: str,
    timeout: float = 30.0,
) -> dict:
    """Request temporary AWS credentials from the identity service.

    POSTs the caller's identity details as JSON to the credentials endpoint
    and returns the decoded JSON body. The ``@retry`` decorator re-invokes
    the function with randomized exponential back-off, stopping after three
    attempts.

    Args:
        aws_username: Sent as the ``username`` field of the request body.
        aws_account_id: Sent as the ``accountId`` field.
        aws_role_name: Sent as the ``roleName`` field.
        aws_password: Sent as the ``password`` field.
        ca_bundle_file: Passed to ``requests`` as ``verify`` for TLS
            certificate validation.
        timeout: Seconds to wait for the identity service before giving up
            on a single attempt.

    Returns:
        The parsed JSON response. Other code in this notebook reads the keys
        ``awsAccessKey``, ``awsSecretKey``, ``awsSessionToken`` and
        ``awsExpires`` from it.

    Raises:
        Exception: When the service answers with a status other than 200.
            Any exception raised inside the attempt (including network
            errors from ``requests``) has its traceback printed and is then
            re-raised so the retry policy can act on it.
    """
    try:
        response = requests.post(
            url="https://epi.v4ic-identity.ssov.factset.com/creds",
            json={
                "username": aws_username,
                "accountId": aws_account_id,
                "roleName": aws_role_name,
                "password": aws_password
            },
            headers={"Content-Type": "application/json"},
            verify=ca_bundle_file,
            # requests waits indefinitely when no timeout is given; a hung
            # connection would block the notebook and the retry policy
            # would never get a chance to run.
            timeout=timeout,
        )
        if response.status_code != 200:
            raise Exception(f"Failed to retrieve AWS credentials: {response.text}")
        return response.json()
    except Exception:
        # Surface the failure in the notebook output before handing the
        # exception back to the retry decorator.
        print(traceback.format_exc())
        raise

def get_client(service, aws_username, aws_account_id, aws_role_name, aws_password, ca_bundle_file, region_name="us-east-1") -> Tuple[boto3.client, datetime]:
    """Build a boto3 client for ``service`` backed by freshly fetched temporary credentials.

    Calls ``get_aws_token_info`` with the supplied identity details, then
    creates the client from the returned access key, secret key and session
    token.

    Returns:
        A ``(client, expire_time)`` pair. ``expire_time`` is the
        ``awsExpires`` value parsed with the format ``%Y-%m-%dT%H:%M:%SZ``;
        because the trailing ``Z`` is matched literally, the resulting
        ``datetime`` carries no tzinfo.
    """
    token_info = get_aws_token_info(
        aws_username,
        aws_account_id,
        aws_role_name,
        aws_password,
        ca_bundle_file,
    )

    # Parse the expiry first so a malformed payload fails before any client
    # is constructed.
    expires_at = datetime.strptime(token_info["awsExpires"], "%Y-%m-%dT%H:%M:%SZ")

    client_kwargs = {
        "aws_access_key_id": token_info["awsAccessKey"],
        "aws_secret_access_key": token_info["awsSecretKey"],
        "aws_session_token": token_info["awsSessionToken"],
        "region_name": region_name,
    }
    return boto3.client(service, **client_kwargs), expires_at

def get_s3_client(aws_username, aws_account_id, aws_role_name, aws_password, ca_bundle_file):
    """Return the ``(client, expire_time)`` pair from ``get_client`` for the ``"s3"`` service.

    ``region_name`` is not passed, so ``get_client``'s default applies.
    """
    return get_client(
        service="s3",
        aws_username=aws_username,
        aws_account_id=aws_account_id,
        aws_role_name=aws_role_name,
        aws_password=aws_password,
        ca_bundle_file=ca_bundle_file,
    )


## 🔧 Step 2: Configure Secrets and Spark Session

In [None]:

# Connection settings are read from the "my_scope" Databricks secret scope;
# substitute manual values here when testing without secrets.
(
    AWS_USERNAME,
    AWS_PASSWORD,
    AWS_ACCOUNT_ID,
    AWS_ROLE_NAME,
    CA_BUNDLE_FILE,
) = (
    dbutils.secrets.get("my_scope", secret_key)
    for secret_key in (
        "aws_username",
        "aws_password",
        "aws_account_id",
        "aws_role_name",
        "ca_bundle_file",
    )
)

# boto3 S3 client plus the expiry timestamp of the credentials backing it.
# Arguments are positional, in get_s3_client's declared order.
s3_client, s3_expiration_time = get_s3_client(
    AWS_USERNAME,
    AWS_ACCOUNT_ID,
    AWS_ROLE_NAME,
    AWS_PASSWORD,
    CA_BUNDLE_FILE,
)

# Inject credentials into Spark session
def inject_aws_creds_into_spark(creds):
    """Copy the temporary AWS credentials in ``creds`` onto the Spark session's S3A settings.

    ``creds`` must provide the keys ``awsAccessKey``, ``awsSecretKey`` and
    ``awsSessionToken``; each is written to the matching ``fs.s3a.*``
    configuration entry via ``spark.conf.set``.
    """
    # (Spark configuration key, field of the credentials payload), applied in order.
    s3a_settings = (
        ("fs.s3a.access.key", "awsAccessKey"),
        ("fs.s3a.secret.key", "awsSecretKey"),
        ("fs.s3a.session.token", "awsSessionToken"),
    )
    for conf_key, creds_field in s3a_settings:
        spark.conf.set(conf_key, creds[creds_field])

# Fetch a credential set for Spark and apply it to the session. This is a
# separate request to the identity service from the one get_s3_client made.
aws_creds = get_aws_token_info(
    aws_username=AWS_USERNAME,
    aws_account_id=AWS_ACCOUNT_ID,
    aws_role_name=AWS_ROLE_NAME,
    aws_password=AWS_PASSWORD,
    ca_bundle_file=CA_BUNDLE_FILE,
)
inject_aws_creds_into_spark(creds=aws_creds)


## 📥 Step 3: Read Stream from S3 Using Auto Loader

In [None]:

from pyspark.sql.functions import input_file_name
from pyspark.sql.types import StructType, StringType

# Where the raw JSON files land: s3a://<bucket_name>/<subdirectory>
bucket_name = "fdss3-entity-resolution-data-prod"
subdirectory = "input/"
source_path = "s3a://{}/{}".format(bucket_name, subdirectory)

# Explicit schema handed to the reader: two string columns, `id` and `value`.
schema = (
    StructType()
    .add("id", StringType())
    .add("value", StringType())
)

# Auto Loader ("cloudFiles") streaming read of the JSON files, including
# files already present under the source path. Every row is tagged with the
# file it came from in the `source_file` column.
df = (
    spark.readStream
    .format("cloudFiles")
    .schema(schema)
    .option("cloudFiles.includeExistingFiles", "true")
    .option("cloudFiles.format", "json")
    .load(source_path)
    .withColumn("source_file", input_file_name())
)

df.display()


## 💾 Step 4: Write to Bronze Delta Table

In [None]:

bronze_table_path = "/mnt/bronze/entity_data"
checkpoint_path = "/mnt/bronze/checkpoints/entity_data"

# Append the stream to the Bronze Delta table, tracking progress in the
# checkpoint directory.
#
# The archive step later in this notebook moves files out of the source
# prefix. If the write were left running in the background, that step could
# relocate files before they have been ingested. Running with
# `availableNow=True` processes everything currently available and then
# stops, and `awaitTermination()` blocks until that has happened, so the
# following cells only run once ingestion is complete.
bronze_query = (
    df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_path)
    .trigger(availableNow=True)
    .start(bronze_table_path)
)
bronze_query.awaitTermination()


## 📦 Step 5: Archive Processed Files Using boto3

In [None]:

import os

def archive_files(s3_client, bucket_name, source_prefix, archive_prefix):
    """Move every object under ``source_prefix`` into a per-prefix archive folder.

    Each object is copied to
    ``{archive_prefix}{file_prefix}/{filename}``, where ``filename`` is the
    last path component of the key and ``file_prefix`` is the part of
    ``filename`` before its first underscore (the whole filename when it has
    none). The original is deleted after the copy.

    Args:
        s3_client: Object exposing ``list_objects_v2``, ``copy_object`` and
            ``delete_object`` (a boto3 S3 client in this notebook).
        bucket_name: Bucket that holds both the source and archive keys.
        source_prefix: Key prefix to list, e.g. ``"input/"``.
        archive_prefix: Key prefix prepended to archived objects, e.g.
            ``"archive/"``.
    """
    list_kwargs = {"Bucket": bucket_name, "Prefix": source_prefix}

    while True:
        response = s3_client.list_objects_v2(**list_kwargs)

        for obj in response.get("Contents", []):
            key = obj["Key"]
            filename = os.path.basename(key)
            if not filename:
                # Keys ending in "/" (folder placeholders) have no file name.
                continue
            file_prefix = filename.split("_")[0]
            dest_key = f"{archive_prefix}{file_prefix}/{filename}"
            if dest_key == key:
                # Already at its archive location; copying onto itself and
                # then deleting would destroy the object.
                continue
            s3_client.copy_object(Bucket=bucket_name, CopySource={"Bucket": bucket_name, "Key": key}, Key=dest_key)
            s3_client.delete_object(Bucket=bucket_name, Key=key)
            print(f"Archived: {key} → {dest_key}")

        # A single listing call returns only one page of keys; keep following
        # the continuation token until the listing is no longer truncated.
        if not response.get("IsTruncated"):
            break
        list_kwargs["ContinuationToken"] = response["NextContinuationToken"]

# Example invocation, meant to be triggered manually after the stream:
# moves everything under "input/" into "archive/<file prefix>/".
archive_files(
    s3_client=s3_client,
    bucket_name=bucket_name,
    source_prefix="input/",
    archive_prefix="archive/",
)


## ✅ Summary


- Temporary AWS credentials are fetched securely from a custom identity service
- Credentials are injected into Spark for Auto Loader to access S3
- Data is streamed into a Bronze Delta table using `cloudFiles`
- Processed files are archived to structured S3 folders using `boto3`
