# 🔐 Databricks Auto Loader with Temporary AWS Credentials and Archiving


This notebook implements a secure and modular pipeline in Databricks that:
- Fetches temporary AWS credentials from a custom identity service
- Injects them into Spark config for S3 Auto Loader access
- Streams data into a Bronze Delta table using Auto Loader
- Archives processed files using `boto3`
- Provides an expiry check (`is_token_expired`) as the building block for refreshing credentials in long-running jobs


## 🔧 Configuration

In [None]:

import os
import requests
import boto3
from datetime import datetime, timedelta

# Identity service configuration
# These values form the request that get_temp_aws_creds() POSTs to identity_url.
identity_url = "https://epi.v4ic-identity.ssov.factset.com/creds"
# Username/password are read from a secret scope rather than hard-coded.
# NOTE(review): dbutils is not imported in this notebook — presumably the
# Databricks-provided notebook global; confirm when running elsewhere.
aws_username = dbutils.secrets.get("my_scope", "aws_username")
aws_password = dbutils.secrets.get("my_scope", "aws_password")
aws_account_id = "123456789012"
aws_role_name = "my-role"
# Passed as `verify=` to requests.post() in get_temp_aws_creds().
ca_bundle_file = "/dbfs/path/to/ca.pem"  # Optional

# S3 paths
# Both prefixes are concatenated directly into keys/paths further down
# (f"s3a://{bucket}/{source_prefix}" and f"{archive_prefix}{file_prefix}/..."),
# so each must keep its trailing "/".
bucket = "your-bucket"
source_prefix = "input/"
archive_prefix = "archive/"


## 🔑 Helper: Fetch Temporary AWS Credentials

In [None]:

def get_temp_aws_creds(timeout=30):
    """Request temporary AWS credentials from the identity service.

    POSTs the configured username, password, account id and role name as JSON
    to ``identity_url``, verifying TLS against ``ca_bundle_file``.

    Args:
        timeout: Seconds to wait for the identity service (passed straight to
            ``requests.post``). Without a timeout ``requests`` waits
            indefinitely, which would hang the notebook if the service stalls.

    Returns:
        dict with the keys ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY``,
        ``AWS_SESSION_TOKEN`` and ``AWS_EXPIRATION``, copied from the
        ``awsAccessKey``, ``awsSecretKey``, ``awsSessionToken`` and
        ``awsExpires`` fields of the JSON response.

    Raises:
        requests.HTTPError: if the service answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within ``timeout`` seconds.
        KeyError: if the response JSON lacks one of the expected fields.
    """
    payload = {
        "aws_username": aws_username,
        "aws_password": aws_password,
        "aws_account_id": aws_account_id,
        "aws_role_name": aws_role_name,
    }
    response = requests.post(
        identity_url,
        json=payload,
        verify=ca_bundle_file,
        timeout=timeout,
    )
    # Fail loudly on an error status instead of trying to parse an error body.
    response.raise_for_status()
    data = response.json()

    return {
        "AWS_ACCESS_KEY_ID": data["awsAccessKey"],
        "AWS_SECRET_ACCESS_KEY": data["awsSecretKey"],
        "AWS_SESSION_TOKEN": data["awsSessionToken"],
        "AWS_EXPIRATION": data["awsExpires"],
    }


## 🚀 Inject Credentials into Spark Config

In [None]:

def inject_aws_creds_into_spark(creds: dict):
    """Copy the temporary credentials in ``creds`` into the Spark session config.

    Args:
        creds: Mapping with ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY``
            and ``AWS_SESSION_TOKEN`` entries.

    Raises:
        KeyError: if ``creds`` is missing one of those entries.
    """
    # (Spark/Hadoop S3A setting, key in `creds`) — applied in this order.
    s3a_settings = (
        ("fs.s3a.access.key", "AWS_ACCESS_KEY_ID"),
        ("fs.s3a.secret.key", "AWS_SECRET_ACCESS_KEY"),
        ("fs.s3a.session.token", "AWS_SESSION_TOKEN"),
    )
    for conf_key, creds_key in s3a_settings:
        spark.conf.set(conf_key, creds[creds_key])


## ♻️ Credential Refresh Logic

In [None]:

# Fetch one set of temporary credentials when this cell runs and push them
# into the Spark session config.
# NOTE(review): nothing visible in this notebook calls is_token_expired() or
# repeats these two steps, so a long-running job must re-run them itself
# before the credentials lapse.
aws_creds = get_temp_aws_creds()
inject_aws_creds_into_spark(aws_creds)

def is_token_expired(creds: dict, buffer_minutes=5) -> bool:
    """Return True when the credentials expire within ``buffer_minutes``.

    Args:
        creds: Mapping whose ``AWS_EXPIRATION`` entry is a UTC timestamp such
            as ``2024-01-01T12:00:00Z``. Other ISO 8601 spellings (fractional
            seconds, an explicit ``+00:00`` offset) are accepted as well; a
            timestamp without any offset is treated as UTC.
        buffer_minutes: Safety margin; credentials that lapse within this many
            minutes from now already count as expired.

    Returns:
        True if ``now + buffer_minutes`` is later than the expiry time.

    Raises:
        KeyError: if ``creds`` has no ``AWS_EXPIRATION`` entry.
        ValueError: if the timestamp cannot be parsed.
    """
    # Local import: the notebook's top-level import only brings in
    # ``datetime`` and ``timedelta``.
    from datetime import timezone

    raw_expiry = creds["AWS_EXPIRATION"]
    try:
        expiry = datetime.strptime(raw_expiry, "%Y-%m-%dT%H:%M:%SZ")
    except ValueError:
        # Not the plain "...Z" form; fall back to general ISO 8601 parsing.
        # "Z" is rewritten because older fromisoformat() versions reject it.
        expiry = datetime.fromisoformat(raw_expiry.replace("Z", "+00:00"))
    if expiry.tzinfo is None:
        expiry = expiry.replace(tzinfo=timezone.utc)

    # Compare timezone-aware values; datetime.utcnow() is naive and deprecated.
    now = datetime.now(timezone.utc)
    return now + timedelta(minutes=buffer_minutes) > expiry


## 📥 Read From S3 Using Auto Loader with Temporary Credentials

In [None]:

from pyspark.sql.functions import input_file_name
from pyspark.sql.types import StructType, StringType

# Explicit schema for the incoming JSON records: two string columns.
schema = StructType()
schema.add("id", StringType())
schema.add("value", StringType())

source_path = "s3a://" + bucket + "/" + source_prefix

# Auto Loader ("cloudFiles") stream over the source prefix, with an extra
# column recording the input file name of each row.
df = (
    spark.readStream.format("cloudFiles")
    .schema(schema)
    .option("cloudFiles.includeExistingFiles", "true")
    .option("cloudFiles.format", "json")
    .load(source_path)
    .withColumn("source_file", input_file_name())
)


## 💾 Write to Bronze Delta Table

In [None]:

bronze_table_path = "/mnt/bronze/data/"
checkpoint_path = "/mnt/bronze/checkpoints/"

# Start the append-only Delta sink. Format, output mode and the checkpoint
# option are handed to start() as keyword arguments instead of being chained
# on the writer.
df.writeStream.start(
    path=bronze_table_path,
    format="delta",
    outputMode="append",
    checkpointLocation=checkpoint_path,
)


## 📦 Archive Processed Files Using boto3

In [None]:

def archive_files(creds, bucket, source_prefix, archive_prefix):
    """Move every object under ``source_prefix`` to ``archive_prefix``.

    Each object is copied to ``{archive_prefix}{file_prefix}/{filename}``,
    where ``filename`` is the last path component of the key and
    ``file_prefix`` is the part of ``filename`` before its first ``_``. The
    original object is deleted after the copy.

    NOTE: objects are selected by prefix only. This function does not check
    whether the streaming job has already ingested a file, so call it only
    once the files under ``source_prefix`` are known to be processed.

    Args:
        creds: Mapping with ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY``
            and ``AWS_SESSION_TOKEN`` entries.
        bucket: Name of the bucket holding both source and archive keys.
        source_prefix: Key prefix to archive from.
        archive_prefix: Key prefix to archive to; concatenated as-is, so it
            should end with ``/``.
    """
    s3 = boto3.client(
        "s3",
        aws_access_key_id=creds["AWS_ACCESS_KEY_ID"],
        aws_secret_access_key=creds["AWS_SECRET_ACCESS_KEY"],
        aws_session_token=creds["AWS_SESSION_TOKEN"],
    )

    # A single list_objects_v2 call is capped at 1000 keys; the paginator
    # follows continuation tokens so larger prefixes are archived completely.
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=source_prefix):
        for obj in page.get("Contents", []):
            key = obj["Key"]
            filename = os.path.basename(key)
            if not filename:
                # Key ends with "/" (e.g. the folder placeholder for the
                # prefix itself): there is no file to archive, and moving it
                # would create "archive//" and delete the placeholder.
                continue
            file_prefix = filename.split("_")[0]
            dest_key = f"{archive_prefix}{file_prefix}/{filename}"
            # Copy first, delete second: a failed copy raises before the
            # source object is removed.
            s3.copy_object(
                Bucket=bucket,
                CopySource={"Bucket": bucket, "Key": key},
                Key=dest_key,
            )
            s3.delete_object(Bucket=bucket, Key=key)
            print(f"Archived {key} → {dest_key}")


## ✅ Summary


- Temporary AWS credentials securely retrieved from identity service
- Injected into Spark config for Auto Loader to access S3
- Optional refresh logic checks if token is near expiry
- Files are archived to a prefix-based S3 structure using `boto3`

> You can modularize these helpers into a shared module for reuse across notebooks or jobs.
