In [None]:
%pip install boto3

In [None]:
import os
import re
import boto3
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp, lit

# ---------------------------
# CONFIGURATION
# ---------------------------
# Pipeline settings. The AWS values are placeholders that must be replaced
# before the notebook is run.
CONFIG = dict(
    aws_access_key="<your_access_key>",
    aws_secret_key="<your_secret_key>",
    aws_session_token="<your_session_token>",  # Optional
    landing_zone="s3://your-bucket/landing-zone/",
    archive_zone="s3://your-bucket/archive-zone/",
    checkpoint_path="/tmp/autoloader/checkpoints",
    catalog="entity_resolution_dev",
    schema="bronze",
)

# Export the AWS credentials through the process environment in one step.
# The session token falls back to an empty string when the key is absent.
os.environ.update(
    AWS_ACCESS_KEY_ID=CONFIG["aws_access_key"],
    AWS_SECRET_ACCESS_KEY=CONFIG["aws_secret_key"],
    AWS_SESSION_TOKEN=CONFIG.get("aws_session_token", ""),
)

# ---------------------------
# BOTO3 & SPARK SETUP
# ---------------------------
# Module-level S3 client shared by the helper functions below. No explicit
# credentials are passed here; presumably boto3 picks them up from the
# AWS_* environment variables set earlier in this file — confirm.
s3 = boto3.client("s3")
# Reuse the active Spark session if one exists, otherwise create one.
spark = SparkSession.builder.getOrCreate()

# ---------------------------
# HELPERS
# ---------------------------
def get_valid_tables(catalog, schema):
    """Return the names of the tables listed under ``catalog.schema``.

    The lookup goes through the module-level ``spark`` session's catalog.
    """
    qualified_schema = f"{catalog}.{schema}"
    table_names = []
    for table in spark.catalog.listTables(qualified_schema):
        table_names.append(table.name)
    return table_names

def extract_table_name(file_path, valid_tables):
    """Return the longest table name that prefixes the file's base name.

    Only the part of the file name before its first ``.`` is considered.
    Returns ``None`` when no entry of ``valid_tables`` is a prefix of it.
    """
    stem = os.path.basename(file_path).partition(".")[0]
    candidates = [name for name in valid_tables if stem.startswith(name)]
    if not candidates:
        return None
    # Every candidate is a prefix of the same stem, so the longest one is the
    # most specific match (e.g. "orders_items" wins over "orders").
    return max(candidates, key=len)

def list_s3_files(bucket, prefix):
    """Yield an ``s3://bucket/key`` URI for each object under ``prefix``.

    Results are fetched page by page with the ``list_objects_v2`` paginator
    of the module-level ``s3`` client. Keys ending in ``/`` are skipped.
    """
    pages = s3.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=prefix)
    for page in pages:
        # A page without a "Contents" entry contributes nothing.
        keys = (entry["Key"] for entry in page.get("Contents", []))
        yield from (f"s3://{bucket}/{key}" for key in keys if not key.endswith("/"))

def _split_s3_uri(uri):
    """Split ``s3://bucket/key`` into ``(bucket, key)``; raise ValueError otherwise."""
    parsed = re.match(r"s3://([^/]+)/(.+)", uri)
    if parsed is None:
        raise ValueError(f"Not a valid s3://bucket/key URI: {uri!r}")
    return parsed.groups()


def move_s3_file(source_uri, destination_uri):
    """Move one S3 object: copy it to ``destination_uri``, then delete the source.

    Args:
        source_uri: ``s3://bucket/key`` URI of the object to move.
        destination_uri: ``s3://bucket/key`` URI to move it to.

    Raises:
        ValueError: if either URI is not of the form ``s3://bucket/key``.

    Any error raised by the copy propagates before the delete is attempted,
    so a failed copy leaves the source object in place.
    """
    src_bucket, src_key = _split_s3_uri(source_uri)
    dest_bucket, dest_key = _split_s3_uri(destination_uri)

    # Managed copy: unlike a single copy_object call (limited to 5 GB), this
    # switches to a multipart copy for large objects.
    s3.copy({"Bucket": src_bucket, "Key": src_key}, dest_bucket, dest_key)
    s3.delete_object(Bucket=src_bucket, Key=src_key)
    print(f"📦 Moved file to archive: {destination_uri}")

def build_read_options(file_format, checkpoint_subpath):
    """Build the Auto Loader (``cloudFiles``) reader options for one file type.

    Args:
        file_format: Lower-case file extension without the dot
            (e.g. ``"csv"``, ``"txt"``, ``"parquet"``).
        checkpoint_subpath: Location used for ``cloudFiles.schemaLocation``.

    Returns:
        A dict of reader options. Delimited text files (``csv``, and the
        tab-separated ``txt`` / ``tsv``) are all read with the ``csv`` format,
        a header row, and the matching delimiter; any other extension is
        passed through unchanged as the format name.
    """
    # Extension -> field delimiter for files that must go through the CSV reader.
    # "txt" is not an Auto Loader format name; the header/delimiter options only
    # make sense for the csv reader, so tab-separated files are mapped onto it.
    delimiters = {"csv": ",", "txt": "\t", "tsv": "\t"}
    delimiter = delimiters.get(file_format)

    options = {
        "cloudFiles.format": file_format if delimiter is None else "csv",
        "cloudFiles.schemaLocation": checkpoint_subpath,
        "cloudFiles.inferColumnTypes": "true",
    }
    if delimiter is not None:
        options.update({"header": "true", "delimiter": delimiter})
    return options

# ---------------------------
# MAIN INGESTION FUNCTION
# ---------------------------
def run_auto_loader_pipeline(config):
    """Load every recognised file in the landing zone into its table, then archive it.

    For each object under ``config["landing_zone"]`` the target table is
    resolved from the file name, the file is appended to
    ``catalog.schema.table`` as Delta (with ``load_timestamp`` and
    ``source_file`` columns added), and the object is moved to the same
    relative location under ``config["archive_zone"]``.

    Args:
        config: Mapping with the keys ``catalog``, ``schema``,
            ``landing_zone``, ``archive_zone`` and ``checkpoint_path``.

    Raises:
        ValueError: if ``landing_zone`` is not an ``s3://bucket[/prefix]`` URI.

    Files that match no table, or that have no file extension, are skipped
    with a message. Any error while loading or moving a file propagates and
    stops the run.
    """
    print("🚀 Starting Auto Loader Pipeline...")

    catalog = config["catalog"]
    schema = config["schema"]
    valid_tables = get_valid_tables(catalog, schema)
    print("✅ Available tables:", valid_tables)

    # Parse the landing zone once. The prefix is optional so a bucket root
    # ("s3://bucket" or "s3://bucket/") is accepted instead of crashing.
    parsed_zone = re.fullmatch(r"s3://([^/]+)(?:/(.*))?", config["landing_zone"])
    if parsed_zone is None:
        raise ValueError(f"landing_zone is not an s3:// URI: {config['landing_zone']!r}")
    landing_bucket = parsed_zone.group(1)
    landing_prefix = (parsed_zone.group(2) or "").rstrip("/")
    if landing_prefix:
        # Force a trailing slash so "landing-zone" does not also match
        # sibling prefixes such as "landing-zone-old/".
        landing_prefix += "/"
    landing_root = f"s3://{landing_bucket}/{landing_prefix}"
    archive_root = config["archive_zone"].rstrip("/") + "/"

    # Materialise the listing up front: objects are moved out of the landing
    # zone while this loop runs.
    files = list(list_s3_files(landing_bucket, landing_prefix))

    for file_path in files:
        table_name = extract_table_name(file_path, valid_tables)
        if not table_name:
            print(f"❌ Skipping unrecognized file: {file_path}")
            continue

        # Take the extension from the final path component only, so a dot in
        # the bucket or a folder name is never mistaken for a file format.
        file_format = os.path.splitext(file_path)[1][1:].lower()
        if not file_format:
            print(f"❌ Skipping file without extension: {file_path}")
            continue

        print(f"📥 Loading `{file_path}` → `{catalog}.{schema}.{table_name}`")

        read_options = build_read_options(file_format, f"{config['checkpoint_path']}/{table_name}/schema")

        # NOTE(review): Databricks documents `cloudFiles` as a Structured
        # Streaming source. Confirm this batch `spark.read` call works on the
        # target runtime; if not, switch to `readStream` / `writeStream` with
        # an available-now trigger and a checkpoint location.
        df = (
            spark.read
            .format("cloudFiles")
            .options(**read_options)
            .load(file_path)
            .withColumn("load_timestamp", current_timestamp())
            .withColumn("source_file", lit(file_path))
        )

        (
            df.write
            .format("delta")
            .mode("append")
            .option("mergeSchema", "true")
            .saveAsTable(f"{catalog}.{schema}.{table_name}")
        )

        # Re-root the object's path relative to the landing zone under the
        # archive zone, regardless of trailing slashes in the config values.
        archive_path = archive_root + file_path[len(landing_root):]
        move_s3_file(file_path, archive_path)

    print("✅ Auto Loader Pipeline complete.")

# ---------------------------
# EXECUTE PIPELINE
# ---------------------------
# Runs the full ingestion immediately when this cell/script is executed,
# using the module-level CONFIG defined at the top of the file.
run_auto_loader_pipeline(CONFIG)
