# 📦 Simple Databricks Auto Loader Pipeline
This notebook demonstrates a simple Auto Loader pipeline that:
- Loads data from S3 (CSV, JSON, Parquet, or tab-delimited TXT; files with any other extension are treated as CSV)
- Extracts the target table name from the file name (e.g., `edm_entity_2024-06-01.csv` → `edm_entity`)
- Performs checkpointing and schema inference
- Adds metadata columns like source file and load timestamp
- Moves processed files to an S3 archive zone
- Only writes to tables that exist in the Bronze schema of the `entity_resolution_dev` catalog

In [None]:
from pyspark.sql.functions import input_file_name, current_timestamp, lit
import re

# ----------------------------------------
# CONFIGURATION
# ----------------------------------------
# S3 prefix that is listed for incoming files, and the prefix that processed
# files are moved to afterwards.
landing_zone = 's3://your-bucket/landing-zone/'
archive_zone = 's3://your-bucket/archive-zone/'

# Target namespace; files are only loaded into tables already present here.
catalog = 'entity_resolution_dev'
schema = 'bronze'

# Root directory under which per-table Auto Loader schema state is stored.
checkpoint_path = '/tmp/autoloader/checkpoints'

# ----------------------------------------
# GET LIST OF EXISTING BRONZE TABLES
# ----------------------------------------
# Collected once up front; the main loop checks extracted names against it.
tables = [
    table_info.name
    for table_info in spark.catalog.listTables(f'{catalog}.{schema}')
]
print('✅ Available Bronze tables:', tables)


In [None]:
# ----------------------------------------
# HELPERS
# ----------------------------------------
def extract_table_name(file_path):
    """
    Extract the target table name from a file path.

    Only the final path component is inspected, so dots or word characters
    in bucket/directory names never influence the result. A trailing date
    stamp introduced by an underscore (``_YYYY-MM-DD`` or ``_YYYYMMDD``,
    optionally followed by more text such as a time) is removed.

    e.g., s3://.../edm_entity_2024-06-01.csv → edm_entity
          s3://.../edm_entity.csv            → edm_entity
          s3://.../edm_entity_v2.json        → edm_entity_v2

    Args:
        file_path: Full path of the file (e.g. an ``s3://`` URI).

    Returns:
        The table name, or None when the path is empty, points at a
        directory (ends with '/'), has no extension, or does not start
        with a letter, digit or underscore.
    """
    if not file_path:
        return None

    # Work on the file name only; a directory path ends with '/' and yields ''.
    file_name = file_path.rsplit('/', 1)[-1]
    stem, dot, _extension = file_name.partition('.')
    if not dot:
        return None

    # Cut the stem at the date stamp. The negative lookahead keeps long
    # numeric ids (more than 8 digits) from being mistaken for a date.
    dated = re.search(r'_\d{4}-?\d{2}-?\d{2}(?!\d)', stem)
    if dated:
        stem = stem[:dated.start()]

    # Keep the leading run of identifier characters, as the original did.
    match = re.match(r'[A-Za-z0-9_]+', stem)
    return match.group(0) if match else None

def detect_format(file_path):
    """
    Map a file path to a format name based on its extension.

    The text after the last '.' is lower-cased and returned when it is one
    of 'csv', 'json', 'parquet' or 'txt'; anything else falls back to 'csv'.
    """
    recognized = ('csv', 'json', 'parquet', 'txt')
    # rpartition leaves the whole string in the last slot when there is no '.'.
    suffix = file_path.rpartition('.')[2].lower()
    if suffix in recognized:
        return suffix
    return 'csv'

# Progress marker for the notebook output, printed at the end of the helper cell.
print('✅ Configuration complete. Ready to process incoming files.')


In [None]:
# ----------------------------------------
# MAIN: PROCESS EACH NEW FILE IN LANDING ZONE
# ----------------------------------------
# For every file in the landing zone:
#   1. derive the target table from the file name and skip unknown tables,
#   2. ingest the file into the Bronze Delta table with Auto Loader,
#   3. move the file to the archive zone once the ingest has finished.
#
# Auto Loader ('cloudFiles') is a Structured Streaming source, so it has to be
# read with spark.readStream and written with writeStream. The availableNow
# trigger processes what is currently there and then stops, which keeps the
# notebook behaving like a batch job.
files = dbutils.fs.ls(landing_zone)

for file in files:
    file_path = file.path

    # dbutils.fs.ls reports directories with a trailing '/'; only files are loaded.
    if file_path.endswith('/'):
        print(f"⚠️ {file_path} is a directory. Skipping.")
        continue

    table_name = extract_table_name(file_path)

    if not table_name:
        print(f"⚠️ Could not extract table name from {file_path}. Skipping.")
        continue

    if table_name not in tables:
        print(f"⚠️ Table {table_name} not found in {catalog}.{schema}. Skipping.")
        continue

    target_table = f'{catalog}.{schema}.{table_name}'
    file_format = detect_format(file_path)
    print(f"🔄 Processing: {file_path} as {file_format} into {target_table}")

    # 'txt' is not a cloudFiles format. The header/delimiter options below show
    # that .txt files are tab-delimited tables, so they go through the CSV reader.
    reader_format = 'csv' if file_format == 'txt' else file_format

    # Auto Loader options
    options = {
        'cloudFiles.format': reader_format,
        'cloudFiles.schemaLocation': f'{checkpoint_path}/{table_name}/schema',
        'cloudFiles.inferColumnTypes': 'true'
    }

    # File-type-specific tweaks
    if file_format == 'csv':
        options['header'] = 'true'
        options['delimiter'] = ','
    elif file_format == 'txt':
        options['header'] = 'true'
        options['delimiter'] = '\t'

    # Read file with Auto Loader and attach the metadata columns
    stream_df = (
        spark.readStream
        .format('cloudFiles')
        .options(**options)
        .load(file_path)
        .withColumn('source_file', lit(file_path))
        .withColumn('load_timestamp', current_timestamp())
    )

    # Write to Bronze Delta Table.
    # NOTE(review): the checkpoint is shared per table while the source path
    # changes per file; this relies on Auto Loader's directory-listing mode
    # accepting a changed input path for an existing checkpoint — confirm.
    query = (
        stream_df.writeStream
        .format('delta')
        .outputMode('append')
        .option('checkpointLocation', f'{checkpoint_path}/{table_name}/checkpoint')
        .option('mergeSchema', 'true')
        .trigger(availableNow=True)
        .toTable(target_table)
    )
    # Blocks until the file has been ingested; raises if the stream failed, so
    # a file is never archived unless its load completed.
    query.awaitTermination()

    # Archive the file. The target is built from the file name rather than by
    # string-replacing the landing prefix, which silently produced a wrong
    # target whenever the listed path did not contain landing_zone verbatim.
    archive_path = archive_zone.rstrip('/') + '/' + file.name
    dbutils.fs.mv(file_path, archive_path)
    print(f"✅ Archived: {file_path} → {archive_path}")

print('🎉 Auto Loader pipeline execution complete.')
