In [0]:
# Imports
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, lit, current_timestamp, trim
from pyspark.sql.types import StringType
from datetime import datetime, timedelta
import uuid
import time


In [0]:
# Storage layout: a single container holding one folder per data domain
CONTAINER = "mg-gold-raw-files"

# Data domain (as offered in the "processing_option" widget) -> folder inside the container
DOMAIN_FOLDER_MAP = dict([
    ("Customer", "raw_customer"),
    ("Carrier", "raw_carrier"),
    ("Load Detail", "raw_load_details"),
])

# Lifecycle subfolders used inside each domain folder (key and folder name are the same)
PROCESSING_SUBFOLDERS = {stage: stage for stage in ("incoming", "processed", "failed")}

# Legacy single-domain settings: used as the fallback path when a domain
# is not present in DOMAIN_FOLDER_MAP
DOMAIN_FOLDER, INCOMING_FOLDER, PROCESSED_FOLDER = "raw_load_details", "incoming", "processed"

# Legacy bronze target names
BRONZE_SCHEMA, BRONZE_TABLE = "load_detail", "load_transactions"

In [0]:
# Bronze target per data domain: the schema and table that rows are appended to,
# plus a human-readable description of the feed
BRONZE_TABLE_CONFIG = {
    "Customer": dict(
        schema="customer",
        table="customer_master",
        description="Customer master data from MercuryGate Gold",
    ),
    "Carrier": dict(
        schema="carrier",
        table="carrier_master",
        description="Carrier master data from MercuryGate Gold",
    ),
    "Load Detail": dict(
        schema="load_detail",
        table="load_transactions",
        description="Load/shipment transaction details from MercuryGate Gold",
    ),
}

def get_bronze_config(domain: str) -> dict:
    """Return the bronze target entry of ``BRONZE_TABLE_CONFIG`` for a domain.

    Args:
        domain: Data domain name, e.g. "Customer", "Carrier" or "Load Detail".

    Returns:
        The config dict stored under ``domain`` (the same object, not a copy).

    Raises:
        ValueError: If ``domain`` is not a key of ``BRONZE_TABLE_CONFIG``.
    """
    if domain in BRONZE_TABLE_CONFIG:
        return BRONZE_TABLE_CONFIG[domain]
    raise ValueError(f"Unknown domain: {domain}")

In [0]:
# WidgetManager - Dynamic widget creation based on domain

class WidgetManager:
    """Creates, reads and removes the Databricks widgets used by this notebook.

    All methods are static and use the notebook-provided ``dbutils`` global.
    """
    
    # Widget catalogue keyed by widget name. Each entry has:
    #   type    - "dropdown" or "text" (other values are ignored by create_widgets)
    #   default - initial widget value
    #   choices - dropdown options (dropdown widgets only)
    #   label   - text shown next to the widget
    #   domains - optional; when present the widget is only created for these domains
    WIDGET_DEFINITIONS = {
        "processing_option": {
            "type": "dropdown",
            "default": "Load Detail",
            "choices": ["Customer", "Carrier", "Load Detail"],
            "label": "Data Domain"
        },
        "file_pattern": {
            "type": "text",
            "default": "",
            "label": "File Pattern (optional)"
        },
        "customer_id": {
            "type": "text",
            "default": "",
            "label": "Customer ID (optional)",
            "domains": ["Customer", "Load Detail"]
        },
        "carrier_id": {
            "type": "text",
            "default": "",
            "label": "Carrier ID (optional)",
            "domains": ["Carrier", "Load Detail"]
        }
    }
    
    @staticmethod
    def create_widgets(domain: str = None):
        """Create the notebook widgets described in ``WIDGET_DEFINITIONS``.

        Args:
            domain: When given, widgets whose ``domains`` list does not contain
                it are skipped. When omitted (or empty), every widget is created.
        """
        print("Creating widgets...")
        
        for widget_name, config in WidgetManager.WIDGET_DEFINITIONS.items():
            # Skip domain-specific widgets if not applicable
            if "domains" in config and domain and domain not in config["domains"]:
                continue
            
            # Create widget based on type
            if config["type"] == "dropdown":
                dbutils.widgets.dropdown(
                    widget_name,
                    config["default"],
                    config["choices"],
                    config["label"]
                )
            elif config["type"] == "text":
                dbutils.widgets.text(
                    widget_name,
                    config["default"],
                    config["label"]
                )
        
        print(f" Widgets created for domain: {domain if domain else 'All'}")
    
    @staticmethod
    def remove_all_widgets():
        """Remove every widget from the notebook; failures are printed, not raised."""
        try:
            dbutils.widgets.removeAll()
            print(" All widgets removed")
        except Exception as e:
            print(f"Could not remove widgets: {e}")
    
    @staticmethod
    def get_widget_values() -> dict:
        """Read the current value of every widget in ``WIDGET_DEFINITIONS``.

        Returns:
            Dict of widget name -> value. A widget that cannot be read (for
            example because it was never created) maps to ``None``, so every
            defined widget name is always present as a key.
        """
        values = {}
        for widget_name in WidgetManager.WIDGET_DEFINITIONS:
            try:
                values[widget_name] = dbutils.widgets.get(widget_name)
            except Exception:
                # Only ordinary errors mean "widget unavailable"; KeyboardInterrupt
                # and SystemExit must still be able to stop the notebook.
                values[widget_name] = None
        return values

In [0]:
# Create the notebook widgets via WidgetManager. No domain is passed, so none of the
# domain-specific widgets (customer_id, carrier_id) are filtered out.
WidgetManager.create_widgets()

Creating widgets...
 Widgets created for domain: All


In [0]:
# Simple file processing function with domain folder support
def process_csv_file(file_name: str, domain: str, execution_id: str = None) -> tuple:
    """Read one incoming CSV file and prepare it for the bronze table.

    Steps, in order: resolve the incoming path for the domain, detect encoding /
    delimiter / header from the file itself, read it with Spark, sanitize column
    names, validate the schema, drop duplicate rows, run data quality checks,
    run the detailed schema analysis, add the metadata columns and trim every
    string column.

    Args:
        file_name: Name of the file inside the domain's incoming folder.
        domain: Data domain; domains missing from ``DOMAIN_FOLDER_MAP`` fall
            back to the legacy ``DOMAIN_FOLDER``/``INCOMING_FOLDER`` path.
        execution_id: Optional id stamped into the ``_execution_id`` column.
            Pass the batch execution id so bronze rows can be matched to the
            audit records; when omitted a fresh UUID is generated per file.

    Returns:
        ``(df, metadata)`` where ``metadata`` has the keys ``file_props``,
        ``schema_analysis``, ``duplicates_removed`` and ``quality_issues``.
    """
    
    # Get domain folder
    if domain in DOMAIN_FOLDER_MAP:
        domain_folder = DOMAIN_FOLDER_MAP[domain]
        file_path = f"/mnt/{CONTAINER}/{domain_folder}/{PROCESSING_SUBFOLDERS['incoming']}/{file_name}"
    else:
        # Fallback to legacy path
        file_path = f"/mnt/{CONTAINER}/{DOMAIN_FOLDER}/{INCOMING_FOLDER}/{file_name}"
    
    print(f"Reading: {file_name}")
    print(f"  Path: {file_path}")
    
    # Property detection opens the file with plain Python I/O, hence the /dbfs prefix
    dbfs_path = f"/dbfs{file_path}"
    props = detect_file_properties(dbfs_path)
    
    # Read CSV with detected properties
    df = (
        spark.read.format("csv")
        .option("header", str(props['has_header']).lower())
        .option("inferSchema", "true")
        .option("delimiter", props['delimiter'])
        .option("quote", props['quotechar'])
        .option("encoding", props['encoding'])
        .load(file_path)
    )
    
    print(f"  Rows: {df.count()}")
    
    # Sanitize column names
    df = sanitize_dataframe_columns(df)
    
    # Validate schema
    schema_valid, schema_warnings = validate_schema(df, domain, file_name)
    if not schema_valid:
        print(f"  ⚠ Schema validation warnings:")
        for warning in schema_warnings:
            print(f"    - {warning}")
    
    # Deduplicate
    df, duplicates_removed = deduplicate_dataframe(df)
    
    # Validate data quality
    quality_issues = validate_data_quality(df)
    
    # Perform detailed schema analysis
    schema_analysis = analyze_dataframe_schema(df, file_name)
    
    # Add metadata columns
    df = df.withColumn("_src_file", lit(file_name))
    df = df.withColumn("_ingestion_timestamp", current_timestamp())
    df = df.withColumn("_execution_id", lit(execution_id or str(uuid.uuid4())))
    # duplicates_removed tracked in metadata, not in bronze table
    
    # Trim strings in ONE projection. Chaining a withColumn per column adds a
    # projection each time and bloats the plan on wide files. Names are
    # backtick-quoted because the sanitizer keeps '.', which col() would
    # otherwise read as a struct-field access.
    df = df.select([
        trim(col(f"`{field.name}`")).alias(field.name)
        if isinstance(field.dataType, StringType)
        else col(f"`{field.name}`")
        for field in df.schema.fields
    ])
    
    # Return df with metadata for audit logging
    metadata = {
        'file_props': props,
        'schema_analysis': schema_analysis,
        'duplicates_removed': duplicates_removed,
        'quality_issues': quality_issues
    }
    
    return df, metadata

In [0]:
# Column sanitization functions
import re

def sanitize_column_name(name: str) -> str:
    """Normalize a raw CSV header into a lower-case, underscore-separated name.

    The header is stripped and lower-cased, spaces become underscores,
    ``, ; { } ( ) [ ]`` are removed, every remaining character outside
    ``[a-zA-Z0-9_.$-]`` becomes an underscore, runs of underscores collapse to
    one and leading/trailing underscores are dropped. The result can be an
    empty string when the header holds no allowed character.
    """
    cleaned = name.strip().lower().replace(' ', '_')
    # Applied in order: drop brackets/separators, mask other characters, squeeze underscores
    rewrite_steps = (
        (r'[,;{}()\[\]]', ''),
        (r'[^a-zA-Z0-9_.$-]', '_'),
        (r'_+', '_'),
    )
    for pattern, replacement in rewrite_steps:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip('_')

def sanitize_dataframe_columns(df: DataFrame) -> DataFrame:
    """Rename every column of ``df`` to its sanitized form, keeping names unique.

    Each name goes through ``sanitize_column_name``. Two cases that previously
    produced an unusable DataFrame are now repaired instead of only warned about:

    * a header that sanitizes to an empty string becomes ``col_<position>``;
    * a name that collides with an earlier column gets a numeric suffix
      (``name_2``, ``name_3``, ...), so later ``col(name)`` lookups are not ambiguous.

    Returns:
        A DataFrame with the same data and column order under the new names.
    """
    original_cols = df.columns
    new_columns = []
    taken = set()
    collided = False
    
    for position, original in enumerate(original_cols):
        # Fall back to a positional name when nothing survives sanitization
        base_name = sanitize_column_name(original) or f"col_{position}"
        unique_name = base_name
        suffix = 1
        while unique_name in taken:
            suffix += 1
            unique_name = f"{base_name}_{suffix}"
        if unique_name != base_name:
            collided = True
        taken.add(unique_name)
        new_columns.append(unique_name)
    
    # Check for duplicates
    if collided:
        print("  ⚠ Warning: Duplicate column names after sanitization")
        print("    Colliding columns were given numeric suffixes to keep names unique")
    
    # Show what changed
    changes = [(o, n) for o, n in zip(original_cols, new_columns) if o != n]
    if changes:
        print(f"  Sanitized {len(changes)} column names")
    
    return df.toDF(*new_columns)

In [0]:
# Data quality and deduplication functions

def deduplicate_dataframe(df: DataFrame) -> tuple:
    """Drop fully identical rows.

    Returns:
        ``(deduplicated_df, duplicates_removed)`` where the second item is the
        row count before ``dropDuplicates()`` minus the row count after.
    """
    rows_before = df.count()
    unique_df = df.dropDuplicates()
    duplicates_removed = rows_before - unique_df.count()
    
    if duplicates_removed <= 0:
        print("  No duplicates found")
    else:
        print(f"  Removed {duplicates_removed} duplicate rows")
    
    return unique_df, duplicates_removed

def validate_data_quality(df: DataFrame) -> list:
    """Run basic data quality checks and return the issues found.

    Checks performed:
      * the DataFrame has at least one row;
      * no data column (name not starting with ``_``) consists only of nulls.

    The row count and every column's non-null count come from a single
    aggregation, so the DataFrame is evaluated once rather than once per column.

    Returns:
        List of issue descriptions; empty when all checks pass.
    """
    # Local import: `count` is not among the functions imported at the top of the notebook
    from pyspark.sql.functions import count as _count
    
    issues = []
    
    # Skip metadata columns
    data_columns = [name for name in df.columns if not name.startswith('_')]
    
    # count(lit(1)) counts rows; count(<column>) counts that column's non-null values.
    # Backticks keep names containing '.' from being read as struct-field access.
    aggregates = [_count(lit(1)).alias("q0")]
    aggregates += [
        _count(col(f"`{name}`")).alias(f"q{position}")
        for position, name in enumerate(data_columns, 1)
    ]
    stats = df.agg(*aggregates).collect()[0]
    
    # Check if empty
    if stats[0] == 0:
        issues.append("DataFrame is empty")
    
    # Check for all-null columns
    for position, col_name in enumerate(data_columns, 1):
        if stats[position] == 0:
            issues.append(f"Column '{col_name}' is all nulls")
    
    if issues:
        print(f"  ⚠ Data quality issues: {'; '.join(issues)}")
    else:
        print(f"   Data quality checks passed")
    
    return issues

In [0]:
# Schema Validation - Detects when MercuryGate changes export format
import logging
import sys

# Logger used by the schema validation below. The handler is attached only when the
# logger has none yet, so re-running this cell does not duplicate log lines.
logger = logging.getLogger("SchemaValidator")
if len(logger.handlers) == 0:
    logger.setLevel(logging.INFO)
    _stdout_handler = logging.StreamHandler(sys.stdout)
    _stdout_handler.setFormatter(
        logging.Formatter("%(asctime)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S")
    )
    logger.addHandler(_stdout_handler)

# Expected (sanitized) column names per domain. An empty list means "no expectation":
# validate_schema skips the comparison for that domain.
EXPECTED_SCHEMAS = {
    "Load Detail": [
        # extract bookkeeping
        "extract_timestamp",
        "extract_batch_id",
        # load identity
        "load_number",
        "load_created_date",
        "customer_name",
        # lane
        "origin_city",
        "origin_state",
        "destination_city",
        "destination_state",
        # freight
        "total_weight",
        "total_pieces",
        "equipment_type",
        # tendering
        "current_tender_status",
        "current_tender_carrier",
        "current_tender_version",
        "current_tender_timestamp",
        "tender_action_code",
        "tender_user",
        # status and dates
        "load_status",
        "ready_date",
        "delivery_date",
    ],
    # Add customer schema columns here
    "Customer": [],
    # Add carrier schema columns here
    "Carrier": [],
}

def validate_schema(df: DataFrame, domain: str, file_name: str) -> tuple:
    """Compare the DataFrame's columns with ``EXPECTED_SCHEMAS[domain]``.

    Columns whose name starts with ``_`` are ignored. Every difference is
    logged as a warning prefixed with ``file_name``.

    Returns:
        ``(True, [])`` when the domain has no expected schema or the column
        sets match; otherwise ``(False, warnings)`` with one message for
        missing columns and/or one for unexpected columns.
    """
    expected_list = EXPECTED_SCHEMAS.get(domain)
    if not expected_list:
        logger.info(f"No expected schema defined for domain: {domain}")
        return True, []
    
    expected_cols = set(expected_list)
    # Metadata columns are not part of the comparison
    actual_cols = {name for name in df.columns if not name.startswith('_')}
    
    warnings = []
    
    def _flag(msg):
        # Record the finding for the caller and log it against the file
        warnings.append(msg)
        logger.warning(f"{file_name}: {msg}")
    
    missing_cols = expected_cols - actual_cols
    if missing_cols:
        _flag(f"Missing expected columns: {sorted(missing_cols)}")
    
    extra_cols = actual_cols - expected_cols
    if extra_cols:
        _flag(f"Extra unexpected columns: {sorted(extra_cols)}")
    
    if warnings:
        return False, warnings
    
    logger.info(f"{file_name}: Schema validation passed")
    return True, []

In [0]:
# Advanced File Property Detection
import chardet
import csv
from pathlib import Path

def detect_file_encoding(file_path: str, sample_size: int = 100000) -> str:
    """Guess a file's text encoding from its first ``sample_size`` bytes.

    Returns:
        The encoding reported by ``chardet`` when its confidence is above 0.7,
        otherwise ``'utf-8'``. ``'utf-8'`` is also returned when the file cannot
        be read or detection raises.
    """
    fallback = 'utf-8'
    try:
        # Only a leading sample is read
        with open(file_path, 'rb') as handle:
            guess = chardet.detect(handle.read(sample_size))
        
        if not guess or not (guess['confidence'] > 0.7):
            print(f"  Low confidence encoding detection, using UTF-8")
            return fallback
        
        encoding = guess['encoding']
        print(f"  Detected encoding: {encoding} (confidence: {guess['confidence']:.2f})")
        return encoding
    except Exception as e:
        print(f"  Error detecting encoding: {e}, using UTF-8")
        return fallback

def detect_csv_properties(file_path: str, encoding: str = 'utf-8') -> dict:
    """Sniff delimiter, quote character and header presence from a CSV file.

    Only the first five lines are inspected, using ``csv.Sniffer``.

    Returns:
        Dict with keys ``encoding``, ``delimiter``, ``quotechar`` and
        ``has_header``. If the file cannot be read or sniffing fails, the
        defaults (comma, double quote, header present) are returned with the
        given ``encoding``.
    """
    try:
        head_lines = []
        with open(file_path, 'r', encoding=encoding) as handle:
            for _ in range(5):
                head_lines.append(handle.readline())
        sample = ''.join(head_lines)
        
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(sample)
        
        properties = dict(
            encoding=encoding,
            delimiter=dialect.delimiter,
            quotechar=dialect.quotechar,
            has_header=sniffer.has_header(sample),
        )
        
        print(f"  CSV Properties: delimiter={properties['delimiter']!r}, "
              f"quote={properties['quotechar']!r}, header={properties['has_header']}")
        
        return properties
        
    except Exception as e:
        print(f"  Could not detect CSV properties: {e}, using defaults")
        return dict(encoding=encoding, delimiter=',', quotechar='"', has_header=True)

def detect_file_properties(file_path: str) -> dict:
    """Detect a file's encoding, then its CSV properties using that encoding.

    Returns:
        The dict produced by ``detect_csv_properties`` for ``file_path``.
    """
    print("  Detecting file properties...")
    
    # The encoding is detected first because the CSV sniffing needs it to open the file
    detected_encoding = detect_file_encoding(file_path)
    return detect_csv_properties(file_path, detected_encoding)

In [0]:
# Detailed Schema Analysis
from pyspark.sql import functions as F

def analyze_dataframe_schema(df: DataFrame, file_name: str) -> dict:
    """Print and return a per-column profile of ``df``.

    For every column the profile holds its Spark type, null / non-null counts,
    null percentage, up to three distinct non-null sample values and, for
    columns whose type string contains a numeric type name, the column sum.

    Returns:
        Dict with ``total_rows``, ``total_columns`` and ``schema_details``
        (one dict per column, in column order).
    """
    divider = "  " + "-" * 100
    
    print(f"\n  Analyzing schema for: {file_name}")
    print(divider)
    
    total_rows = df.count()
    
    # Null counts for every column, gathered by a single select
    null_exprs = [
        F.count(F.when(F.col(name).isNull(), name)).alias(name) for name in df.columns
    ]
    null_counts = df.select(null_exprs).collect()[0].asDict()
    
    # A column counts as numeric when its dtype string mentions one of these type names
    numeric_markers = ['int', 'long', 'double', 'float', 'decimal']
    numeric_cols = [
        name for name, dtype in df.dtypes
        if any(marker in dtype.lower() for marker in numeric_markers)
    ]
    
    numeric_sums = {}
    if numeric_cols:
        sums_row = df.agg(*(F.sum(name).alias(name) for name in numeric_cols)).collect()[0]
        for name in numeric_cols:
            raw_sum = sums_row[name]
            # A null sum is reported as 0
            numeric_sums[name] = 0 if raw_sum is None else float(raw_sum)
    
    schema_details = []
    
    print(f"  {'Column':<30} {'Type':<12} {'Non-Null':<12} {'Null %':<8} {'Sum':<15} {'Samples'}")
    print(divider)
    
    for col_name, dtype in df.dtypes:
        null_count = null_counts[col_name]
        non_null = total_rows - null_count
        if total_rows > 0:
            null_pct = null_count / total_rows * 100
        else:
            null_pct = 0
        
        # Up to three distinct non-null values; shortened to 20 characters for display only
        not_null_values = df.select(col_name).where(F.col(col_name).isNotNull())
        samples = not_null_values.distinct().limit(3).collect()
        sample_str = ', '.join(str(row[0])[:20] for row in samples)
        
        # The sum column stays blank for non-numeric columns
        if col_name in numeric_sums:
            sum_str = f"{numeric_sums[col_name]:,.2f}"
        else:
            sum_str = ""
        
        print(f"  {col_name:<30} {dtype:<12} {non_null:<12,} {null_pct:>6.1f}% {sum_str:<15} {sample_str}")
        
        schema_details.append(dict(
            column=col_name,
            type=dtype,
            non_null_count=non_null,
            null_count=null_count,
            null_percentage=null_pct,
            sum=numeric_sums.get(col_name),
            samples=[str(row[0]) for row in samples],
        ))
    
    print(divider)
    print(f"  Total Rows: {total_rows:,} | Total Columns: {len(df.columns)}\n")
    
    return dict(
        total_rows=total_rows,
        total_columns=len(df.columns),
        schema_details=schema_details,
    )

In [0]:
# Performance metrics logging
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType, TimestampType
import time

def log_performance_metrics(file_name: str, file_size: int, row_count: int, 
                           duration_seconds: float, execution_id: str):
    """Append one throughput record to the performance metrics table.

    Args:
        file_name: Name of the processed file.
        file_size: File size in bytes; ``0``/``None`` is recorded as 0.0 MB.
        row_count: Number of rows processed.
        duration_seconds: Processing time; when not positive, both throughput
            figures are recorded as 0.0.
        execution_id: Id of the batch run that processed the file.
    """
    
    metrics_table = "dev_bronze.bronze_audit.performance_metrics"
    
    # The audit schema must exist before the table can be created. This function
    # runs before log_to_audit in the batch loop, so it cannot rely on that call.
    spark.sql("CREATE SCHEMA IF NOT EXISTS dev_bronze.bronze_audit")
    
    # Create table if not exists
    spark.sql(f"""CREATE TABLE IF NOT EXISTS {metrics_table} (
        execution_id STRING,
        file_name STRING,
        file_size_mb DOUBLE,
        row_count LONG,
        duration_seconds DOUBLE,
        rows_per_second DOUBLE,
        mb_per_second DOUBLE,
        timestamp TIMESTAMP
    ) USING DELTA""")
    
    # Calculate metrics. Every DOUBLE field is forced to a Python float: the
    # schema-verified createDataFrame below rejects an int (such as the old
    # literal 0 fallbacks) for a DoubleType field.
    duration_seconds = float(duration_seconds)
    file_size_mb = float(file_size) / (1024 * 1024) if file_size else 0.0
    if duration_seconds > 0:
        rows_per_sec = row_count / duration_seconds
        mb_per_sec = file_size_mb / duration_seconds
    else:
        rows_per_sec = 0.0
        mb_per_sec = 0.0
    
    # Define schema
    schema = StructType([
        StructField("execution_id", StringType(), True),
        StructField("file_name", StringType(), True),
        StructField("file_size_mb", DoubleType(), True),
        StructField("row_count", LongType(), True),
        StructField("duration_seconds", DoubleType(), True),
        StructField("rows_per_second", DoubleType(), True),
        StructField("mb_per_second", DoubleType(), True),
        StructField("timestamp", TimestampType(), True)
    ])
    
    from datetime import datetime
    metrics_df = spark.createDataFrame([
        (execution_id, file_name, file_size_mb, row_count, duration_seconds, 
         rows_per_sec, mb_per_sec, datetime.now())
    ], schema)
    
    metrics_df.write.mode("append").saveAsTable(metrics_table)
    
    print(f"   Performance: {row_count:,} rows in {duration_seconds:.2f}s ({rows_per_sec:.0f} rows/sec)")

In [0]:
# Write to bronze with smart schema evolution
def write_to_bronze(df: DataFrame, domain: str):
    """Append ``df`` to the domain's bronze table, creating it on first use.

    When the table already exists, its columns are compared with the incoming
    ones: added and dropped columns are printed and reported through
    ``send_notification`` (INFO for added, WARNING for dropped), and the append
    is done with ``mergeSchema`` enabled.

    Returns:
        The fully qualified name of the table that was written.
    """
    target = get_bronze_config(domain)
    schema_name = target['schema']
    table_name = f"dev_bronze.{schema_name}.{target['table']}"
    
    print(f"Writing to: {table_name}")
    
    # Create schema if needed
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS dev_bronze.{schema_name}")
    
    # First write - the append creates the table
    if not spark.catalog.tableExists(table_name):
        df.write.mode("append").saveAsTable(table_name)
        print(f" Written {df.count()} rows (table created)")
        return table_name
    
    # limit(0) gives access to the existing column list without reading data
    current_cols = set(spark.table(table_name).limit(0).columns)
    incoming_cols = set(df.columns)
    
    added_cols = incoming_cols - current_cols
    dropped_cols = current_cols - incoming_cols
    
    if added_cols or dropped_cols:
        print(f" Schema changes detected:")
    
    if added_cols:
        print(f" NEW columns (will be added): {sorted(added_cols)}")
        send_notification(
            message=f"New columns detected in {domain}: {sorted(added_cols)}",
            severity="INFO",
            domain=domain
        )
    
    if dropped_cols:
        # Columns present in the table but absent from this file are reported as a warning
        print(f" DROPPED columns (old data had these): {sorted(dropped_cols)}")
        send_notification(
            message=f"WARNING: Columns dropped in {domain}: {sorted(dropped_cols)}",
            severity="WARNING",
            domain=domain
        )
    
    # Write with mergeSchema enabled
    df.write.mode("append").option("mergeSchema", "true").saveAsTable(table_name)
    print(f"   Written {df.count()} rows (schema auto-merged)")
    
    return table_name

In [0]:
# Enhanced audit logging with complete details
def log_to_audit(file_name: str, status: str, rows: int, execution_id: str, 
                 file_info: dict = None, schema_analysis: dict = None, 
                 file_props: dict = None, processing_notes: dict = None):
    """Append one record to the file processing audit table.

    Args:
        file_name: Name of the processed file.
        status: Outcome label, e.g. "SUCCESS" or "FAILED".
        rows: Number of rows processed.
        execution_id: Id of the batch run.
        file_info: Optional dict with ``size``, ``modified`` and ``start_time``.
            Without it, size and modified time are null and the start time is
            the current time.
        schema_analysis: Optional schema profile, stored as JSON.
        file_props: Optional detected file properties, stored as JSON.
        processing_notes: Optional extra details, stored as JSON.
    """
    
    from datetime import datetime
    from pyspark.sql.types import StructType, StructField, StringType, LongType, TimestampType
    import json
    
    audit_table = "dev_bronze.bronze_audit.file_processing_audit"
    
    # Create schema
    spark.sql("CREATE SCHEMA IF NOT EXISTS dev_bronze.bronze_audit")
    
    # Create audit table
    spark.sql(f"""
        CREATE TABLE IF NOT EXISTS {audit_table} (
            execution_id STRING,
            file_name STRING,
            file_size LONG,
            file_modified_timestamp TIMESTAMP,
            start_time TIMESTAMP,
            end_time TIMESTAMP,
            user_name STRING,
            status STRING,
            rows_processed LONG,
            schema_info STRING,
            file_properties STRING,
            processing_details STRING
        ) USING DELTA
    """)
    
    # Get current user. The lookup is best effort, but only ordinary errors are
    # swallowed so KeyboardInterrupt / SystemExit can still stop the notebook.
    try:
        user = dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get()
    except Exception:
        user = "unknown"
    
    # Extract file info
    file_size = file_info.get('size') if file_info else None
    file_modified = file_info.get('modified') if file_info else None
    start_time = file_info.get('start_time') if file_info else datetime.now()
    
    # Convert complex objects to JSON. default=str keeps a value that is not
    # JSON-native from aborting the audit write.
    schema_json = json.dumps(schema_analysis, default=str) if schema_analysis else None
    props_json = json.dumps(file_props, default=str) if file_props else None
    details_json = json.dumps(processing_notes, default=str) if processing_notes else None
    
    # Define schema
    schema = StructType([
        StructField("execution_id", StringType(), True),
        StructField("file_name", StringType(), True),
        StructField("file_size", LongType(), True),
        StructField("file_modified_timestamp", TimestampType(), True),
        StructField("start_time", TimestampType(), True),
        StructField("end_time", TimestampType(), True),
        StructField("user_name", StringType(), True),
        StructField("status", StringType(), True),
        StructField("rows_processed", LongType(), True),
        StructField("schema_info", StringType(), True),
        StructField("file_properties", StringType(), True),
        StructField("processing_details", StringType(), True)
    ])
    
    # Insert audit record; the end time is the moment the record is written
    now = datetime.now()
    
    audit_df = spark.createDataFrame([
        (execution_id, file_name, file_size, file_modified, start_time, now, 
         user, status, rows, schema_json, props_json, details_json)
    ], schema)
    
    audit_df.write.mode("append").saveAsTable(audit_table)
    
    print(f"   Logged to audit table")

In [0]:
# Notification system for operational alerts

def send_notification(message: str, severity: str = "INFO", domain: str = None, execution_id: str = None):
    """Record an operational notification and print it.

    The notification is appended to the notifications table and echoed to the
    cell output with a marker chosen by ``severity``.

    Args:
        message: Notification text.
        severity: "ERROR", "WARNING", or anything else (printed as informational).
        domain: Optional data domain the notification relates to.
        execution_id: Optional id of the batch run.
    """
    
    from datetime import datetime
    from pyspark.sql.types import StructType, StructField, StringType, TimestampType
    
    notifications_table = "dev_bronze.bronze_audit.notifications"
    
    # The audit schema must exist before the table can be created. Notifications
    # can be sent (e.g. from write_to_bronze) before any audit record has been
    # written, so the schema is created here as well.
    spark.sql("CREATE SCHEMA IF NOT EXISTS dev_bronze.bronze_audit")
    
    # Create table if not exists
    spark.sql(f"""CREATE TABLE IF NOT EXISTS {notifications_table} (
        timestamp TIMESTAMP,
        severity STRING,
        message STRING,
        domain STRING,
        execution_id STRING
    ) USING DELTA""")
    
    schema = StructType([
        StructField("timestamp", TimestampType(), True),
        StructField("severity", StringType(), True),
        StructField("message", StringType(), True),
        StructField("domain", StringType(), True),
        StructField("execution_id", StringType(), True)
    ])
    
    notif_df = spark.createDataFrame([
        (datetime.now(), severity, message, domain, execution_id)
    ], schema)
    
    notif_df.write.mode("append").saveAsTable(notifications_table)
    
    # Print with color coding
    if severity == "ERROR":
        print(f"  🔴 [{severity}] {message}")
    elif severity == "WARNING":
        print(f"  🟡 [{severity}] {message}")
    else:
        print(f"  ℹ️  [{severity}] {message}")

In [0]:
# Archive file to domain-specific folder
def archive_file(file_name: str, domain: str, status: str = "SUCCESS"):
    """Move a file out of the incoming folder into a dated archive folder.

    The destination is ``<processed|failed>/<YYYYMMDD>/`` - ``processed`` when
    ``status`` is "SUCCESS", ``failed`` for any other status. For domains in
    ``DOMAIN_FOLDER_MAP`` that folder sits inside the domain folder; for any
    other domain the legacy source folder is used and the archive folder sits
    directly under the container.
    """
    
    from datetime import datetime
    date_str = datetime.now().strftime("%Y%m%d")
    succeeded = status == "SUCCESS"
    
    if domain not in DOMAIN_FOLDER_MAP:
        # Fallback to legacy
        source = f"/mnt/{CONTAINER}/{DOMAIN_FOLDER}/{INCOMING_FOLDER}/{file_name}"
        dest_folder = PROCESSED_FOLDER if succeeded else "failed"
    else:
        domain_folder = DOMAIN_FOLDER_MAP[domain]
        source = f"/mnt/{CONTAINER}/{domain_folder}/{PROCESSING_SUBFOLDERS['incoming']}/{file_name}"
        outcome_key = 'processed' if succeeded else 'failed'
        dest_folder = f"{domain_folder}/{PROCESSING_SUBFOLDERS[outcome_key]}"
    
    dated_dir = f"/mnt/{CONTAINER}/{dest_folder}/{date_str}"
    dest = f"{dated_dir}/{file_name}"
    
    # The dated folder is created before the move
    dbutils.fs.mkdirs(dated_dir)
    dbutils.fs.mv(source, dest)
    
    print(f"   Archived to: {dest_folder}/{date_str}/")

In [0]:
# FileProcessor Class - Wraps all processing logic

class FileProcessor:
    """Runs bronze ingestion for the incoming CSV files of one data domain.

    One instance represents one execution: every audit, metrics and
    notification record written during the run carries ``execution_id``.
    """
    
    def __init__(self, execution_id: str = None):
        """Start a run, generating a UUID execution id when none is given."""
        self.execution_id = execution_id or str(uuid.uuid4())
        self.start_time = time.time()
        print(f" FileProcessor initialized (Execution ID: {self.execution_id})")
    
    def process_file(self, file_name: str, domain: str) -> tuple:
        """Read and prepare a single file; never raises for ordinary errors.

        Returns:
            ``(success, (df, metadata), row_count, duration_seconds)``. On
            failure ``df`` is ``None``, ``row_count`` is 0 and ``metadata``
            holds the error text under the ``'error'`` key.
        """
        file_start = time.time()
        
        try:
            # Use the existing process_csv_file function
            result = process_csv_file(file_name, domain)
            
            # Unpack the tuple (df, metadata)
            if isinstance(result, tuple):
                df, metadata = result
            else:
                # Fallback if not a tuple
                df, metadata = result, {}
            row_count = df.count()
            duration = time.time() - file_start
            
            return (True, (df, metadata), row_count, duration)
        except Exception as e:
            print(f" Error processing {file_name}: {e}")
            duration = time.time() - file_start
            # Keep the reason so the caller can record it instead of losing it in the output
            return (False, (None, {'error': str(e)}), 0, duration)
    
    def process_batch(self, domain: str, file_pattern: str = None) -> dict:
        """Process every not-yet-processed CSV file in the domain's incoming folder.

        A file is skipped when the audit table already holds a SUCCESS record
        with the same file name. Each remaining file is prepared, written to
        bronze, archived and logged; a file that fails to process or to write is
        archived to the failed folder and logged as FAILED with the error text.

        Args:
            domain: Data domain; unknown domains use the legacy incoming folder.
            file_pattern: Optional substring a file name must contain.

        Returns:
            Dict with ``success_count``, ``fail_count`` and ``table_name`` (the
            bronze table written to, or ``None`` if nothing was written).
        """
        
        print("=" * 100)
        print("BRONZE LAYER BATCH INGESTION - FileProcessor")
        print("=" * 100)
        print(f"Domain: {domain}")
        print(f"File pattern: {file_pattern if file_pattern else 'All files'}")
        print(f"Execution ID: {self.execution_id}")
        
        # Get domain-specific incoming folder
        if domain in DOMAIN_FOLDER_MAP:
            domain_folder = DOMAIN_FOLDER_MAP[domain]
            incoming_path = f"/mnt/{CONTAINER}/{domain_folder}/{PROCESSING_SUBFOLDERS['incoming']}"
            print(f"Using domain folder: {domain_folder}")
        else:
            incoming_path = f"/mnt/{CONTAINER}/{DOMAIN_FOLDER}/{INCOMING_FOLDER}"
            print(f"Using legacy folder")
        
        # List files
        all_files = dbutils.fs.ls(incoming_path)
        csv_files = [f for f in all_files if f.name.endswith('.csv') and not f.isDir()]
        
        if file_pattern:
            csv_files = [f for f in csv_files if file_pattern in f.name]
        
        print(f"\nFound {len(csv_files)} CSV files")
        
        # Get already processed files. A failing query (typically: the audit table
        # does not exist yet) means "no history"; interrupts are not swallowed.
        try:
            processed_files_df = spark.sql(
                "SELECT DISTINCT file_name FROM dev_bronze.bronze_audit.file_processing_audit "
                "WHERE status = 'SUCCESS'"
            )
            processed_files = {row.file_name for row in processed_files_df.collect()}
            print(f"Already processed: {len(processed_files)} files")
        except Exception:
            processed_files = set()
            print("No audit history found")
        
        # Filter to unprocessed
        files_to_process = [f for f in csv_files if f.name not in processed_files]
        print(f"Files to process: {len(files_to_process)}")
        
        if len(files_to_process) == 0:
            print("\nAll files already processed!")
            return {'success_count': 0, 'fail_count': 0, 'table_name': None}
        
        # Process files
        print("\n" + "=" * 100)
        print("PROCESSING FILES")
        print("=" * 100)
        
        success_count = 0
        fail_count = 0
        table_name = None
        
        for idx, file_info in enumerate(files_to_process, 1):
            print(f"\n[{idx}/{len(files_to_process)}] Processing: {file_info.name}")
            print("-" * 100)
            
            success, result_tuple, row_count, duration = self.process_file(file_info.name, domain)
            
            # Unpack result
            if result_tuple:
                df, metadata = result_tuple
            else:
                df, metadata = None, {}
            error_text = metadata.get('error') if metadata else None
            
            if success:
                # Write to bronze. A write error fails this file only, so the
                # remaining files still run and the failure gets an audit record.
                try:
                    table_name = write_to_bronze(df, domain)
                except Exception as e:
                    print(f" Error writing {file_info.name} to bronze: {e}")
                    success = False
                    error_text = str(e)
            
            if success:
                # Archive
                archive_file(file_info.name, domain, "SUCCESS")
                
                # Read the size once; the same value feeds metrics and audit
                file_size = getattr(file_info, 'size', 0)
                
                # Log performance
                log_performance_metrics(file_info.name, file_size, row_count, duration, self.execution_id)
                
                # Collect file metadata for audit
                file_metadata = {
                    'size': file_size,
                    'modified': datetime.fromtimestamp(file_info.modificationTime / 1000),
                    'start_time': datetime.now() - timedelta(seconds=duration)
                }
                
                # Log with complete details
                log_to_audit(
                    file_name=file_info.name,
                    status="SUCCESS",
                    rows=row_count,
                    execution_id=self.execution_id,
                    file_info=file_metadata,
                    schema_analysis=metadata.get('schema_analysis'),
                    file_props=metadata.get('file_props'),
                    processing_notes={
                        'duplicates_removed': metadata.get('duplicates_removed', 0),
                        'quality_issues': metadata.get('quality_issues', [])
                    }
                )
                
                # Send notifications for issues
                if metadata.get('quality_issues'):
                    send_notification(
                        message=f"Data quality issues in {file_info.name}: {'; '.join(metadata['quality_issues'])}",
                        severity="WARNING",
                        domain=domain,
                        execution_id=self.execution_id
                    )
                
                print(f"   SUCCESS")
                success_count += 1
            else:
                # Archive to failed
                archive_file(file_info.name, domain, "FAILED")
                log_to_audit(
                    file_info.name, "FAILED", 0, self.execution_id,
                    processing_notes={'error': error_text} if error_text else None
                )
                
                # Send error notification, including the reason when it is known
                failure_message = f"Failed to process {file_info.name}"
                if error_text:
                    failure_message = f"{failure_message}: {error_text}"
                send_notification(
                    message=failure_message,
                    severity="ERROR",
                    domain=domain,
                    execution_id=self.execution_id
                )
                
                fail_count += 1
        
        print("\n" + "=" * 100)
        print("BATCH PROCESSING COMPLETE")
        print("=" * 100)
        print(f"Successfully processed: {success_count}")
        print(f"Failed: {fail_count}")
        
        # Alert on high failure rate (at least as many failures as successes)
        if fail_count > 0 and fail_count >= success_count:
            send_notification(
                message=f"High failure rate in {domain}: {fail_count} failed, {success_count} succeeded",
                severity="ERROR",
                domain=domain,
                execution_id=self.execution_id
            )
        
        return {
            'success_count': success_count,
            'fail_count': fail_count,
            'table_name': table_name
        }

In [0]:
# Main execution using FileProcessor class
#
# Runs the bronze-layer batch ingestion for the domain selected in the
# notebook widgets, then previews the target bronze table and the audit log.
# `uuid` and `time` are not used in this cell and are already imported in the
# notebook's Imports cell, so they are intentionally not re-imported here.

# Get widget values; .get() supplies a default when a key is missing
widget_values = WidgetManager.get_widget_values()
domain = widget_values.get("processing_option", "Load Detail")
file_pattern = widget_values.get("file_pattern", "")

# Create FileProcessor and run batch processing
processor = FileProcessor()
results = processor.process_batch(domain, file_pattern)

# Display results if files were processed.
# The table name is read defensively so that a result without a 'table_name'
# entry (or an empty/None result) skips the preview instead of raising.
bronze_table_name = (results or {}).get('table_name')
if bronze_table_name:
    print("\nBronze table contents:")
    # Most recently ingested rows first
    display(spark.table(bronze_table_name).orderBy(col("_ingestion_timestamp").desc()).limit(20))
    
    print("\nAudit table:")
    # Latest audit entries across all runs (not filtered to this execution)
    display(spark.sql("""
        SELECT file_name, status, rows_processed, start_time 
        FROM dev_bronze.bronze_audit.file_processing_audit 
        ORDER BY start_time DESC LIMIT 10
    """))


 FileProcessor initialized (Execution ID: e9ae7fb7-83db-425d-83f0-d6288edb3379)
BRONZE LAYER BATCH INGESTION - FileProcessor
Domain: Load Detail
File pattern: All files
Execution ID: e9ae7fb7-83db-425d-83f0-d6288edb3379
Using domain folder: raw_load_details

Found 9 CSV files
No audit history found
Files to process: 9

PROCESSING FILES

[1/9] Processing: mg_gold_extract_20260105_1000.csv
----------------------------------------------------------------------------------------------------
Reading: mg_gold_extract_20260105_1000.csv
  Path: /mnt/mg-gold-raw-files/raw_load_details/incoming/mg_gold_extract_20260105_1000.csv
  Detecting file properties...
  Detected encoding: ascii (confidence: 1.00)
  CSV Properties: delimiter=',', quote='"', header=True
  Rows: 50
2026-02-03 20:42:00 - INFO - mg_gold_extract_20260105_1000.csv: Schema validation passed
  No duplicates found
   Data quality checks passed

  Analyzing schema for: mg_gold_extract_20260105_1000.csv
  ----------------------------

extract_timestamp,extract_batch_id,load_number,load_created_date,customer_name,origin_city,origin_state,destination_city,destination_state,total_weight,total_pieces,equipment_type,current_tender_status,current_tender_carrier,current_tender_version,current_tender_timestamp,tender_action_code,tender_user,load_status,ready_date,delivery_date,_src_file,_ingestion_timestamp,_execution_id,new_column_added_by_mercurygate
2026-01-05T14:00:00Z,MG_EXTRACT_20260105_1400,53990004LN,2026-01-05T10:00:00Z,TEST CUSTOMER,Dallas,TX,Houston,TX,25000,15,Van,ACCEPTED,TEST CARRIER,176770000000,2026-01-05T11:00:00Z,A,test@test.com,Booked,2026-01-06T08:00:00Z,2026-01-07T17:00:00Z,mg_gold_extract_with_duplicates.csv,2026-02-03T20:45:33.534577Z,4ef76ac4-8880-43c9-b675-12dae212a294,
2026-01-05T14:00:00Z,MG_EXTRACT_20260105_1400,53990015LN,2026-01-05T10:00:00Z,TEST CUSTOMER,Dallas,TX,Houston,TX,25000,15,Van,ACCEPTED,TEST CARRIER,176770000000,2026-01-05T11:00:00Z,A,test@test.com,Booked,2026-01-06T08:00:00Z,2026-01-07T17:00:00Z,mg_gold_extract_with_duplicates.csv,2026-02-03T20:45:33.534577Z,4ef76ac4-8880-43c9-b675-12dae212a294,
2026-01-05T14:00:00Z,MG_EXTRACT_20260105_1400,53990017LN,2026-01-05T10:00:00Z,TEST CUSTOMER,Dallas,TX,Houston,TX,25000,15,Van,ACCEPTED,TEST CARRIER,176770000000,2026-01-05T11:00:00Z,A,test@test.com,Booked,2026-01-06T08:00:00Z,2026-01-07T17:00:00Z,mg_gold_extract_with_duplicates.csv,2026-02-03T20:45:33.534577Z,4ef76ac4-8880-43c9-b675-12dae212a294,
2026-01-05T14:00:00Z,MG_EXTRACT_20260105_1400,53990002LN,2026-01-05T10:00:00Z,TEST CUSTOMER,Dallas,TX,Houston,TX,25000,15,Van,ACCEPTED,TEST CARRIER,176770000000,2026-01-05T11:00:00Z,A,test@test.com,Booked,2026-01-06T08:00:00Z,2026-01-07T17:00:00Z,mg_gold_extract_with_duplicates.csv,2026-02-03T20:45:33.534577Z,4ef76ac4-8880-43c9-b675-12dae212a294,
2026-01-05T14:00:00Z,MG_EXTRACT_20260105_1400,53990009LN,2026-01-05T10:00:00Z,TEST CUSTOMER,Dallas,TX,Houston,TX,25000,15,Van,ACCEPTED,TEST CARRIER,176770000000,2026-01-05T11:00:00Z,A,test@test.com,Booked,2026-01-06T08:00:00Z,2026-01-07T17:00:00Z,mg_gold_extract_with_duplicates.csv,2026-02-03T20:45:33.534577Z,4ef76ac4-8880-43c9-b675-12dae212a294,
2026-01-05T14:00:00Z,MG_EXTRACT_20260105_1400,53990013LN,2026-01-05T10:00:00Z,TEST CUSTOMER,Dallas,TX,Houston,TX,25000,15,Van,ACCEPTED,TEST CARRIER,176770000000,2026-01-05T11:00:00Z,A,test@test.com,Booked,2026-01-06T08:00:00Z,2026-01-07T17:00:00Z,mg_gold_extract_with_duplicates.csv,2026-02-03T20:45:33.534577Z,4ef76ac4-8880-43c9-b675-12dae212a294,
2026-01-05T14:00:00Z,MG_EXTRACT_20260105_1400,53990007LN,2026-01-05T10:00:00Z,TEST CUSTOMER,Dallas,TX,Houston,TX,25000,15,Van,ACCEPTED,TEST CARRIER,176770000000,2026-01-05T11:00:00Z,A,test@test.com,Booked,2026-01-06T08:00:00Z,2026-01-07T17:00:00Z,mg_gold_extract_with_duplicates.csv,2026-02-03T20:45:33.534577Z,4ef76ac4-8880-43c9-b675-12dae212a294,
2026-01-05T14:00:00Z,MG_EXTRACT_20260105_1400,53990014LN,2026-01-05T10:00:00Z,TEST CUSTOMER,Dallas,TX,Houston,TX,25000,15,Van,ACCEPTED,TEST CARRIER,176770000000,2026-01-05T11:00:00Z,A,test@test.com,Booked,2026-01-06T08:00:00Z,2026-01-07T17:00:00Z,mg_gold_extract_with_duplicates.csv,2026-02-03T20:45:33.534577Z,4ef76ac4-8880-43c9-b675-12dae212a294,
2026-01-05T14:00:00Z,MG_EXTRACT_20260105_1400,53990011LN,2026-01-05T10:00:00Z,TEST CUSTOMER,Dallas,TX,Houston,TX,25000,15,Van,ACCEPTED,TEST CARRIER,176770000000,2026-01-05T11:00:00Z,A,test@test.com,Booked,2026-01-06T08:00:00Z,2026-01-07T17:00:00Z,mg_gold_extract_with_duplicates.csv,2026-02-03T20:45:33.534577Z,4ef76ac4-8880-43c9-b675-12dae212a294,
2026-01-05T14:00:00Z,MG_EXTRACT_20260105_1400,53990008LN,2026-01-05T10:00:00Z,TEST CUSTOMER,Dallas,TX,Houston,TX,25000,15,Van,ACCEPTED,TEST CARRIER,176770000000,2026-01-05T11:00:00Z,A,test@test.com,Booked,2026-01-06T08:00:00Z,2026-01-07T17:00:00Z,mg_gold_extract_with_duplicates.csv,2026-02-03T20:45:33.534577Z,4ef76ac4-8880-43c9-b675-12dae212a294,



Audit table:


file_name,status,rows_processed,start_time
mg_gold_extract_with_duplicates.csv,SUCCESS,20,2026-02-03T20:45:20.368594Z
mg_gold_extract_schema_change.csv,SUCCESS,15,2026-02-03T20:44:54.387008Z
mg_gold_extract_20260105_1800.csv,SUCCESS,50,2026-02-03T20:44:29.484884Z
mg_gold_extract_20260105_1600.csv,SUCCESS,50,2026-02-03T20:44:05.447454Z
mg_gold_extract_20260105_1400.csv,SUCCESS,50,2026-02-03T20:43:42.344011Z
mg_gold_extract_20260105_1240.csv,SUCCESS,3,2026-02-03T20:43:19.506423Z
mg_gold_extract_20260105_1220.csv,SUCCESS,3,2026-02-03T20:42:56.729188Z
mg_gold_extract_20260105_1200.csv,SUCCESS,50,2026-02-03T20:42:32.303986Z
mg_gold_extract_20260105_1000.csv,SUCCESS,50,2026-02-03T20:42:05.982885Z
