# Bronze Layer: Ingest Policies CSV to Delta
Azure Synapse Analytics - Medallion Architecture

**Pattern**: Read all columns as strings (Medallion best practice)

In [None]:
# Bootstrap cell: make sure PyYAML is importable in this session.
import importlib
import subprocess
import sys

try:
    import yaml
except ImportError:
    # Only shell out to pip when the package is actually missing.
    # check=False keeps the install best-effort: a pip failure is not raised
    # here; the import below fails instead with a ModuleNotFoundError.
    subprocess.run([sys.executable, "-m", "pip", "install", "pyyaml", "-q"], check=False)
    # A package installed after interpreter start can be hidden by stale
    # import-finder caches, so refresh them before retrying the import.
    importlib.invalidate_caches()
    import yaml

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp, lit, to_date, col, input_file_name
import logging
import yaml
import uuid

In [None]:
# --- Storage locations: ADLS Gen2, addressed through the abfss scheme ---
# NOTE(review): the account name looks like a template placeholder - confirm
# it is substituted before the notebook is run.
STORAGE_ACCOUNT = "<storage-account-name>"
FILES_ROOT = "abfss://files@{}.dfs.core.windows.net".format(STORAGE_ACCOUNT)
TABLES_ROOT = "abfss://tables@{}.dfs.core.windows.net".format(STORAGE_ACCOUNT)

# --- Ingestion source, schema definition, and Delta target ---
SOURCE_PATH = "/".join((FILES_ROOT, "samples/batch/policies.csv"))
TARGET_PATH = "/".join((TABLES_ROOT, "bronze/bronze_policies"))
PARTITION_COLUMN = "ingestion_date"
SCHEMA_PATH = "/".join((FILES_ROOT, "config/schemas/bronze/bronze_policies.yaml"))

# --- Logging: INFO level on the root logger, module-named logger for this file ---
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
def validate_schema(df, schema_path):
    """Check a DataFrame against the YAML schema stored at ``schema_path``.

    The YAML document must contain a ``required_columns`` list whose entries
    carry ``name`` and ``nullable`` keys.

    Validation is best-effort and never raises: if the schema cannot be
    loaded or parsed, or a Spark call fails, a warning is logged and the
    check is skipped.

    Args:
        df: Spark DataFrame to validate.
        schema_path: Location of the YAML schema file, readable by Spark.

    Returns:
        bool: ``False`` when one or more required columns are missing from
        ``df``; ``True`` when validation passed or was skipped. Null values
        in non-nullable columns are logged as warnings but do not fail the
        check.
    """
    try:
        # Resolve the session explicitly instead of relying on a
        # notebook-injected ``spark`` global, so this also works when the
        # code runs outside a notebook.
        session = SparkSession.builder.getOrCreate()
        schema_content = session.read.text(schema_path, wholetext=True).collect()[0][0]
        schema = yaml.safe_load(schema_content)

        # Parse every definition up front so a malformed schema is reported
        # as "skipped" before any Spark job is launched.
        required = [
            (col_def['name'], col_def['nullable'])
            for col_def in schema['required_columns']
        ]

        # Presence check first: it is cheap and must not be masked by the
        # generic "skipped" handler below.
        present = set(df.columns)
        missing = [name for name, _ in required if name not in present]
        if missing:
            logger.error(f"✗ Schema validation failed: missing required column(s): {missing}")
            return False

        for col_name, nullable in required:
            if nullable:
                continue
            # One Spark job per non-nullable column.
            null_count = df.filter(col(col_name).isNull()).count()
            if null_count > 0:
                logger.warning(f"Found {null_count} null values in non-nullable column: {col_name}")

        logger.info("✓ Schema validation passed")
        return True
    except Exception as e:
        # Deliberate best-effort: an unreadable or malformed schema must not
        # block ingestion.
        logger.warning(f"Schema validation skipped: {str(e)}")
        return True

In [None]:
def main():
    """Ingest the policies CSV into the Bronze Delta table.

    Reads ``SOURCE_PATH`` as CSV with a header row and schema inference
    disabled (every column stays a string), adds ingestion metadata columns,
    runs the schema check, and appends the result to the Delta table at
    ``TARGET_PATH``, partitioned by ``PARTITION_COLUMN``.

    Raises:
        Exception: any failure during read, validation, or write is logged
            and re-raised unchanged.
    """
    spark = SparkSession.builder.getOrCreate()

    try:
        logger.info(f"Reading policies from {SOURCE_PATH}")
        df = (
            spark.read.format("csv")
            .option("header", "true")
            .option("inferSchema", "false")
            .load(SOURCE_PATH)
        )

        # One identifier per run, stamped on every ingested row.
        process_id = str(uuid.uuid4())

        df_enriched = (
            df.withColumn("ingestion_timestamp", current_timestamp())
            # Named via PARTITION_COLUMN so the column always matches the
            # partitionBy() call below, even if the constant is changed.
            .withColumn(PARTITION_COLUMN, to_date(current_timestamp()))
            .withColumn("source_system", lit("legacy_csv"))
            .withColumn("process_id", lit(process_id))
            .withColumn("source_file_name", input_file_name())
        )

        record_count = df_enriched.count()
        logger.info(f"Read {record_count} policies")

        # The return value is not acted on; validation only logs.
        validate_schema(df_enriched, SCHEMA_PATH)

        logger.info(f"Writing to {TARGET_PATH}")
        (
            df_enriched.write.format("delta")
            .mode("append")
            .partitionBy(PARTITION_COLUMN)
            .option("mergeSchema", "true")
            .save(TARGET_PATH)
        )

        logger.info("✓ Bronze policies ingestion completed")

    except Exception as e:
        logger.error(f"✗ Policies ingestion failed: {str(e)}")
        raise

In [None]:
# Run the ingestion when this cell executes.
main()