# 🧪 Delta Lake Streaming Pipeline
This notebook demonstrates a structured data ingestion pipeline using Apache Spark Structured Streaming.
- Reads CSV data from S3 as a stream
- Writes to Delta Lake tables
- Supports merge/upsert and overwrite strategies
- Includes validation and performance logging

In [None]:
import logging
from datetime import datetime
from typing import List, Optional
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import current_timestamp

In [None]:
# Named logger used by the pipeline code in this notebook.
logger = logging.getLogger('DeltaIngest')
# Configure root logging so records at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)

## 🔧 Configuration Class

In [None]:
class StreamConfig:
    """Settings for one ingestion job: target table, source and checkpoint.

    Every argument is optional. The defaults are the values this class used
    to hard-code, so ``StreamConfig()`` yields exactly the same attributes as
    before, while callers can now override any of them.

    Args:
        catalog: Catalog part of the target table identifier.
        schema: Schema part of the target table identifier.
        table_name: Name of the target table.
        source_path: Path the input data is read from.
        checkpoint_path: Path used as the stream's checkpoint location.
        merge_keys: Column names used as merge keys. Defaults to
            ``["entity_id"]``. The value is copied into a new list so that
            instances never share (or mutate) the caller's sequence.
    """

    def __init__(
        self,
        catalog: str = "hive_metastore",
        schema: str = "bronze",
        table_name: str = "entity_table",
        source_path: str = "s3a://your-bucket/entity-data/",
        checkpoint_path: str = "s3a://your-bucket/checkpoints/entity",
        merge_keys: Optional[List[str]] = None,
    ):
        self.catalog = catalog
        self.schema = schema
        self.table_name = table_name
        self.source_path = source_path
        self.checkpoint_path = checkpoint_path
        # None (rather than a list literal) as the default avoids the shared
        # mutable-default pitfall; each instance gets its own list.
        self.merge_keys = ["entity_id"] if merge_keys is None else list(merge_keys)

## 🚀 Spark Session

In [None]:
def setup_spark():
    """Return a SparkSession named "DeltaLakeIngest" configured for Delta Lake.

    Sets the Delta SQL extension and the Delta catalog implementation on the
    session builder, then calls ``getOrCreate()``.
    """
    # Applied in this order, matching the original builder chain.
    delta_settings = {
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    }
    builder = SparkSession.builder.appName("DeltaLakeIngest")
    for option, value in delta_settings.items():
        builder = builder.config(option, value)
    return builder.getOrCreate()

## ✅ Validation Logic

In [None]:
class DataValidation:
    """Checks that the merge-key columns of a DataFrame are populated."""

    def __init__(self, merge_keys: List[str]):
        # Column names that must be neither null nor the empty string.
        self.merge_keys = merge_keys

    def validate(self, df: DataFrame) -> bool:
        """Return True when no merge-key column holds a null or empty value.

        Keys are checked in order. At the first key with at least one null or
        empty-string row, a warning is logged and False is returned without
        checking the remaining keys.

        NOTE(review): each check calls ``count()``, so this presumably expects
        a batch (non-streaming) DataFrame - confirm against callers.
        """
        for column_name in self.merge_keys:
            column = df[column_name]
            is_blank = column.isNull() | (column == '')
            blank_rows = df.filter(is_blank).count()
            if blank_rows > 0:
                logger.warning(f"Primary key '{column_name}' has null or empty values")
                return False
        return True

## 🔁 Merge Logic

In [None]:
def merge_to_delta(spark: SparkSession, df: DataFrame, config: StreamConfig):
    """Upsert ``df`` into the configured table with a SQL ``MERGE INTO``.

    ``df`` is registered as the temporary view ``source`` (replacing any view
    already registered under that name) and merged into
    ``<catalog>.<schema>.<table_name>``. Rows matching on every merge key are
    updated with ``UPDATE SET *``; all other rows are inserted with
    ``INSERT *``.

    Args:
        spark: Session used to execute the MERGE statement.
        df: Rows to merge into the target table.
        config: Supplies the target table identifier and ``merge_keys``.

    Raises:
        ValueError: If ``config.merge_keys`` is empty. Without any key the ON
            clause would be empty and the statement would only fail later as
            a SQL parse error.
    """
    # Fail fast, before touching the session's temp views.
    if not config.merge_keys:
        raise ValueError(
            "config.merge_keys must contain at least one column to build the MERGE condition"
        )

    target_table = f"{config.catalog}.{config.schema}.{config.table_name}"
    df.createOrReplaceTempView("source")
    # Identifiers are interpolated as-is; they come from StreamConfig, so this
    # assumes trusted configuration rather than user input.
    merge_condition = " AND ".join(f"target.{k} = source.{k}" for k in config.merge_keys)
    merge_sql = f"""
        MERGE INTO {target_table} AS target
        USING source
        ON {merge_condition}
        WHEN MATCHED THEN UPDATE SET *
        WHEN NOT MATCHED THEN INSERT *
    """
    spark.sql(merge_sql)

## 📌 Main Ingestion Function

In [None]:
def main():
    """Stream CSV files from the configured source into the Delta table.

    Reads the source path as a streaming CSV source with a header row and a
    fixed all-string schema, drops rows whose merge-key columns are null or
    empty, appends the remaining rows to
    ``<catalog>.<schema>.<table_name>`` using the configured checkpoint
    location, and blocks until the streaming query terminates.
    """
    schema = StructType([
        StructField("entity_id", StringType(), True),
        StructField("entity_name", StringType(), True),
        StructField("entity_code", StringType(), True),
        StructField("as_of_date", StringType(), True),
    ])
    config = StreamConfig()
    spark = setup_spark()
    df = spark.readStream.option("header", "true").schema(schema).csv(config.source_path)

    # DataFrame.filter accepts a Column or SQL string, not a Python callable,
    # and DataValidation.validate relies on count(), which cannot be executed
    # on a streaming DataFrame. Apply the same "merge key is neither null nor
    # empty" rule as row-level Column predicates instead.
    df_validated = df
    for key in config.merge_keys:
        df_validated = df_validated.filter(
            df_validated[key].isNotNull() & (df_validated[key] != "")
        )

    query = (
        df_validated.writeStream.format("delta")
        .outputMode("append")
        .option("checkpointLocation", config.checkpoint_path)
        .toTable(f"{config.catalog}.{config.schema}.{config.table_name}")
    )
    query.awaitTermination()

In [None]:
# Uncomment to run
# main()