# INTRODUCTION TO STREAMING DATAWAREHOUSE

![alt text](images/delta_dwh.png "Data Warehouse")


In [1]:

# Import SparkSession
import pyspark
from delta import configure_spark_with_delta_pip

builder = pyspark.sql.SparkSession.builder.appName("JAMBA_JUICE") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).enableHiveSupport().getOrCreate()


In [2]:
# To allow automatic schemaInference while reading
spark.conf.set("spark.sql.streaming.schemaInference", True)

# Create the streaming_df to read from input directory
df = spark \
    .readStream \
    .format("json") \
    .load("data/product/")

df.printSchema()

root
 |-- category: string (nullable = true)
 |-- cogs: double (nullable = true)
 |-- contains_caffeine: boolean (nullable = true)
 |-- contains_fruit: boolean (nullable = true)
 |-- contains_nuts: boolean (nullable = true)
 |-- contains_veggies: boolean (nullable = true)
 |-- event_time: string (nullable = true)
 |-- item: string (nullable = true)
 |-- price: double (nullable = true)
 |-- product_id: string (nullable = true)
 |-- size: string (nullable = true)



# Creating Bronze Tables
This function will create empty tables for future use.

The following meta columns are added to the table:
- meta_created
- meta_filename


In [3]:
from pyspark.sql.functions import current_timestamp, input_file_name

def create_bronze_streaming_table(source, target):

    # Generates a source path based on table name, reads all files from that and inserts into bronze schema
    query = (
        spark.readStream
        .format("json")
        .load(source)
        .withColumn("meta_created", current_timestamp())
        .withColumn("meta_filename", input_file_name())
        .writeStream
        .outputMode("append")
        .format("delta")
        .trigger(processingTime='10 seconds')
        .option("checkpointLocation", f"spark-warehouse/_checkpoints/{target}")
        .toTable(target)
    )
    return query

bronze_query1 = create_bronze_streaming_table(source="data/inventory", target="bronze_inventory")
bronze_query2 = create_bronze_streaming_table(source="data/product", target="bronze_product")
bronze_query3 = create_bronze_streaming_table(source="data/sales", target="bronze_sales")
bronze_query4 = create_bronze_streaming_table(source="data/customer", target="bronze_customer")


In [4]:
spark.sql("SELECT COUNT(*) FROM bronze_sales").show()

+--------+
|count(1)|
+--------+
|       5|
+--------+



# Creating Silver Tables
This function will create empty tables for future use.

Depending on the slowly changing dimension (SCD) type, the following columns will be created:

SCD type 1:
- meta_hashdiff
- meta_last_updated
- meta_sequence

SCD type 2:
- meta_hashdiff
- meta_is_current
- meta_valid_from
- meta_valid_to
- meta_sequence


In [5]:
def create_silver_table_schema(
        table_name : str, 
        surrogate_key : str, 
        source_table : str, 
        scd_type : int
    ):

    # Define table name and surrogate key
    query = f"CREATE TABLE IF NOT EXISTS {table_name} ({surrogate_key} string,"
    
    # Get schema of source table
    source_schema = spark.sql(f"describe table {source_table}").collect()
    for row in source_schema:
        query += f" {row['col_name']} {row['data_type']},"

    # Add extra meta columns depending on SCD (slowly changing dimension) type
    if scd_type == 1:
        query += "meta_hashdiff string, meta_last_updated timestamp, meta_sequence int) USING DELTA"
    elif scd_type == 2:
        query += "meta_hashdiff string, meta_is_current boolean, meta_valid_from timestamp, meta_valid_to timestamp, meta_sequence int) USING DELTA"

    # Run and print SQL query
    spark.sql(query)
    print(query)

create_silver_table_schema(table_name="silver_sales_scd1", surrogate_key="transaction_sid", source_table="bronze_sales", scd_type=1)
create_silver_table_schema(table_name="silver_inventory_scd1", surrogate_key="inventory_sid", source_table="bronze_inventory", scd_type=1)
create_silver_table_schema(table_name="silver_product_scd2", surrogate_key="product_sid", source_table="bronze_product", scd_type=2)
create_silver_table_schema(table_name="silver_customer_scd2", surrogate_key="customer_sid", source_table="bronze_customer", scd_type=2)


CREATE TABLE IF NOT EXISTS silver_sales_scd1 (transaction_sid string, customer_id bigint, member_discount double, price double, product_id string, quantity bigint, supplement_price double, total_purchase double, transaction_id string, transaction_time string, meta_created timestamp, meta_filename string,meta_hashdiff string, meta_last_updated timestamp, meta_sequence int) USING DELTA
CREATE TABLE IF NOT EXISTS silver_inventory_scd1 (inventory_sid string, event_time string, existing_level bigint, new_level bigint, product_id string, stock_quantity bigint, meta_created timestamp, meta_filename string,meta_hashdiff string, meta_last_updated timestamp, meta_sequence int) USING DELTA
CREATE TABLE IF NOT EXISTS silver_product_scd2 (product_sid string, category string, cogs double, contains_caffeine boolean, contains_fruit boolean, contains_nuts boolean, contains_veggies boolean, event_time string, item string, price double, product_id string, size string, meta_created timestamp, meta_filenam

In [6]:
spark.sql("describe table silver_sales_scd1").show()

+-----------------+---------+-------+
|         col_name|data_type|comment|
+-----------------+---------+-------+
|  transaction_sid|   string|   NULL|
|      customer_id|   bigint|   NULL|
|  member_discount|   double|   NULL|
|            price|   double|   NULL|
|       product_id|   string|   NULL|
|         quantity|   bigint|   NULL|
| supplement_price|   double|   NULL|
|   total_purchase|   double|   NULL|
|   transaction_id|   string|   NULL|
| transaction_time|   string|   NULL|
|     meta_created|timestamp|   NULL|
|    meta_filename|   string|   NULL|
|    meta_hashdiff|   string|   NULL|
|meta_last_updated|timestamp|   NULL|
|    meta_sequence|      int|   NULL|
+-----------------+---------+-------+



# Silver Tables: Slowly Changing Dimensions (SCD) Type 1 & 2

In [36]:
from pyspark.sql.functions import md5, concat_ws, lit, row_number, column
from pyspark.sql.types import BooleanType, TimestampType, BinaryType
from pyspark.sql.window import Window
from pyspark.sql import DataFrame

def upsert_to_scd1_table(df: DataFrame, id: int):
    
    # Get input parameters from columns
    # TODO: Find a better way to pass input parameters to functions 
    target = df.select("parameter_target").limit(1).collect()[0][0]
    timestamp_key = df.select("parameter_timestamp_key").limit(1).collect()[0][0]
    join_key = df.select("parameter_join_key").limit(1).collect()[0][0]
    df = df.drop("parameter_target")
    df = df.drop("parameter_timestamp_key")
    df = df.drop("parameter_join_key")

    # Calculate hashdiff
    df = df.withColumn("meta_hashdiff", md5(concat_ws("||", *[c for c in df.columns if "meta_" not in c])))

    # Calculate sequence number
    df = df.withColumn("meta_sequence", row_number().over(Window.partitionBy(join_key).orderBy(timestamp_key)))

    # Create view with source data
    df.createOrReplaceTempView("tempView")

    # Get list of sequences
    lst_sequence = sorted([p.meta_sequence for p in df.select('meta_sequence').distinct().collect()])

    # Run SCD1 table
    for seq_num in lst_sequence:
        query = f"""
            MERGE INTO {target} AS t
            USING (
                SELECT *
                FROM tempView
                WHERE meta_sequence = {seq_num}
            ) AS s ON t.{join_key} = s.{join_key}
            WHEN MATCHED AND t.meta_hashdiff <> s.meta_hashdiff 
                THEN UPDATE SET *
            WHEN NOT MATCHED 
                THEN INSERT *
        """
        df.sparkSession.sql(query)


def create_silver_streaming_table(
    source : str, 
    target : str,
    timestamp_key : str,
    join_key: str,
    surrogate_key : str
):
    # Generates a source path based on table name, reads all files from that and inserts into bronze schema
    query = (
        spark.readStream
        .table(source)
        .withColumn(surrogate_key, md5(column(join_key).cast(BinaryType())))
        .withColumn("meta_last_updated", current_timestamp())
        .withColumn("parameter_target", lit(target))
        .withColumn("parameter_timestamp_key", lit(timestamp_key))
        .withColumn("parameter_join_key", lit(join_key))
        .writeStream
        .format("delta")
        .foreachBatch(upsert_to_scd1_table)
        .outputMode("update")
        .start()
    )
    return query

In [37]:
silver_query1 = create_silver_streaming_table (
    source="bronze_sales",
    target="silver_sales_scd1",
    timestamp_key="transaction_time",
    join_key="transaction_id",
    surrogate_key="transaction_sid"
)

silver_query2 = create_silver_streaming_table (
    source="bronze_inventory",
    target="silver_inventory_scd1",
    timestamp_key="event_time",
    join_key="event_time",
    surrogate_key="inventory_sid"
)

In [21]:
spark.sql("select count(*) as sales from silver_sales_scd1").show()

spark.sql("select count(*) as inventory from silver_inventory_scd1").show()

+-----+
|sales|
+-----+
|   13|
+-----+

+---------+
|inventory|
+---------+
|        4|
+---------+



In [46]:
from pyspark.sql.functions import md5, concat_ws, lit, row_number, column
from pyspark.sql.types import BooleanType, TimestampType, BinaryType
from pyspark.sql.window import Window
from pyspark.sql import DataFrame

def upsert_to_scd2_table(df: DataFrame, id: int):
    
    # Get input parameters from columns
    target = df.select("parameter_target").limit(1).collect()[0][0]
    timestamp_key = df.select("parameter_timestamp_key").limit(1).collect()[0][0]
    join_key = df.select("parameter_join_key").limit(1).collect()[0][0]

    # Drop extra columns
    df = df.drop("parameter_target")
    df = df.drop("parameter_timestamp_key")
    df = df.drop("parameter_join_key")

    # Calculate hashdiff
    df = df.withColumn("meta_hashdiff", md5(concat_ws("||", *[c for c in df.columns if "meta_" not in c])))

    # Set default values for meta columns
    df = df.withColumn("meta_valid_from", df[timestamp_key])
    df = df.withColumn("meta_valid_to", lit('9999-12-31').cast(TimestampType()))

    # Calculate sequence number
    df = df.withColumn("meta_sequence", row_number().over(Window.partitionBy(join_key).orderBy(timestamp_key)))

    # Reorder dataframe to have same order as target table (otherwise insert statement might fail)
    df_target = spark.read.table(target).limit(1)
    df = df.select(df_target.columns)

    # Create view with source data
    df.createOrReplaceTempView("tempView")

    # Get list of sequences
    lst_sequence = sorted([p.meta_sequence for p in df.select('meta_sequence').distinct().collect()])

    # Run SCD2 table 
    for seq_num in lst_sequence:
        merge_query = f"""
            MERGE INTO {target} AS t
            USING (
                SELECT * 
                FROM tempView
                WHERE meta_sequence = {seq_num}
            ) AS s ON t.{join_key} = s.{join_key}
            WHEN MATCHED AND t.meta_is_current = true AND t.meta_hashdiff <> s.meta_hashdiff
                THEN UPDATE SET meta_is_current = false, meta_valid_to = s.{timestamp_key}
            WHEN NOT MATCHED 
                THEN INSERT *
        """
        df.sparkSession.sql(merge_query)

        insert_query = f"""
            INSERT INTO {target}
            SELECT s.*
            FROM tempView s
            JOIN {target} t ON t.{join_key} = s.{join_key}
            WHERE s.meta_sequence = {seq_num}
            AND t.meta_hashdiff <> s.meta_hashdiff
        """
        df.sparkSession.sql(insert_query)


def create_silver_scd2_streaming_table(
    source : str, 
    target : str,
    timestamp_key : str,
    join_key: str,
    surrogate_key : str
):

    # TODO: Find a better way to pass arguments to the foreachBatch function!!!

    # Generates a source path based on table name, reads all files from that and inserts into bronze schema
    query = (
        spark.readStream
        .table(source)
        .withColumn(surrogate_key, md5(column(join_key).cast(BinaryType())))
        .withColumn("meta_is_current", lit(1).cast(BooleanType()))
        .withColumn("parameter_target", lit(target))
        .withColumn("parameter_timestamp_key", lit(timestamp_key))
        .withColumn("parameter_join_key", lit(join_key))
        .writeStream
        .format("delta")
        .foreachBatch(upsert_to_scd2_table)
        .outputMode("update")
        .start()
    )
    return query


In [47]:
# Create SCD2 tables
silver_query3 = create_silver_scd2_streaming_table(
    source = "bronze_product",
    target = "silver_product_scd2",
    join_key = "product_id",
    timestamp_key = "event_time",
    surrogate_key = "product_sid"
)

silver_query4 = create_silver_scd2_streaming_table(
    source = "bronze_customer",
    target = "silver_customer_scd2",
    join_key = "customer_id",
    timestamp_key = "event_time",
    surrogate_key = "customer_sid"
)

In [49]:
spark.sql("select count(*) as products from silver_product_scd2").show()
spark.sql("select count(*) as customers from silver_customer_scd2").show()

+--------+
|products|
+--------+
|      27|
+--------+

+---------+
|customers|
+---------+
|      119|
+---------+



In [13]:
spark.sql("select cast('hello' as binary)").show()

+---------------------+
|CAST(hello AS BINARY)|
+---------------------+
|     [68 65 6C 6C 6F]|
+---------------------+



In [14]:
spark.sql("select * from bronze_customer order by customer_id").show(5)

# spark.sql("select * from silver_sales_scd1").show(5)
# spark.sql("select * from silver_inventory_scd1").show(5)
# spark.sql("select * from silver_product_scd2 order by product_sid").show(5)
# spark.sql("select * from silver_customer_scd2 order by customer_sid").show(5)


+--------------------+------------------+------------------+-----------+--------------------+--------------------+---------------+---------------+--------------------+--------------------+
|             address|credit_card_expire|credit_card_number|customer_id|               email|          event_time|      full_name|   phone_number|        meta_created|       meta_filename|
+--------------------+------------------+------------------+-----------+--------------------+--------------------+---------------+---------------+--------------------+--------------------+
|35203 Smith Junct...|             10/24|     4221406063287|          1|heather26@example...|2024-04-17 16:47:...|  William Moore|   455.490.6008|2024-04-17 16:47:...|file:///home/pete...|
|6386 Madison Road...|             10/31|  4749505413855012|          2|gdougherty@exampl...|2024-04-17 16:47:...|Jacqueline Boyd|+1-426-857-6347|2024-04-17 16:48:...|file:///home/pete...|
|6386 Madison Road...|             10/31|  474950541385

# CREATE GOLD TABLES - FACTS AND DIMENSIONS

In [15]:

def create_gold_table_schema(
        table_name : str, 
        surrogate_key : str, 
        source_table : str, 
        dim_table_refs : dict
    ):

    # Define table name and surrogate key
    query = f"CREATE TABLE IF NOT EXISTS {table_name} ({surrogate_key} string"

    # Loop through and add surrogate keys for foreign keys
    for row in dim_table_refs:
        query += f", {row['surrogate_key']} string"

    # Get schema of source table
    source_schema = spark.sql(f"describe table {source_table}").collect()
    for row in source_schema:
        if row['col_name'] != surrogate_key:
            query += f", {row['col_name']} {row['data_type']}"

    query += ") USING DELTA;"

    print(query)
    spark.sql(query)

In [16]:
create_gold_table_schema (
    table_name="gold_fact_sales",
    source_table="silver_sales_scd1",
    surrogate_key="transaction_sid",
    dim_table_refs=[{"table_name": "silver_product_scd2", "join_key": "product_id", "surrogate_key": "product_sid"}]
)

create_gold_table_schema (
    table_name="gold_fact_inventory",
    source_table="silver_inventory_scd1",
    surrogate_key="inventory_sid",
    dim_table_refs=[{"table_name": "silver_product_scd2", "join_key": "product_id", "surrogate_key": "product_sid"}]
)

CREATE TABLE IF NOT EXISTS gold_fact_sales (transaction_sid string, product_sid string, customer_id bigint, member_discount double, price double, product_id string, quantity bigint, supplement_price double, total_purchase double, transaction_id string, transaction_time string, meta_created timestamp, meta_filename string, meta_hashdiff string, meta_last_updated timestamp, meta_sequence int) USING DELTA;
CREATE TABLE IF NOT EXISTS gold_fact_inventory (inventory_sid string, product_sid string, event_time string, existing_level bigint, new_level bigint, product_id string, stock_quantity bigint, meta_created timestamp, meta_filename string, meta_hashdiff string, meta_last_updated timestamp, meta_sequence int) USING DELTA;


In [17]:
def generate_dim_table_references(source, target, timestamp_key, dim_table_refs, delta_load_column, print_output=True):
    
    query_first = "SELECT s.*"
    query_last = f"\nFROM {source} s"

    for ref in dim_table_refs:
        
        # Construct first part of query: Selects
        query_first += f""", {ref["table_name"]}.{ref["surrogate_key"]} """

        # Construct last part of query: Joins
        query_last += f"""\nLEFT JOIN {ref["table_name"]} ON {ref["table_name"]}.{ref["join_key"]} = s.{ref["join_key"]}
        AND s.{timestamp_key} BETWEEN {ref["table_name"]}.meta_valid_from AND {ref["table_name"]}.meta_valid_to"""

    # Add delta load logic if the target table already exists
    if delta_load_column:
        query_last += f"\n WHERE s.{delta_load_column} > (SELECT COALESCE(MAX({delta_load_column}), '1970-01-01') FROM {target})"
    
    # Print output
    if print_output:
        print(query_first + query_last)

    return query_first + query_last

In [18]:
def create_gold_fact_table(
    source : str, 
    target : str,
    surrogate_key : str,
    timestamp_key : str,
    dim_table_refs : dict,
    delta_load_column: str
):

    # Generate and run SQL query
    df = spark.sql(generate_dim_table_references(source=source,
                                                 target=target,
                                                 timestamp_key=timestamp_key, 
                                                 dim_table_refs=dim_table_refs, 
                                                 delta_load_column=delta_load_column))

    # Create an empty Delta table with the same schema
    df.createOrReplaceTempView("tempView")

    # Merge into target table 
    merge_query = f"""
        MERGE INTO {target} AS t
        USING tempView AS s
            ON t.{surrogate_key} = s.{surrogate_key}
        WHEN MATCHED AND t.meta_hashdiff <> s.meta_hashdiff 
            THEN UPDATE SET *
        WHEN NOT MATCHED 
            THEN INSERT *
    """
    spark.sql(merge_query).show()
    

In [19]:
create_gold_fact_table (
    source="silver_sales_scd1",
    target="gold_fact_sales",
    surrogate_key="transaction_sid",
    timestamp_key="transaction_time",
    dim_table_refs=[
        {"table_name": "silver_product_scd2", "join_key": "product_id", "surrogate_key": "product_sid"},
        {"table_name": "silver_customer_scd2", "join_key": "customer_id", "surrogate_key": "customer_sid"},
    ],
    delta_load_column="transaction_time"
)

create_gold_fact_table (
    source="silver_inventory_scd1",
    target="gold_fact_inventory",
    surrogate_key="inventory_sid",
    timestamp_key="event_time",
    dim_table_refs=[{"table_name": "silver_product_scd2", "join_key": "product_id", "surrogate_key": "product_sid"}],
    delta_load_column="event_time"
)


SELECT s.*, silver_product_scd2.product_sid , silver_customer_scd2.customer_sid 
FROM silver_sales_scd1 s
LEFT JOIN silver_product_scd2 ON silver_product_scd2.product_id = s.product_id
        AND s.transaction_time BETWEEN silver_product_scd2.meta_valid_from AND silver_product_scd2.meta_valid_to
LEFT JOIN silver_customer_scd2 ON silver_customer_scd2.customer_id = s.customer_id
        AND s.transaction_time BETWEEN silver_customer_scd2.meta_valid_from AND silver_customer_scd2.meta_valid_to
 WHERE s.transaction_time > (SELECT COALESCE(MAX(transaction_time), '1970-01-01') FROM gold_fact_sales)
+-----------------+----------------+----------------+-----------------+
|num_affected_rows|num_updated_rows|num_deleted_rows|num_inserted_rows|
+-----------------+----------------+----------------+-----------------+
|                7|               0|               0|                7|
+-----------------+----------------+----------------+-----------------+

SELECT s.*, silver_product_scd2.product

In [20]:
# spark.sql("select * from gold_fact_sales").show(5)
spark.sql("select * from gold_fact_inventory").show(5)

+--------------------+--------------------+--------------------+--------------+---------+----------+--------------+--------------------+--------------------+--------------------+--------------------+-------------+
|       inventory_sid|         product_sid|          event_time|existing_level|new_level|product_id|stock_quantity|        meta_created|       meta_filename|       meta_hashdiff|   meta_last_updated|meta_sequence|
+--------------------+--------------------+--------------------+--------------+---------+----------+--------------+--------------------+--------------------+--------------------+--------------------+-------------+
|6e5cf081b2e954385...|b19c2e29b9abe277f...|2024-04-17 16:47:...|            48|       58|      SF07|            10|2024-04-17 16:47:...|file:///home/pete...|caa1c16da83e5b267...|2024-04-17 16:47:...|            1|
|1b4639fe3e1668279...|8d73dd95531fed18d...|2024-04-17 16:47:...|            33|       43|      SC02|            10|2024-04-17 16:48:...|file:///