# INTRODUCTION TO STREAMING DATAWAREHOUSE

![alt text](images/delta_dwh.png "Data Warehouse")


In [1]:

# Import SparkSession
import pyspark
from delta import configure_spark_with_delta_pip

builder = pyspark.sql.SparkSession.builder.appName("JAMBA_JUICE") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).enableHiveSupport().getOrCreate()


In [2]:
# To allow automatic schemaInference while reading
spark.conf.set("spark.sql.streaming.schemaInference", True)

# Create the streaming_df to read from input directory
df = spark \
    .readStream \
    .format("json") \
    .load("data/product/")

df.printSchema()

root
 |-- category: string (nullable = true)
 |-- cogs: double (nullable = true)
 |-- contains_caffeine: boolean (nullable = true)
 |-- contains_fruit: boolean (nullable = true)
 |-- contains_nuts: boolean (nullable = true)
 |-- contains_veggies: boolean (nullable = true)
 |-- event_time: string (nullable = true)
 |-- item: string (nullable = true)
 |-- price: double (nullable = true)
 |-- product_id: string (nullable = true)
 |-- size: string (nullable = true)



# Creating Bronze Tables
This function will create empty tables for future use.

The following meta columns are added to the table:
- meta_created
- meta_filename


In [3]:
from pyspark.sql.functions import current_timestamp, input_file_name

def create_bronze_streaming_table(source, target):

    # Generates a source path based on table name, reads all files from that and inserts into bronze schema
    query = (
        spark.readStream
        .format("json")
        .load(source)
        .withColumn("meta_created", current_timestamp())
        .withColumn("meta_filename", input_file_name())
        .writeStream
        .outputMode("append")
        .format("delta")
        .option("checkpointLocation", f"spark-warehouse/_checkpoints/{target}")
        .toTable(target)
    )
    return query

query1 = create_bronze_streaming_table(source="data/inventory", target="bronze_inventory")
query2 = create_bronze_streaming_table(source="data/product", target="bronze_product")
query3 = create_bronze_streaming_table(source="data/sales", target="bronze_sales")
query4 = create_bronze_streaming_table(source="data/customer", target="bronze_customer")

# Use the code 
# spark.streams.awaitAnyTermination()


In [9]:
spark.sql("SELECT COUNT(*) FROM bronze_sales").show()

+--------+
|count(1)|
+--------+
|     339|
+--------+



# Creating Silver Tables
This function will create empty tables for future use.

Depending on the slowly changing dimension (SCD) type, the following columns will be created:

SCD type 1:
- meta_hashdiff
- meta_last_updated
- meta_sequence

SCD type 2:
- meta_hashdiff
- meta_is_current
- meta_valid_from
- meta_valid_to
- meta_sequence


In [25]:
def create_silver_table(
        table_name : str, 
        surrogate_key : str, 
        source_table : str, 
        scd_type : int
    ):

    # Define table name and surrogate key
    # query = f"CREATE TABLE IF NOT EXISTS {table_name} ({surrogate_key} string,"
    query = f"CREATE OR REPLACE TABLE {table_name} ({surrogate_key} string,"
    
    # Get schema of source table
    source_schema = spark.sql(f"describe table {source_table}").collect()
    for row in source_schema:
        query += f" {row['col_name']} {row['data_type']},"

    # Add extra meta columns depending on SCD (slowly changing dimension) type
    if scd_type == 1:
        query += "meta_hashdiff string, meta_last_updated timestamp, meta_sequence int) USING DELTA"
    elif scd_type == 2:
        query += "meta_hashdiff string, meta_is_current boolean, meta_valid_from timestamp, meta_valid_to timestamp, meta_sequence int) USING DELTA"

    print(query)
    spark.sql(query)

create_silver_table(table_name="silver_sales_scd1", surrogate_key="transaction_sid", source_table="bronze_sales", scd_type=1)
create_silver_table(table_name="silver_inventory_scd1", surrogate_key="inventory_sid", source_table="bronze_inventory", scd_type=1)
create_silver_table(table_name="silver_product_scd2", surrogate_key="product_sid", source_table="bronze_product", scd_type=2)
create_silver_table(table_name="silver_customer_scd2", surrogate_key="customer_sid", source_table="bronze_customer", scd_type=2)


CREATE OR REPLACE TABLE silver_sales_scd1 (transaction_sid string, customer_id bigint, member_discount double, price double, product_id string, quantity bigint, supplement_price double, total_purchase double, transaction_id string, transaction_time string, meta_created timestamp, meta_filename string,meta_hashdiff string, meta_last_updated timestamp, meta_sequence int) USING DELTA
CREATE OR REPLACE TABLE silver_inventory_scd1 (inventory_sid string, event_time string, existing_level bigint, new_level bigint, product_id string, stock_quantity bigint, meta_created timestamp, meta_filename string,meta_hashdiff string, meta_last_updated timestamp, meta_sequence int) USING DELTA
CREATE OR REPLACE TABLE silver_product_scd2 (product_sid string, category string, cogs double, contains_caffeine boolean, contains_fruit boolean, contains_nuts boolean, contains_veggies boolean, event_time string, item string, price double, product_id string, size string, meta_created timestamp, meta_filename string,

In [26]:
spark.sql("describe table silver_product_scd2").show()

+-----------------+---------+-------+
|         col_name|data_type|comment|
+-----------------+---------+-------+
|      product_sid|   string|   NULL|
|         category|   string|   NULL|
|             cogs|   double|   NULL|
|contains_caffeine|  boolean|   NULL|
|   contains_fruit|  boolean|   NULL|
|    contains_nuts|  boolean|   NULL|
| contains_veggies|  boolean|   NULL|
|       event_time|   string|   NULL|
|             item|   string|   NULL|
|            price|   double|   NULL|
|       product_id|   string|   NULL|
|             size|   string|   NULL|
|     meta_created|timestamp|   NULL|
|    meta_filename|   string|   NULL|
|    meta_hashdiff|   string|   NULL|
|  meta_is_current|  boolean|   NULL|
|  meta_valid_from|timestamp|   NULL|
|    meta_valid_to|timestamp|   NULL|
|    meta_sequence|      int|   NULL|
+-----------------+---------+-------+



# Silver Tables: Slowly Changing Dimensions (SCD) Type 1 & 2

In [27]:
from pyspark.sql.functions import md5, concat_ws, lit, row_number
from pyspark.sql.types import BooleanType, TimestampType
from pyspark.sql.window import Window

def create_silver_scd1_table(
    source : str, 
    target : str,
    timestamp_key : str,
    merge_key: str,
    surrogate_key : str,
    delta_load_column: str
):
    
    # Perform delta load
    df = spark.sql(f"""
            SELECT * 
            FROM {source} 
            WHERE {delta_load_column} > (
                SELECT COALESCE(MAX({delta_load_column}), '1970-01-01') FROM {target}
            )
        """)

    # Calculate surrogate key as hash of natural key columns
    df = df.withColumn(surrogate_key, md5(merge_key))

    # Calculate hashdiff string based on all columns that doesn't contain "meta_" in the name
    df = df.withColumn("meta_hashdiff", md5(concat_ws("||", *[c for c in df.columns if "meta_" not in c])))

    # Set default values for meta_last_updated
    df = df.withColumn("meta_last_updated", current_timestamp())

    # Calculate sequence numbers if source data contain multiple rows for each merge_key
    window_spec = Window.partitionBy(merge_key).orderBy(timestamp_key)
    df = df.withColumn("meta_sequence", row_number().over(window_spec))
    
    # Create view with source data
    tmp_view_name = "temporaryView"
    df.createOrReplaceTempView(tmp_view_name)

    # Get list of sequences
    lst_sequence = sorted([p.meta_sequence for p in df.select('meta_sequence').distinct().collect()])

    # Run SCD1 table
    for seq_num in lst_sequence:
        print(f"Inserting into SILVER SCD TYPE 1 TABLE: {target}")
        merge_query = f"""
            MERGE INTO {target} AS t
            USING (
                SELECT *
                FROM {tmp_view_name}
                WHERE meta_sequence = {seq_num}
            ) AS s ON t.{surrogate_key} = s.{surrogate_key}
            WHEN MATCHED AND t.meta_hashdiff <> s.meta_hashdiff 
                THEN UPDATE SET *
            WHEN NOT MATCHED 
                THEN INSERT *
        """
        spark.sql(merge_query).show()


# Run code to generate silver_sales_scd1 and silver_inventory_scd1

In [28]:

create_silver_scd1_table (
    source="bronze_sales",
    target="silver_sales_scd1",
    timestamp_key="transaction_time",
    merge_key="transaction_id",
    surrogate_key="transaction_sid",
    delta_load_column="transaction_time"
)

create_silver_scd1_table (
    source="bronze_inventory",
    target="silver_inventory_scd1",
    timestamp_key="event_time",
    merge_key="event_time",
    surrogate_key="inventory_sid",
    delta_load_column="event_time"
)

Inserting into SILVER SCD TYPE 1 TABLE: silver_sales_scd1
+-----------------+----------------+----------------+-----------------+
|num_affected_rows|num_updated_rows|num_deleted_rows|num_inserted_rows|
+-----------------+----------------+----------------+-----------------+
|             1114|               0|               0|             1114|
+-----------------+----------------+----------------+-----------------+

Inserting into SILVER SCD TYPE 1 TABLE: silver_inventory_scd1
+-----------------+----------------+----------------+-----------------+
|num_affected_rows|num_updated_rows|num_deleted_rows|num_inserted_rows|
+-----------------+----------------+----------------+-----------------+
|              219|               0|               0|              219|
+-----------------+----------------+----------------+-----------------+



In [31]:
def create_silver_scd2_table(
    source: str, 
    target: str, 
    merge_key: str, 
    timestamp_key: str, 
    surrogate_key : str,
    delta_load_column: str
):
    
    # Perform delta load
    df = spark.sql(f"""
            SELECT * 
            FROM {source} 
            WHERE {delta_load_column} > (
                SELECT COALESCE(max({delta_load_column}), '1970-01-01') FROM {target}
            )
        """)

    # Calculate surrogate key as hash of natural key columns
    df = df.withColumn(surrogate_key, md5(merge_key))

    # Calculate hashdiff string based on all columns that doesn't contain "meta_" in the name
    df = df.withColumn("meta_hashdiff", md5(concat_ws("||", *[c for c in df.columns if "meta_" not in c])))

    # Set default values for meta columns
    df = df.withColumn("meta_is_current", lit(1).cast(BooleanType()))
    df = df.withColumn("meta_valid_from", df[timestamp_key])
    df = df.withColumn("meta_valid_to", lit('9999-12-31').cast(TimestampType()))

    # Calculate sequence numbers if source data contain multiple rows for each merge_key
    window_spec = Window.partitionBy(merge_key).orderBy(timestamp_key)
    df = df.withColumn("meta_sequence", row_number().over(window_spec))

    # Create an empty Delta table with the same schema
    tmp_view_name = "temporaryView"
    df.createOrReplaceTempView(tmp_view_name)

    # Get list of sequences
    lst_sequence = sorted([p.meta_sequence for p in df.select('meta_sequence').distinct().collect()])

    # Run SCD2 table 
    for seq_num in lst_sequence:
        print(f"Inserting into SILVER SCD TYPE 2 TABLE: {target}")
        merge_query = f"""
            MERGE INTO {target} AS t
            USING (
                SELECT * 
                FROM {tmp_view_name}
                WHERE meta_sequence = {seq_num}
            ) AS s ON t.{merge_key} = s.{merge_key}
            WHEN MATCHED AND t.meta_is_current = true AND t.meta_hashdiff <> s.meta_hashdiff
                THEN UPDATE SET meta_is_current = false, meta_valid_to = s.{timestamp_key}
            WHEN NOT MATCHED 
                THEN INSERT *
        """
        spark.sql(merge_query).show()

        insert_query = f"""
            INSERT INTO {target}
            SELECT * 
            FROM 
            (
                SELECT s.*
                FROM {tmp_view_name} s
                JOIN {target} t ON t.{merge_key} = s.{merge_key}
                WHERE s.meta_sequence = {seq_num}
                AND t.meta_hashdiff <> s.meta_hashdiff 
            )
        """
        spark.sql(insert_query)

In [39]:
# Create SCD2 tables
# create_silver_scd2_table(
#     source = "bronze_product",
#     target = "silver_product_scd2",
#     merge_key = "product_id",
#     timestamp_key = "event_time",
#     surrogate_key = "product_sid",
#     delta_load_column="event_time"
# )

create_silver_scd2_table(
    source = "bronze_customer",
    target = "silver_customer_scd2",
    merge_key = "customer_id",
    timestamp_key = "event_time",
    surrogate_key = "customer_sid",
    delta_load_column="event_time"
)

AnalysisException: [DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve "md5(customer_id)" due to data type mismatch: Parameter 1 requires the "BINARY" type, however "customer_id" has the type "BIGINT".;
'Project [address#981570, credit_card_expire#981571, credit_card_number#981572, customer_id#981573L, email#981574, event_time#981575, full_name#981576, phone_number#981577, meta_created#981578, meta_filename#981579, md5(customer_id#981573L) AS customer_sid#981610]
+- Project [address#981570, credit_card_expire#981571, credit_card_number#981572, customer_id#981573L, email#981574, event_time#981575, full_name#981576, phone_number#981577, meta_created#981578, meta_filename#981579]
   +- Filter (event_time#981575 > scalar-subquery#981502 [])
      :  +- Aggregate [coalesce(max(event_time#981586), 1970-01-01) AS coalesce(max(event_time), 1970-01-01)#981597]
      :     +- SubqueryAlias spark_catalog.default.silver_customer_scd2
      :        +- Relation spark_catalog.default.silver_customer_scd2[customer_sid#981580,address#981581,credit_card_expire#981582,credit_card_number#981583,customer_id#981584L,email#981585,event_time#981586,full_name#981587,phone_number#981588,meta_created#981589,meta_filename#981590,meta_hashdiff#981591,meta_is_current#981592,meta_valid_from#981593,meta_valid_to#981594,meta_sequence#981595] parquet
      +- SubqueryAlias spark_catalog.default.bronze_customer
         +- Relation spark_catalog.default.bronze_customer[address#981570,credit_card_expire#981571,credit_card_number#981572,customer_id#981573L,email#981574,event_time#981575,full_name#981576,phone_number#981577,meta_created#981578,meta_filename#981579] parquet


In [38]:
spark.sql("select * from bronze_customer order by customer_id").show(5)

# spark.sql("select * from silver_sales_scd1").show(5)
# spark.sql("select * from silver_inventory_scd1").show(5)
# spark.sql("select * from silver_product_scd2 order by product_sid").show(5)
# spark.sql("select * from silver_customer_scd2 order by customer_sid").show(5)


+--------------------+------------------+-------------------+-----------+--------------------+--------------------+-------------+---------------+--------------------+--------------------+
|             address|credit_card_expire| credit_card_number|customer_id|               email|          event_time|    full_name|   phone_number|        meta_created|       meta_filename|
+--------------------+------------------+-------------------+-----------+--------------------+--------------------+-------------+---------------+--------------------+--------------------+
|980 Miller Ford\n...|             05/32|4443642719737473246|          1|larsenrichard@exa...|2024-04-15 19:23:...|Derek Simmons|+1-455-855-3505|2024-04-15 19:26:...|file:///home/pete...|
|980 Miller Ford\n...|             05/32|4443642719737473246|          1|larsenrichard@exa...|2024-04-15 19:33:...|Derek Simmons|+1-455-855-3505|2024-04-15 19:33:...|file:///home/pete...|
|980 Miller Ford\n...|             02/30|   2355423216440553

# CREATE GOLD TABLES - FACTS AND DIMENSIONS

In [40]:

def create_gold_table(
        table_name : str, 
        surrogate_key : str, 
        source_table : str, 
        dim_table_refs : dict
    ):

    # Define table name and surrogate key
    query = f"CREATE TABLE IF NOT EXISTS {table_name} ({surrogate_key} string"

    # Loop through and add surrogate keys for foreign keys
    for row in dim_table_refs:
        query += f", {row['surrogate_key']} string"

    # Get schema of source table
    source_schema = spark.sql(f"describe table {source_table}").collect()
    for row in source_schema:
        if row['col_name'] != surrogate_key:
            query += f", {row['col_name']} {row['data_type']}"

    query += ") USING DELTA;"

    print(query)
    spark.sql(query)
    return


create_gold_table (
    table_name="gold_fact_sales",
    source_table="silver_sales_scd1",
    surrogate_key="transaction_sid",
    dim_table_refs=[{"table_name": "silver_product_scd2", "merge_key": "product_id", "surrogate_key": "product_sid"}]
)

create_gold_table (
    table_name="gold_fact_inventory",
    source_table="silver_inventory_scd1",
    surrogate_key="inventory_sid",
    dim_table_refs=[{"table_name": "silver_product_scd2", "merge_key": "product_id", "surrogate_key": "product_sid"}]
)

CREATE TABLE IF NOT EXISTS gold_fact_sales (transaction_sid string, product_sid string, customer_id bigint, member_discount double, price double, product_id string, quantity bigint, supplement_price double, total_purchase double, transaction_id string, transaction_time string, meta_created timestamp, meta_filename string, meta_hashdiff string, meta_last_updated timestamp, meta_sequence int) USING DELTA;
CREATE TABLE IF NOT EXISTS gold_fact_inventory (inventory_sid string, product_sid string, event_time string, existing_level bigint, new_level bigint, product_id string, stock_quantity bigint, meta_created timestamp, meta_filename string, meta_hashdiff string, meta_last_updated timestamp, meta_sequence int) USING DELTA;


In [42]:
from spark_utils import generate_dim_table_references

def create_gold_fact_table(
    source : str, 
    target : str,
    surrogate_key : str,
    timestamp_key : str,
    dim_table_refs : dict,
    delta_load_column: str
):

    # Generate and run SQL query
    df = spark.sql(generate_dim_table_references(source=source,
                                                 target=target,
                                                 timestamp_key=timestamp_key, 
                                                 dim_table_refs=dim_table_refs, 
                                                 delta_load_column=delta_load_column))

    # Create an empty Delta table with the same schema
    df.createOrReplaceTempView("tempView")

    # Merge into target table 
    merge_query = f"""
        MERGE INTO {target} AS t
        USING tempView AS s
            ON t.{surrogate_key} = s.{surrogate_key}
        WHEN MATCHED AND t.meta_hashdiff <> s.meta_hashdiff 
            THEN UPDATE SET *
        WHEN NOT MATCHED 
            THEN INSERT *
    """
    spark.sql(merge_query).show()
    

In [43]:
create_gold_fact_table (
    source="silver_sales_scd1",
    target="gold_fact_sales",
    surrogate_key="transaction_sid",
    timestamp_key="transaction_time",
    dim_table_refs=[{"table_name": "silver_product_scd2", "merge_key": "product_id", "surrogate_key": "product_sid"}],
    delta_load_column="transaction_time"
)

create_gold_fact_table (
    source="silver_inventory_scd1",
    target="gold_fact_inventory",
    surrogate_key="inventory_sid",
    timestamp_key="event_time",
    dim_table_refs=[{"table_name": "silver_product_scd2", "merge_key": "product_id", "surrogate_key": "product_sid"}],
    delta_load_column="event_time"
)


+-----------------+----------------+----------------+-----------------+
|num_affected_rows|num_updated_rows|num_deleted_rows|num_inserted_rows|
+-----------------+----------------+----------------+-----------------+
|             1114|               0|               0|             1114|
+-----------------+----------------+----------------+-----------------+

+-----------------+----------------+----------------+-----------------+
|num_affected_rows|num_updated_rows|num_deleted_rows|num_inserted_rows|
+-----------------+----------------+----------------+-----------------+
|              219|               0|               0|              219|
+-----------------+----------------+----------------+-----------------+



In [44]:
# spark.sql("select * from gold_fact_sales").show(5)
spark.sql("select * from gold_fact_inventory").show(5)

+--------------------+--------------------+--------------------+--------------+---------+----------+--------------+--------------------+--------------------+--------------------+--------------------+-------------+
|       inventory_sid|         product_sid|          event_time|existing_level|new_level|product_id|stock_quantity|        meta_created|       meta_filename|       meta_hashdiff|   meta_last_updated|meta_sequence|
+--------------------+--------------------+--------------------+--------------+---------+----------+--------------+--------------------+--------------------+--------------------+--------------------+-------------+
|de6c6db2725bd3f91...|749800f100d37411d...|2024-04-15 19:20:...|            32|       42|      SC01|            10|2024-04-15 19:26:...|file:///home/pete...|5469e1ee7d1f8c6c9...|2024-04-15 19:54:...|            1|
|3caf11f62becf2cc5...|e70b3bf706cd880d8...|2024-04-15 19:20:...|            34|       44|      SC03|            10|2024-04-15 19:26:...|file:///

# PUTTING IT ALL TOGETHER

In [45]:
while True:

    # SILVER SCD TYPE 1 & 2 TABLES
    create_silver_scd1_table (
        source="bronze_sales",
        target="silver_sales_scd1",
        timestamp_key="transaction_time",
        merge_key="transaction_id",
        surrogate_key="transaction_sid",
        delta_load_column="transaction_time"
    )
    create_silver_scd1_table (
        source="bronze_inventory",
        target="silver_inventory_scd1",
        timestamp_key="event_time",
        merge_key="event_time",
        surrogate_key="inventory_sid",
        delta_load_column="event_time"
    )
    create_silver_scd2_table(
        source = "bronze_product",
        target = "silver_product_scd2",
        merge_key = "product_id",
        timestamp_key = "event_time",
        surrogate_key = "product_sid",
        delta_load_column="event_time"
    )

    # GOLD FACT AND DIMENSION TABLES
    create_gold_fact_table (
        source="silver_sales_scd1",
        target="gold_fact_sales",
        surrogate_key="transaction_sid",
        timestamp_key="transaction_time",
        dim_table_refs=[{"table_name": "silver_product_scd2", "merge_key": "product_id", "surrogate_key": "product_sid"}],
        delta_load_column="transaction_time"
    )

    create_gold_fact_table (
        source="silver_inventory_scd1",
        target="gold_fact_inventory",
        surrogate_key="inventory_sid",
        timestamp_key="event_time",
        dim_table_refs=[{"table_name": "silver_product_scd2", "merge_key": "product_id", "surrogate_key": "product_sid"}],
        delta_load_column="event_time"
    )

Inserting into SILVER SCD TYPE 1 TABLE: silver_sales_scd1
+-----------------+----------------+----------------+-----------------+
|num_affected_rows|num_updated_rows|num_deleted_rows|num_inserted_rows|
+-----------------+----------------+----------------+-----------------+
|              636|               0|               0|              636|
+-----------------+----------------+----------------+-----------------+

Inserting into SILVER SCD TYPE 1 TABLE: silver_inventory_scd1
+-----------------+----------------+----------------+-----------------+
|num_affected_rows|num_updated_rows|num_deleted_rows|num_inserted_rows|
+-----------------+----------------+----------------+-----------------+
|              129|               0|               0|              129|
+-----------------+----------------+----------------+-----------------+

+-----------------+----------------+----------------+-----------------+
|num_affected_rows|num_updated_rows|num_deleted_rows|num_inserted_rows|
+-------------

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/conda/envs/vscode_pyspark/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/vscode_pyspark/lib/python3.11/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/vscode_pyspark/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
spark.sql("select * from silver_product_scd2 order by product_sid").show(15)