# INTRODUCTION

In [1]:

# Import SparkSession
import pyspark
from delta import configure_spark_with_delta_pip

builder = pyspark.sql.SparkSession.builder.appName("STREAMING_DWH") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [2]:
# To allow automatic schemaInference while reading
spark.conf.set("spark.sql.streaming.schemaInference", True)

# Create the streaming_df to read from input directory
df = spark \
    .readStream \
    .format("json") \
    .load("data/product/")

df.printSchema()

root
 |-- category: string (nullable = true)
 |-- cogs: double (nullable = true)
 |-- contains_caffeine: boolean (nullable = true)
 |-- contains_fruit: boolean (nullable = true)
 |-- contains_nuts: boolean (nullable = true)
 |-- contains_veggies: boolean (nullable = true)
 |-- event_time: string (nullable = true)
 |-- item: string (nullable = true)
 |-- price: double (nullable = true)
 |-- product_id: string (nullable = true)
 |-- size: string (nullable = true)



In [3]:
from pyspark.sql.functions import current_timestamp, input_file_name

def create_bronze_streaming_table(source, target):
    # Generates a source path based on table name, reads all files from that and inserts into bronze schema

    query = (
        spark.readStream
        .format("json")
        .load(source)
        .withColumn("meta_ingestion_ts", current_timestamp())
        .withColumn("meta_filename", input_file_name())
        .writeStream
        .outputMode("append")
        .format("delta")
        .option("checkpointLocation", f"spark-warehouse/_checkpoints/{target}")
        .toTable(target)
    )
    return query

query1 = create_bronze_streaming_table(source="data/inventory", target="bronze_inventory")
query2 = create_bronze_streaming_table(source="data/product", target="bronze_product")
query3 = create_bronze_streaming_table(source="data/purchase", target="bronze_purchase")

# Use the code 
# spark.streams.awaitAnyTermination()


In [4]:
spark.sql("SELECT * FROM bronze_product").show(10)

+--------------------+----+-----------------+--------------+-------------+----------------+--------------------+------------------+-----+----------+------+--------------------+--------------------+
|            category|cogs|contains_caffeine|contains_fruit|contains_nuts|contains_veggies|          event_time|              item|price|product_id|  size|   meta_ingestion_ts|       meta_filename|
+--------------------+----+-----------------+--------------+-------------+----------------+--------------------+------------------+-----+----------+------+--------------------+--------------------+
| Indulgent Smoothies| 2.2|            false|         false|         true|           false|2024-02-17 19:27:...| Peanut Butter Cup| 5.49|      IS02|24 oz.|2024-02-20 20:14:...|file:///home/pete...|
|Superfoods Smoothies| 2.1|            false|          true|        false|           false|2024-02-17 19:32:...|  Acai Berry Boost| 5.99|      SF03|24 oz.|2024-02-20 20:14:...|file:///home/pete...|
| Indulgen

# SILVER TABLES: SLOWLY CHANGING DIMENSIONS (SCD) - TYPE 1 & 2

In [9]:
from pyspark.sql.functions import md5, concat_ws, lit, row_number, monotonically_increasing_id
from pyspark.sql.types import BooleanType, TimestampType
from pyspark.sql.window import Window
from utils import reorder_columns_in_dataframe

def create_silver_scd1_table(
    source : str, 
    target : str,
    timestamp_key : str,
    surrogate_key : str
):

    # Load data and calculate hashdiff string based on all columns that doesn't contain "meta_" in the name
    df = spark.sql(f"select * from {source} order by {timestamp_key}")
    df = df.withColumn("meta_hashdiff", md5(concat_ws("||", *[c for c in df.columns if "meta_" not in c])))

    # Set default values for meta_last_updated
    df = df.withColumn("meta_last_updated", current_timestamp())

    # Generate surrogate key
    df = df.withColumn(surrogate_key, monotonically_increasing_id())

    # Reorder columns
    df = reorder_columns_in_dataframe(df=df, 
                                      columns_to_front=[surrogate_key],
                                      columns_to_back=[c for c in df.columns if "meta_" in c],
                                      columns_to_delete=["meta_filename"])
    
    # Create an empty Delta table with the same schema
    tmp_view_name = "temporaryView"
    df.createOrReplaceTempView(tmp_view_name)
    spark.sql(f"CREATE TABLE IF NOT EXISTS {target} LIKE {tmp_view_name} USING DELTA")

    # Merge into target table 
    merge_query = f"""
        MERGE INTO {target} AS target
        USING {tmp_view_name} AS source ON target.{surrogate_key} = source.{surrogate_key}
        WHEN MATCHED AND target.meta_hashdiff <> source.meta_hashdiff THEN UPDATE SET *
        WHEN NOT MATCHED THEN INSERT *
    """
    spark.sql(merge_query).show()


In [10]:

create_silver_scd1_table (
    source="bronze_purchase",
    target="silver_purchase_scd1",
    surrogate_key="transaction_sid",
)

create_silver_scd1_table (
    source="bronze_inventory",
    target="silver_inventory_scd1",
    surrogate_key="inventory_sid",
)

+-----------------+----------------+----------------+-----------------+
|num_affected_rows|num_updated_rows|num_deleted_rows|num_inserted_rows|
+-----------------+----------------+----------------+-----------------+
|              139|               0|               0|              139|
+-----------------+----------------+----------------+-----------------+

+-----------------+----------------+----------------+-----------------+
|num_affected_rows|num_updated_rows|num_deleted_rows|num_inserted_rows|
+-----------------+----------------+----------------+-----------------+
|               35|               0|               0|               35|
+-----------------+----------------+----------------+-----------------+



In [19]:
def silver_scd2_table(source, target, merge_key, timestamp_key, surrogate_key):

    # Load data and calculate hashdiff string based on all columns that doesn't contain "meta_" in the name
    df = spark.sql(f"select * from {source}")
    df = df.withColumn("meta_hashdiff", md5(concat_ws("||", *[c for c in df.columns if "meta_" not in c])))

    # Set default values for meta columns
    df = df.withColumn("meta_is_current", lit(1).cast(BooleanType()))
    df = df.withColumn("meta_valid_from", df[timestamp_key])
    df = df.withColumn("meta_valid_to", lit('9999-12-31').cast(TimestampType()))

    # Calculate surrogate key
    df = df.withColumn(surrogate_key, monotonically_increasing_id())

    # Calculate sequence numbers if source data contain multiple rows for each merge_key
    window_spec = Window.partitionBy(merge_key).orderBy(timestamp_key)
    df = df.withColumn("meta_sequence", row_number().over(window_spec))

    # Reorder columns in dataframe
    df = reorder_columns_in_dataframe(
        df=df, 
        columns_to_front=[surrogate_key, merge_key],
        columns_to_back=[c for c in df.columns if "meta_" in c],
        columns_to_delete=["meta_filename"]
    )

    # Create an empty Delta table with the same schema
    tmp_view_name = "temporaryView"
    df.createOrReplaceTempView(tmp_view_name)
    spark.sql(f"CREATE TABLE IF NOT EXISTS {target} LIKE {tmp_view_name} USING DELTA")

    # Get list of sequences
    lst_sequence = sorted([p.meta_sequence for p in df.select('meta_sequence').distinct().collect()])

    # Run SCD2 table 
    for seq_num in lst_sequence:
        merge_query = f"""
            MERGE INTO {target} AS target
            USING (
                SELECT * FROM {tmp_view_name}
                WHERE meta_sequence = {seq_num}
            ) AS source ON target.{merge_key} = source.{merge_key}
            WHEN MATCHED AND target.meta_is_current = true AND target.meta_hashdiff <> source.meta_hashdiff
                THEN UPDATE SET meta_is_current = false, meta_valid_to = source.{timestamp_key}
            WHEN NOT MATCHED 
                THEN INSERT *
        """
        spark.sql(merge_query).show()

        insert_query = f"""
            INSERT INTO {target}
            SELECT * FROM 
            (
                SELECT source.* 
                FROM {tmp_view_name} source
                JOIN {target} target ON target.{merge_key} = source.{merge_key}
                WHERE source.meta_sequence = {seq_num}
                AND target.meta_hashdiff <> source.meta_hashdiff 
            )
        """
        spark.sql(insert_query)

In [20]:
# Create SCD2 tables
silver_scd2_table(
    source = "bronze_product",
    target = "silver_product_scd2",
    merge_key = "product_id",
    timestamp_key = "event_time",
    surrogate_key = "product_sid"
)

+-----------------+----------------+----------------+-----------------+
|num_affected_rows|num_updated_rows|num_deleted_rows|num_inserted_rows|
+-----------------+----------------+----------------+-----------------+
|               54|              54|               0|                0|
+-----------------+----------------+----------------+-----------------+



AnalysisException: [DATATYPE_MISMATCH.CAST_WITH_CONF_SUGGESTION] Cannot resolve "meta_is_current" due to data type mismatch: cannot cast "BOOLEAN" to "TIMESTAMP" with ANSI mode on.
If you have to cast "BOOLEAN" to "TIMESTAMP", you can set "spark.sql.storeAssignmentPolicy" as 'LEGACY'.; line 2 pos 12;
'AppendData RelationV2[product_sid#24236L, product_id#24237, category#24238, cogs#24239, contains_caffeine#24240, contains_fruit#24241, contains_nuts#24242, contains_veggies#24243, event_time#24244, item#24245, meta_hashdiff#24246, meta_ingestion_ts#24247, meta_is_current#24248, meta_sequence#24249, meta_valid_from#24250, meta_valid_to#24251, price#24252, size#24253] spark_catalog.default.silver_product_scd2 spark_catalog.default.silver_product_scd2, false
+- 'Project [product_sid#22203L AS product_sid#24255L, product_id#22119 AS product_id#24256, category#22110 AS category#24257, cogs#22111 AS cogs#24258, contains_caffeine#22112 AS contains_caffeine#24259, contains_fruit#22113 AS contains_fruit#24260, contains_nuts#22114 AS contains_nuts#24261, contains_veggies#22115 AS contains_veggies#24262, event_time#22116 AS event_time#24263, item#22117 AS item#24264, cast(meta_sequence#22223 as string) AS meta_hashdiff#24265, cast(price#22118 as timestamp) AS meta_ingestion_ts#24266, cast(size#22120 as boolean) AS meta_is_current#24267, cast(meta_hashdiff#22137 as int) AS meta_sequence#24268, cast(meta_ingestion_ts#22121 as string) AS meta_valid_from#24269, cast(meta_is_current#22152 as timestamp) AS meta_valid_to#24270, cast(meta_valid_from#22168 as double) AS price#24271, cast(meta_valid_to#22185 as string) AS size#24272]
   +- Project [product_sid#22203L, product_id#22119, category#22110, cogs#22111, contains_caffeine#22112, contains_fruit#22113, contains_nuts#22114, contains_veggies#22115, event_time#22116, item#22117, meta_sequence#22223, price#22118, size#22120, meta_hashdiff#22137, meta_ingestion_ts#22121, meta_is_current#22152, meta_valid_from#22168, meta_valid_to#22185]
      +- SubqueryAlias __auto_generated_subquery_name
         +- Project [product_sid#22203L, product_id#22119, category#22110, cogs#22111, contains_caffeine#22112, contains_fruit#22113, contains_nuts#22114, contains_veggies#22115, event_time#22116, item#22117, meta_sequence#22223, price#22118, size#22120, meta_hashdiff#22137, meta_ingestion_ts#22121, meta_is_current#22152, meta_valid_from#22168, meta_valid_to#22185]
            +- Filter ((meta_sequence#22223 = 1) AND NOT (meta_hashdiff#24228 = meta_hashdiff#22137))
               +- Join Inner, (product_id#24219 = product_id#22119)
                  :- SubqueryAlias source
                  :  +- SubqueryAlias temporaryview
                  :     +- View (`temporaryView`, [product_sid#22203L,product_id#22119,category#22110,cogs#22111,contains_caffeine#22112,contains_fruit#22113,contains_nuts#22114,contains_veggies#22115,event_time#22116,item#22117,meta_sequence#22223,price#22118,size#22120,meta_hashdiff#22137,meta_ingestion_ts#22121,meta_is_current#22152,meta_valid_from#22168,meta_valid_to#22185])
                  :        +- Project [product_sid#22203L, product_id#22119, category#22110, cogs#22111, contains_caffeine#22112, contains_fruit#22113, contains_nuts#22114, contains_veggies#22115, event_time#22116, item#22117, meta_sequence#22223, price#22118, size#22120, meta_hashdiff#22137, meta_ingestion_ts#22121, meta_is_current#22152, meta_valid_from#22168, meta_valid_to#22185]
                  :           +- Project [category#22110, cogs#22111, contains_caffeine#22112, contains_fruit#22113, contains_nuts#22114, contains_veggies#22115, event_time#22116, item#22117, price#22118, product_id#22119, size#22120, meta_ingestion_ts#22121, meta_filename#22122, meta_hashdiff#22137, meta_is_current#22152, meta_valid_from#22168, meta_valid_to#22185, product_sid#22203L, meta_sequence#22223]
                  :              +- Project [category#22110, cogs#22111, contains_caffeine#22112, contains_fruit#22113, contains_nuts#22114, contains_veggies#22115, event_time#22116, item#22117, price#22118, product_id#22119, size#22120, meta_ingestion_ts#22121, meta_filename#22122, meta_hashdiff#22137, meta_is_current#22152, meta_valid_from#22168, meta_valid_to#22185, product_sid#22203L, meta_sequence#22223, meta_sequence#22223]
                  :                 +- Window [row_number() windowspecdefinition(product_id#22119, event_time#22116 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS meta_sequence#22223], [product_id#22119], [event_time#22116 ASC NULLS FIRST]
                  :                    +- Project [category#22110, cogs#22111, contains_caffeine#22112, contains_fruit#22113, contains_nuts#22114, contains_veggies#22115, event_time#22116, item#22117, price#22118, product_id#22119, size#22120, meta_ingestion_ts#22121, meta_filename#22122, meta_hashdiff#22137, meta_is_current#22152, meta_valid_from#22168, meta_valid_to#22185, product_sid#22203L]
                  :                       +- Project [category#22110, cogs#22111, contains_caffeine#22112, contains_fruit#22113, contains_nuts#22114, contains_veggies#22115, event_time#22116, item#22117, price#22118, product_id#22119, size#22120, meta_ingestion_ts#22121, meta_filename#22122, meta_hashdiff#22137, meta_is_current#22152, meta_valid_from#22168, meta_valid_to#22185, monotonically_increasing_id() AS product_sid#22203L]
                  :                          +- Project [category#22110, cogs#22111, contains_caffeine#22112, contains_fruit#22113, contains_nuts#22114, contains_veggies#22115, event_time#22116, item#22117, price#22118, product_id#22119, size#22120, meta_ingestion_ts#22121, meta_filename#22122, meta_hashdiff#22137, meta_is_current#22152, meta_valid_from#22168, cast(9999-12-31 as timestamp) AS meta_valid_to#22185]
                  :                             +- Project [category#22110, cogs#22111, contains_caffeine#22112, contains_fruit#22113, contains_nuts#22114, contains_veggies#22115, event_time#22116, item#22117, price#22118, product_id#22119, size#22120, meta_ingestion_ts#22121, meta_filename#22122, meta_hashdiff#22137, meta_is_current#22152, event_time#22116 AS meta_valid_from#22168]
                  :                                +- Project [category#22110, cogs#22111, contains_caffeine#22112, contains_fruit#22113, contains_nuts#22114, contains_veggies#22115, event_time#22116, item#22117, price#22118, product_id#22119, size#22120, meta_ingestion_ts#22121, meta_filename#22122, meta_hashdiff#22137, cast(1 as boolean) AS meta_is_current#22152]
                  :                                   +- Project [category#22110, cogs#22111, contains_caffeine#22112, contains_fruit#22113, contains_nuts#22114, contains_veggies#22115, event_time#22116, item#22117, price#22118, product_id#22119, size#22120, meta_ingestion_ts#22121, meta_filename#22122, md5(cast(concat_ws(||, category#22110, cast(cogs#22111 as string), cast(contains_caffeine#22112 as string), cast(contains_fruit#22113 as string), cast(contains_nuts#22114 as string), cast(contains_veggies#22115 as string), event_time#22116, item#22117, cast(price#22118 as string), product_id#22119, size#22120) as binary)) AS meta_hashdiff#22137]
                  :                                      +- Project [category#22110, cogs#22111, contains_caffeine#22112, contains_fruit#22113, contains_nuts#22114, contains_veggies#22115, event_time#22116, item#22117, price#22118, product_id#22119, size#22120, meta_ingestion_ts#22121, meta_filename#22122]
                  :                                         +- SubqueryAlias spark_catalog.default.bronze_product
                  :                                            +- Relation spark_catalog.default.bronze_product[category#22110,cogs#22111,contains_caffeine#22112,contains_fruit#22113,contains_nuts#22114,contains_veggies#22115,event_time#22116,item#22117,price#22118,product_id#22119,size#22120,meta_ingestion_ts#22121,meta_filename#22122] parquet
                  +- SubqueryAlias target
                     +- SubqueryAlias spark_catalog.default.silver_product_scd2
                        +- Relation spark_catalog.default.silver_product_scd2[product_sid#24218L,product_id#24219,category#24220,cogs#24221,contains_caffeine#24222,contains_fruit#24223,contains_nuts#24224,contains_veggies#24225,event_time#24226,item#24227,meta_hashdiff#24228,meta_ingestion_ts#24229,meta_is_current#24230,meta_sequence#24231,meta_valid_from#24232,meta_valid_to#24233,price#24234,size#24235] parquet


In [18]:
spark.sql("select * from silver_purchase_scd1").show(5)
spark.sql("select * from silver_inventory_scd1").show(5)
spark.sql("select * from silver_product_scd2 order by product_sid").show(5)

+---------------+---------------+---------+---------------+-----+----------+--------+----------------+--------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|transaction_sid|add_supplements|is_member|member_discount|price|product_id|quantity|supplement_price|total_purchase|     transaction_id|    transaction_time|       meta_hashdiff|   meta_ingestion_ts|   meta_last_updated|
+---------------+---------------+---------+---------------+-----+----------+--------+----------------+--------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|              0|           true|    false|            0.0| 4.99|      CS07|       2|            1.99|         13.96|1830085331582317580|2024-02-17 19:27:...|61f75a7278ee6b97a...|2024-02-20 20:14:...|2024-02-20 20:17:...|
|              1|           true|    false|            0.0| 5.99|      SF07|       2|            1.99|         1

In [None]:

def create_gold_dimension_table():


In [33]:
from utils import generate_dim_table_references

def create_gold_fact_table(
    source : str, 
    target : str,
    surrogate_key : str,
    timestamp_key : str,
    dim_table_refs : dict
):

    # Generate and run SQL query
    df = spark.sql(generate_dim_table_references(source, timestamp_key, dim_table_refs))

    # Reorder columns in dataframe
    df = reorder_columns_in_dataframe(
        df=df, 
        columns_to_front=[surrogate_key] + [r["surrogate_key"] for r in dim_table_refs],
        columns_to_back=[c for c in df.columns if "meta_" in c]
    )

    # Create an empty Delta table with the same schema
    tmp_view_name = "temporaryView"
    df.createOrReplaceTempView(tmp_view_name)
    spark.sql(f"CREATE TABLE IF NOT EXISTS {target} LIKE {tmp_view_name} USING DELTA")

    # Merge into target table 
    merge_query = f"""
        MERGE INTO {target} AS target
        USING {tmp_view_name} AS source ON target.{surrogate_key} = source.{surrogate_key}
        WHEN MATCHED AND target.meta_hashdiff <> source.meta_hashdiff THEN UPDATE SET *
        WHEN NOT MATCHED THEN INSERT *
    """

    spark.sql(merge_query).show()
    

In [34]:
create_gold_fact_table (
    source="silver_purchase_scd1",
    target="gold_fact_purchase",
    surrogate_key="transaction_sid",
    timestamp_key="transaction_time",
    dim_table_refs=[{"table_name": "silver_product_scd2", "merge_key": "product_id", "surrogate_key": "product_sid"}]
)

create_gold_fact_table (
    source="silver_inventory_scd1",
    target="gold_fact_inventory",
    surrogate_key="inventory_sid",
    timestamp_key="event_time",
    dim_table_refs=[{"table_name": "silver_product_scd2", "merge_key": "product_id", "surrogate_key": "product_sid"}]
)


SELECT src.*, silver_product_scd2.product_sid 
FROM silver_purchase_scd1 src
LEFT JOIN silver_product_scd2 ON silver_product_scd2.product_id = src.product_id
        AND src.transaction_time BETWEEN silver_product_scd2.meta_valid_from AND silver_product_scd2.meta_valid_to
+-----------------+----------------+----------------+-----------------+
|num_affected_rows|num_updated_rows|num_deleted_rows|num_inserted_rows|
+-----------------+----------------+----------------+-----------------+
|                0|               0|               0|                0|
+-----------------+----------------+----------------+-----------------+

SELECT src.*, silver_product_scd2.product_sid 
FROM silver_inventory_scd1 src
LEFT JOIN silver_product_scd2 ON silver_product_scd2.product_id = src.product_id
        AND src.event_time BETWEEN silver_product_scd2.meta_valid_from AND silver_product_scd2.meta_valid_to
+-----------------+----------------+----------------+-----------------+
|num_affected_rows|num_upd

In [35]:
spark.sql("select * from gold_fact_purchase").show(5)
spark.sql("select * from gold_fact_inventory").show(5)

+---------------+---------------+---------+---------------+-----+----------+--------+----------------+--------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------+
|transaction_sid|add_supplements|is_member|member_discount|price|product_id|quantity|supplement_price|total_purchase|     transaction_id|    transaction_time|       meta_hashdiff|   meta_ingestion_ts|   meta_last_updated|product_sid|
+---------------+---------------+---------+---------------+-----+----------+--------+----------------+--------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------+
|              0|           true|    false|            0.0| 4.99|      CS07|       2|            1.99|         13.96|1830085331582317580|2024-02-17 19:27:...|61f75a7278ee6b97a...|2024-02-20 20:14:...|2024-02-20 20:17:...|34359738371|
|              1|           true|    false|            0.0| 5.99