# INTRODUCTION

In [None]:

# Import SparkSession
import pyspark
from delta import configure_spark_with_delta_pip

# Session builder with the Delta Lake SQL extension and catalog registered,
# so that Delta tables can be created and queried through spark.sql().
builder = (
    pyspark.sql.SparkSession.builder
    .appName("STREAMING_DWH")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# configure_spark_with_delta_pip() augments the builder for a pip-installed Delta package
spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [None]:
# Let streaming file sources infer their schema instead of requiring an explicit one
spark.conf.set("spark.sql.streaming.schemaInference", True)

# Streaming reader over the product input directory, picking up one JSON file per trigger
streaming_df = (
    spark.readStream
    .format("json")
    .option("maxFilesPerTrigger", 1)
    .load("data/product/")
)

# Show the inferred schema
streaming_df.printSchema()

In [None]:
# # Define database schemas - Bronze, Silver, Gold
# spark.sql("CREATE SCHEMA IF NOT EXISTS bronze;")
# spark.sql("CREATE SCHEMA IF NOT EXISTS silver;")
# spark.sql("CREATE SCHEMA IF NOT EXISTS gold;")
# spark.sql("USE SCHEMA bronze;")

In [None]:
from pyspark.sql.functions import current_timestamp, input_file_name

def raw_ingestion(schema_name, table_name):
    """Start a streaming ingestion of raw JSON files into a Delta table.

    Files are read from ``data/<table_name>`` one file per trigger, tagged with
    two metadata columns, and appended to the table ``<schema_name>_<table_name>``.

    Args:
        schema_name: Prefix of the target table name (e.g. ``"bronze"``).
        table_name: Name of the source sub-directory and suffix of the target table name.

    Returns:
        The streaming query handle returned by ``toTable``.
    """
    # Source: JSON files under data/<table_name>, throttled to one file per micro-batch
    raw_stream = (
        spark.readStream
        .format("json")
        .option("maxFilesPerTrigger", 1)
        .load(f"data/{table_name}")
    )

    # Lineage columns: ingestion time and originating file
    tagged_stream = raw_stream.withColumn("meta_timestamp", current_timestamp())
    tagged_stream = tagged_stream.withColumn("meta_filename", input_file_name())

    # Sink: append-only Delta table with a checkpoint directory per target table
    delta_writer = (
        tagged_stream.writeStream
        .outputMode("append")
        .format("delta")
        .option("checkpointLocation", f"spark-warehouse/_checkpoints/{schema_name}_{table_name}")
    )
    return delta_writer.toTable(f"{schema_name}_{table_name}")

# Start one bronze ingestion stream per source directory (same order as the names on the left)
query1, query2, query3 = [
    raw_ingestion(schema_name="bronze", table_name=bronze_source)
    for bronze_source in ("inventory", "product", "purchase")
]

# Uncomment the next line to block this cell until any of the streaming queries terminates:
# spark.streams.awaitAnyTermination()


In [None]:
# Preview the first 15 rows ingested into the bronze product table
(
    spark.sql("SELECT * FROM bronze_product")
    .show(15)
)

# SLOWLY CHANGING DIMENSIONS (SCD) - TYPE 2

In [83]:

from pyspark.sql.functions import md5, concat_ws, lit, row_number, monotonically_increasing_id
from pyspark.sql.types import BooleanType, TimestampType
from pyspark.sql.window import Window

def silver_scd2_dim_table(source, target, merge_key, timestamp_key, surrogate_column_name):
    """Load ``source`` into ``target`` as a Type-2 slowly changing dimension (Delta table).

    Every source row is enriched with SCD2 bookkeeping columns
    (``meta_hashdiff``, ``meta_is_current``, ``meta_valid_from``, ``meta_valid_to``,
    ``meta_sequence``) and a surrogate key, then applied to ``target`` one
    version ("sequence") at a time, oldest first:

    1. MERGE closes the current target row of a key when its hash differs from
       the incoming row, and inserts keys the target has never seen.
    2. INSERT adds the incoming row as the new current version for every key
       that was closed in step 1.

    Args:
        source: Name of the table to read from.
        target: Name of the Delta table to create (if missing) and load.
        merge_key: Business-key column used to match source and target rows.
        timestamp_key: Column that orders the versions of a key; it also becomes
            ``meta_valid_from`` and closes the previous version's ``meta_valid_to``.
        surrogate_column_name: Name of the generated surrogate-key column.

    Returns:
        None. The result of each MERGE is printed with ``show()``.
    """
    df = spark.sql(f"select * from {source}")

    # Hash all columns whose name does not contain "meta_" to detect changed rows.
    # NOTE(review): timestamp_key is part of the hash unless its name contains
    # "meta_", so a new timestamp alone counts as a change - confirm this is intended.
    business_columns = [c for c in df.columns if "meta_" not in c]
    df = df.withColumn("meta_hashdiff", md5(concat_ws("||", *business_columns)))
    df = df.withColumn("meta_is_current", lit(1).cast(BooleanType()))
    df = df.withColumn("meta_valid_from", df[timestamp_key])
    df = df.withColumn("meta_valid_to", lit('9999-12-31').cast(TimestampType()))

    # Number the versions of each key chronologically so they can be applied in order
    window_spec = Window.partitionBy(merge_key).orderBy(timestamp_key)
    df = df.withColumn("meta_sequence", row_number().over(window_spec))
    df = df.withColumn(surrogate_column_name, monotonically_increasing_id())

    # Create an empty Delta table with the same schema
    tmp_view_name = "temporaryView"
    df.createOrReplaceTempView(tmp_view_name)
    spark.sql(f"CREATE TABLE IF NOT EXISTS {target} LIKE {tmp_view_name} USING DELTA")

    # Distinct version numbers present in the source, ascending
    lst_sequence = sorted([p.meta_sequence for p in df.select('meta_sequence').distinct().collect()])

    # Apply the versions one sequence at a time
    for seq_num in lst_sequence:
        merge_query = f"""
            MERGE INTO {target} AS target
            USING (
                SELECT * FROM {tmp_view_name}
                WHERE meta_sequence = {seq_num}
            ) AS source ON target.{merge_key} = source.{merge_key}
            WHEN MATCHED AND target.meta_is_current = true AND target.meta_hashdiff <> source.meta_hashdiff
                THEN UPDATE SET meta_is_current = false, meta_valid_to = source.{timestamp_key}
            WHEN NOT MATCHED 
                THEN INSERT *
        """

        # Insert the incoming row only for keys that have no current row left after
        # the MERGE, i.e. keys whose current version was just closed.
        # Joining against every target row of the key (closed history included)
        # would insert one copy per historical row from the third version onward,
        # and would also insert when the hash differs only from an old, closed row.
        # "<=>" is null-safe equality, so NULL keys already inserted by the MERGE
        # are not inserted a second time.
        insert_query = f"""
            INSERT INTO {target}
            SELECT source.*
            FROM {tmp_view_name} source
            LEFT ANTI JOIN {target} target
                ON target.{merge_key} <=> source.{merge_key}
                AND target.meta_is_current = true
            WHERE source.meta_sequence = {seq_num}
        """

        # print(merge_query)
        spark.sql(merge_query).show()
        spark.sql(insert_query)

In [84]:
# Rebuild the product dimension from scratch: drop any previous copy, then
# load it as an SCD2 table from the bronze product data.
spark.sql("DROP TABLE IF EXISTS silver_product_scd2")

silver_scd2_dim_table(
    source="bronze_product",
    target="silver_product_scd2",
    merge_key="product_id",
    timestamp_key="event_time",
    surrogate_column_name="product_sid",
)

+-----------------+----------------+----------------+-----------------+
|num_affected_rows|num_updated_rows|num_deleted_rows|num_inserted_rows|
+-----------------+----------------+----------------+-----------------+
|               27|               0|               0|               27|
+-----------------+----------------+----------------+-----------------+

+-----------------+----------------+----------------+-----------------+
|num_affected_rows|num_updated_rows|num_deleted_rows|num_inserted_rows|
+-----------------+----------------+----------------+-----------------+
|               27|              27|               0|                0|
+-----------------+----------------+----------------+-----------------+



In [86]:
# Preview the first 10 dimension rows ordered by surrogate key, without truncating wide values
(
    spark.sql("select * from silver_product_scd2 order by product_sid")
    .show(10, truncate=False)
)

+----------------------+----+-----------------+--------------+-------------+----------------+--------------------------+------------------+-----+----------+------+-----------------------+-----------------------------------------------------+--------------------------------+---------------+--------------------------+--------------------------+-------------+-----------+
|category              |cogs|contains_caffeine|contains_fruit|contains_nuts|contains_veggies|event_time                |item              |price|product_id|size  |meta_timestamp         |meta_filename                                        |meta_hashdiff                   |meta_is_current|meta_valid_from           |meta_valid_to             |meta_sequence|product_sid|
+----------------------+----+-----------------+--------------+-------------+----------------+--------------------------+------------------+-----+----------+------+-----------------------+-----------------------------------------------------+-----------------

# Creating Silver Fact Streaming Tables

In [85]:
# Source/target tables and key columns for the purchase fact
source = "bronze_purchase"
target = "silver_purchase"
timestamp_key = "transaction_time"
surrogate_column_name = "transaction_sid"

# Sketch of an incremental load (not active):
# if spark.catalog.tableExists(target):
#     df = spark.sql(f"SELECT * FROM {source} WHERE {source}.{timestamp_key} > (SELECT MAX({timestamp_key}) FROM {target})")
# else:
#     df = spark.sql(f"SELECT * FROM {source}")

# Attach the product surrogate key of the dimension version that was valid at
# transaction time. The validity window is half-open [meta_valid_from, meta_valid_to):
# a closed version's meta_valid_to equals the next version's meta_valid_from, so an
# inclusive BETWEEN would match both versions for a transaction stamped exactly at
# the change and duplicate the fact row.
df = spark.sql(
    f"""
    SELECT source.*, prod.product_sid
    FROM {source} source
    LEFT JOIN silver_product_scd2 prod ON prod.product_id = source.product_id
        AND source.{timestamp_key} >= prod.meta_valid_from
        AND source.{timestamp_key} < prod.meta_valid_to
    """)

# Generate a surrogate key for each fact row
df = df.withColumn(surrogate_column_name, monotonically_increasing_id())

df.show(5)


+---------------+---------+---------------+-----+----------+--------+----------------+--------------+-------------------+--------------------+--------------------+--------------------+-----------+---------------+
|add_supplements|is_member|member_discount|price|product_id|quantity|supplement_price|total_purchase|     transaction_id|    transaction_time|      meta_timestamp|       meta_filename|product_sid|transaction_sid|
+---------------+---------+---------------+-----+----------+--------+----------------+--------------+-------------------+--------------------+--------------------+--------------------+-----------+---------------+
|          false|    false|            0.0| 4.99|      CS01|       1|             0.0|          4.99|5926375728195155676|2024-02-17 19:33:...|2024-02-18 09:02:...|file:///home/pete...|          1|              0|
|          false|    false|            0.0| 4.99|      CS07|       1|             0.0|          4.99|3843411071104270264|2024-02-17 19:33:...|2024-0