# INTRODUCTION

In [None]:

# Import SparkSession
import pyspark
from delta import configure_spark_with_delta_pip

builder = pyspark.sql.SparkSession.builder.appName("STREAMING_DWH") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [None]:
# To allow automatic schemaInference while reading
spark.conf.set("spark.sql.streaming.schemaInference", True)

# Create the streaming_df to read from input directory
streaming_df = spark \
    .readStream \
    .format("json") \
    .option("maxFilesPerTrigger", 1) \
    .load("data/product/")

streaming_df.printSchema()

In [None]:
# Define database schemas - Bronze, Silver, Gold
spark.sql("CREATE SCHEMA IF NOT EXISTS bronze;")
spark.sql("CREATE SCHEMA IF NOT EXISTS silver;")
spark.sql("CREATE SCHEMA IF NOT EXISTS gold;")
spark.sql("USE SCHEMA bronze;")

In [None]:
from pyspark.sql.functions import current_timestamp, input_file_name

def raw_ingestion(schema_name, table_name):
    # Generates a source path based on table name, reads all files from that and inserts into bronze schema

    query = (
        spark.readStream
        .format("json")
        .option("maxFilesPerTrigger", 1)
        .load(f"data/{table_name}")
        .withColumn("meta_timestamp", current_timestamp())
        .withColumn("meta_filename", input_file_name())
        .writeStream
        .outputMode("append")
        .format("delta")
        .option("checkpointLocation", f"spark-warehouse/_checkpoints/{schema_name}.{table_name}")
        .toTable(f"{schema_name}.{table_name}")
    )
    return query

query1 = raw_ingestion(schema_name="bronze", table_name="inventory")
query2 = raw_ingestion(schema_name="bronze", table_name="product")
query3 = raw_ingestion(schema_name="bronze", table_name="purchase")

# Use the code 
# spark.streams.awaitAnyTermination()


In [None]:
spark.sql("SELECT * FROM bronze.product").show(15)

In [None]:
spark.sql("SELECT current_schema()").show()
spark.sql("DESCRIBE SCHEMA EXTENDED bronze;").show()
spark.sql("SHOW TABLES IN bronze;").show()

In [None]:
spark.sql("select * from bronze.product").show(5)

# SLOWLY CHANGING DIMENSIONS (SCD) - TYPE 2

In [98]:

source_table = "tempView"
target_table = "bronze.product_scd2"
merge_key = "product_id"
timestamp = "event_time"

In [120]:

from pyspark.sql.functions import md5, concat_ws, lit, row_number
from pyspark.sql.types import BooleanType, TimestampType
from pyspark.sql.window import Window

# Get list of all columns that doesn't contain "meta_"
df = spark.sql("select * from bronze.product")
df = df.withColumn("meta_hashdiff", md5(concat_ws("||", *[c for c in df.columns if "meta_" not in c])))
df = df.withColumn("meta_is_current", lit(1).cast(BooleanType()))
df = df.withColumn("meta_valid_from", df[timestamp])
df = df.withColumn("meta_valid_to", lit(None).cast(TimestampType()))

# Create an empty Delta table with the same schema
df.createOrReplaceTempView("tempView")
spark.sql("CREATE TABLE IF NOT EXISTS bronze.product_scd2 LIKE tempView USING DELTA")

# Add partition column
window_spec  = Window.partitionBy("product_id").orderBy("event_time")
df = df.withColumn("row_number", row_number().over(window_spec))
df.createOrReplaceTempView("tempView")

df.show(truncate=False)

+-----------------+----+-----------------+--------------+-------------+----------------+--------------------------+------------------+-----+----------+------+-----------------------+-----------------------------------------------------+--------------------------------+---------------+--------------------------+-------------+----------+
|category         |cogs|contains_caffeine|contains_fruit|contains_nuts|contains_veggies|event_time                |item              |price|product_id|size  |meta_timestamp         |meta_filename                                        |meta_hashdiff                   |meta_is_current|meta_valid_from           |meta_valid_to|row_number|
+-----------------+----+-----------------+--------------+-------------+----------------+--------------------------+------------------+-----+----------+------+-----------------------+-----------------------------------------------------+--------------------------------+---------------+--------------------------+------------

In [121]:
# spark.sql("DROP TABLE bronze.product_scd2").show()
spark.sql("SHOW COLUMNS FROM bronze.product_scd2").show()
spark.sql("select * from bronze.product_scd2").printSchema()

+-----------------+
|         col_name|
+-----------------+
|         category|
|             cogs|
|contains_caffeine|
|   contains_fruit|
|    contains_nuts|
| contains_veggies|
|       event_time|
|             item|
|            price|
|       product_id|
|             size|
|   meta_timestamp|
|    meta_filename|
|    meta_hashdiff|
|  meta_is_current|
|  meta_valid_from|
|    meta_valid_to|
+-----------------+

root
 |-- category: string (nullable = true)
 |-- cogs: double (nullable = true)
 |-- contains_caffeine: boolean (nullable = true)
 |-- contains_fruit: boolean (nullable = true)
 |-- contains_nuts: boolean (nullable = true)
 |-- contains_veggies: boolean (nullable = true)
 |-- event_time: string (nullable = true)
 |-- item: string (nullable = true)
 |-- price: double (nullable = true)
 |-- product_id: string (nullable = true)
 |-- size: string (nullable = true)
 |-- meta_timestamp: timestamp (nullable = true)
 |-- meta_filename: string (nullable = true)
 |-- meta_hashdiff:

In [117]:
list_of_partitions = [i for i in df.select('row_number').orderBy("row_number").distinct().collect()]

for row_num in [1, 2]:

    merge_query = """
        MERGE INTO {target_table} AS target
        USING {source_table} AS source ON target.{merge_key} = source.{merge_key}
        WHEN MATCHED AND target.meta_is_current = true AND target.meta_hashdiff <> source.meta_hashdiff AND source.row_number = {partition}
        THEN UPDATE SET meta_is_current = false, meta_valid_to = source.{timestamp}
        WHEN NOT MATCHED THEN INSERT *
    """.format(
        source_table=source_table,
        target_table=target_table,
        merge_key=merge_key,
        timestamp=timestamp,
        partition=row_num
    )

    print(merge_query)
    spark.sql(merge_query).show()




        MERGE INTO bronze.product_scd2 AS target
        USING tempView AS source ON target.product_id = source.product_id
        WHEN MATCHED AND target.meta_is_current = true AND target.meta_hashdiff <> source.meta_hashdiff AND source.row_number = 1
        THEN UPDATE SET meta_is_current = false, meta_valid_to = source.event_time
        WHEN NOT MATCHED THEN INSERT *
    


UnsupportedOperationException: [DELTA_MULTIPLE_SOURCE_ROW_MATCHING_TARGET_ROW_IN_MERGE] Cannot perform Merge as multiple source rows matched and attempted to modify the same
target row in the Delta table in possibly conflicting ways. By SQL semantics of Merge,
when multiple source rows match on the same target row, the result may be ambiguous
as it is unclear which source row should be used to update or delete the matching
target row. You can preprocess the source table to eliminate the possibility of
multiple matches. Please refer to
https://docs.delta.io/latest/delta-update.html#upsert-into-a-table-using-merge

In [97]:

# "MERGE INTO {target_table}\n",
#     "USING (\n",
#     "  SELECT {source_table}.*\n",
#     "  FROM {source_table} JOIN {target_table}\n",
#     "  ON {source_table}.{merge_key} = {target_table}.{merge_key} \n",
#     "  WHERE {target_table}.meta_is_current = true {source_table}.meta_hashdiff <> {target_table}.meta_hashdiff \n",
#     ") staged_updates\n",
#     "ON {target_table}.{merge_key} = staged_updates.{merge_key}\n",
#     "WHEN MATCHED AND {target_table}.meta_is_current = true AND {target_table}.meta_hashdiff <> staged_updates.meta_hashdiff THEN  \n",
#     "  UPDATE SET meta_is_current = false, endDate = staged_updates.effectiveDate\n",
#     "WHEN NOT MATCHED THEN \n",
#     "  INSERT(customerid, address, current, effectivedate, enddate)\n",
#     "  VALUES(staged_updates.key, staged_updates.address, true, staged_updates.effectiveDate, null)\n",

merge_query = """
  MERGE INTO {target_table} AS target
  USING {source_table} AS source ON target.{merge_key} = source.{merge_key}
  WHEN MATCHED AND target.meta_is_current = true AND target.meta_hashdiff <> source.meta_hashdiff 
  THEN UPDATE SET meta_is_current = false, meta_valid_to = source.{timestamp}
  WHEN NOT MATCHED THEN INSERT *
""".format(
    source_table=source_table,
    target_table=target_table,
    merge_key=merge_key,
    timestamp=timestamp
  )

# merge_query = """
#   MERGE INTO {target_table} AS target
#   USING {source_table} AS source ON target.{merge_key} = source.{merge_key}
#   WHEN MATCHED AND target.meta_is_current = true AND target.meta_hashdiff <> source.meta_hashdiff 
#   THEN UPDATE SET meta_is_current = false, meta_valid_to = source.{timestamp}
#   WHEN NOT MATCHED THEN INSERT
#   (
#     product_id, 
#     category, 
#     cogs, 
#     contains_caffeine, 
#     contains_fruit, 
#     contains_nuts, 
#     meta_hashdiff,
#     meta_is_current, 
#     meta_valid_from, 
#     meta_valid_to
#   )
#   VALUES
#   (
#     source.product_id, 
#     source.category, 
#     source.cogs, 
#     source.contains_caffeine, 
#     source.contains_fruit, 
#     source.contains_nuts,
#     meta_hashdiff,
#     TRUE,
#     source.{timestamp},
#     NULL
#   )
# """.format(
#     source_table=source_table,
#     target_table=target_table,
#     merge_key=merge_key,
#     timestamp=timestamp
#   )

spark.sql(merge_query)


NameError: name 'Window' is not defined

In [95]:
spark.sql("select * from bronze.product_scd2").show(truncate=False)

+----------------------+----+-----------------+--------------+-------------+----------------+----------+----+-----+----------+----+--------------+-------------+--------------------------------+---------------+--------------------------+-------------+
|category              |cogs|contains_caffeine|contains_fruit|contains_nuts|contains_veggies|event_time|item|price|product_id|size|meta_timestamp|meta_filename|meta_hashdiff                   |meta_is_current|meta_valid_from           |meta_valid_to|
+----------------------+----+-----------------+--------------+-------------+----------------+----------+----+-----+----------+----+--------------+-------------+--------------------------------+---------------+--------------------------+-------------+
|Indulgent Smoothies   |2.2 |true             |false         |true         |NULL            |NULL      |NULL|NULL |IS04      |NULL|NULL          |NULL         |181414610d136a3117263bf8e468104d|true           |2024-02-17 19:27:38.294438|NULL       