# INTRODUCTION

In [None]:

# Import SparkSession
import pyspark
from delta import configure_spark_with_delta_pip

builder = pyspark.sql.SparkSession.builder.appName("STREAMING_DWH") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [None]:
# To allow automatic schemaInference while reading
spark.conf.set("spark.sql.streaming.schemaInference", True)

# Create the streaming_df to read from input directory
streaming_df = spark \
    .readStream \
    .format("json") \
    .option("maxFilesPerTrigger", 1) \
    .load("data/product/")

streaming_df.printSchema()

In [None]:
# Define database schemas - Bronze, Silver, Gold
spark.sql("CREATE SCHEMA IF NOT EXISTS bronze;")
spark.sql("CREATE SCHEMA IF NOT EXISTS silver;")
spark.sql("CREATE SCHEMA IF NOT EXISTS gold;")
spark.sql("USE SCHEMA bronze;")

In [None]:
from pyspark.sql.functions import current_timestamp, input_file_name

def raw_ingestion(schema_name, table_name):
    # Generates a source path based on table name, reads all files from that and inserts into bronze schema

    query = (
        spark.readStream
        .format("json")
        .option("maxFilesPerTrigger", 1)
        .load(f"data/{table_name}")
        .withColumn("meta_timestamp", current_timestamp())
        .withColumn("meta_filename", input_file_name())
        .writeStream
        .outputMode("append")
        .format("delta")
        .option("checkpointLocation", f"spark-warehouse/_checkpoints/{schema_name}.{table_name}")
        .toTable(f"{schema_name}.{table_name}")
    )
    return query

query1 = raw_ingestion(schema_name="bronze", table_name="inventory")
query2 = raw_ingestion(schema_name="bronze", table_name="product")
query3 = raw_ingestion(schema_name="bronze", table_name="purchase")

# Use the code 
# spark.streams.awaitAnyTermination()


In [None]:
spark.sql("SELECT * FROM bronze.product").show(15)

In [None]:
spark.sql("SELECT current_schema()").show()
spark.sql("DESCRIBE SCHEMA EXTENDED bronze;").show()
spark.sql("SHOW TABLES IN bronze;").show()

In [None]:
spark.sql("select * from bronze.product").show(5)

# SLOWLY CHANGING DIMENSIONS (SCD) - TYPE 2

In [82]:

from pyspark.sql.functions import md5, concat_ws, lit
from pyspark.sql.types import BooleanType, TimestampType

# Get list of all columns that doesn't contain "meta_"
df = spark.sql("select * from bronze.product")
df = df.withColumn("meta_hashdiff", md5(concat_ws("||", *[c for c in df.columns if "meta_" not in c])))
df = df.withColumn("meta_is_current", lit(None).cast(BooleanType()))
df = df.withColumn("meta_valid_from", lit(None).cast(TimestampType()))
df = df.withColumn("meta_valid_to", lit(None).cast(TimestampType()))

df.show(5, truncate=False)

# Create an empty Delta table with the same schema
df.createOrReplaceTempView("tempView")
spark.sql("CREATE TABLE IF NOT EXISTS bronze.product_scd2 LIKE tempView USING DELTA")


+----------------------+----+-----------------+--------------+-------------+----------------+--------------------------+----------------+-----+----------+------+-----------------------+-----------------------------------------------------+--------------------------------+---------------+---------------+-------------+
|category              |cogs|contains_caffeine|contains_fruit|contains_nuts|contains_veggies|event_time                |item            |price|product_id|size  |meta_timestamp         |meta_filename                                        |meta_hashdiff                   |meta_is_current|meta_valid_from|meta_valid_to|
+----------------------+----+-----------------+--------------+-------------+----------------+--------------------------+----------------+-----+----------+------+-----------------------+-----------------------------------------------------+--------------------------------+---------------+---------------+-------------+
|Supercharged Smoothies|2.7 |false         

DataFrame[]

In [81]:
spark.sql("DROP TABLE bronze.product_scd2").show()
# spark.sql("SHOW COLUMNS FROM bronze.product_scd2").show()

++
||
++
++



In [None]:
source_table = "tempView"
target_table = "bronze.product_scd2"
merge_key = "product_id"
timestamp = "event_time"

spark.sql("select * from tempView").show()

# merge_query = """
#     SELECT {source_table}.*
#     FROM {source_table} JOIN {target_table}
#     ON {source_table}.{merge_key} = {target_table}.{merge_key} 
#     WHERE {source_table}.meta_hashdiff <> {target_table}.meta_hashdiff 
# """.format(
#     source_table=source_table,
#     target_table=target_table,
#     merge_key=merge_key,
#     timestamp=timestamp
#   )

# spark.sql(merge_query).show()

In [76]:

source_table = "tempView"
target_table = "bronze.product_scd2"
merge_key = "product_id"
timestamp = "event_time"


# |MERGE INTO numbers_merge_target AS target
# | USING numbers_merge_source AS source ON target.number = source.number
# |WHEN MATCHED THEN UPDATE SET target.number = source.number

merge_query = """
  MERGE INTO {target_table} AS target
  USING {source_table} AS source ON target.{merge_key} = source.{merge_key}
  WHEN MATCHED AND target.meta_is_current = true AND target.meta_hashdiff <> source.meta_hashdiff 
  THEN UPDATE SET meta_is_current = false, meta_valid_to = source.{timestamp}
  WHEN NOT MATCHED THEN INSERT
  (
    product_id, 
    category, 
    cogs, 
    contains_caffeine, 
    contains_fruit, 
    contains_nuts, 
    meta_is_current, 
    meta_valid_from, 
    meta_valid_to
  )
  VALUES
  (
    source.product_id, 
    source.category, 
    source.cogs, 
    source.contains_caffeine, 
    source.contains_fruit, 
    source.contains_nuts,
    TRUE,
    source.{timestamp},
    NULL
  )
""".format(
    source_table=source_table,
    target_table=target_table,
    merge_key=merge_key,
    timestamp=timestamp
  )

print(merge_query)

# spark.sql(merge_query)
# display(table("customers").orderBy("customerId"))

spark.sql(merge_query)



  MERGE INTO bronze.product_scd2 AS target
  USING tempView AS source ON target.product_id = source.product_id
  WHEN MATCHED AND target.meta_is_current = true AND target.meta_hashdiff <> source.meta_hashdiff 
  THEN UPDATE SET meta_is_current = false, meta_valid_to = source.event_time
  WHEN NOT MATCHED THEN INSERT
  (
    product_id, 
    category, 
    cogs, 
    contains_caffeine, 
    contains_fruit, 
    contains_nuts, 
    meta_is_current, 
    meta_valid_from, 
    meta_valid_to
  )
  VALUES
  (
    source.product_id, 
    source.category, 
    source.cogs, 
    source.contains_caffeine, 
    source.contains_fruit, 
    source.contains_nuts,
    TRUE,
    source.event_time,
    NULL
  )



UnsupportedOperationException: MERGE INTO TABLE is not supported temporarily.

In [None]:

source_table = "tempView"
target_table = "bronze.product_scd2"
merge_key = "product_id"
timestamp = "event_time"


# |MERGE INTO numbers_merge_target AS target
# | USING numbers_merge_source AS source ON target.number = source.number
# |WHEN MATCHED THEN UPDATE SET target.number = source.number

merge_query = """
  MERGE INTO {target_table}
  USING (
    SELECT {source_table}.*
    FROM {source_table} JOIN {target_table}
    ON {source_table}.{merge_key} = {target_table}.{merge_key} 
    WHERE {target_table}.meta_is_current = true
    AND {source_table}.meta_hashdiff <> {target_table}.meta_hashdiff 
  ) staged_updates
  ON {target_table}.{merge_key} = staged_updates.{merge_key}
  WHEN MATCHED AND {target_table}.meta_is_current = true 
  AND {target_table}.meta_hashdiff <> staged_updates.meta_hashdiff THEN
    UPDATE SET meta_is_current = false, meta_valid_to = staged_updates.{timestamp}
  WHEN NOT MATCHED THEN INSERT
  (
    product_id, 
    category, 
    cogs, 
    contains_caffeine, 
    contains_fruit, 
    contains_nuts, 
    meta_is_current, 
    meta_valid_from, 
    meta_valid_to
  )
  VALUES
  (
    staged_updates.product_id, 
    staged_updates.category, 
    staged_updates.cogs, 
    staged_updates.contains_caffeine, 
    staged_updates.contains_fruit, 
    staged_updates.contains_nuts, 
    TRUE,
    staged_updates.{timestamp}, 
    NULL
  )
""".format(
    source_table=source_table,
    target_table=target_table,
    merge_key=merge_key,
    timestamp=timestamp
  )

print(merge_query)

# spark.sql(merge_query)
# display(table("customers").orderBy("customerId"))

spark.sql(merge_query)
