# INTRODUCTION

In [None]:

# Import SparkSession
import pyspark
from delta import configure_spark_with_delta_pip

# Session builder for the streaming warehouse: registers Delta's SQL extension
# and swaps the session catalog for Delta's catalog implementation.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("STREAMING_DWH")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Let the delta helper finish the builder configuration, then create the
# session (or reuse one that already exists).
spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [None]:
# Turn on schema inference for streaming reads so no schema has to be
# supplied to the reader below.
spark.conf.set("spark.sql.streaming.schemaInference", True)

# Streaming reader over the product input directory; each trigger picks up
# at most one file.
streaming_df = (
    spark.readStream
    .format("json")
    .option("maxFilesPerTrigger", 1)
    .load("data/product/")
)

# Show the schema attached to the streaming DataFrame.
streaming_df.printSchema()

In [None]:
# Create the Bronze, Silver and Gold schemas if they are missing.
for layer in ("bronze", "silver", "gold"):
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {layer};")

# Make bronze the current schema for the statements that follow.
spark.sql("USE SCHEMA bronze;")

In [None]:
from pyspark.sql.functions import current_timestamp, input_file_name

def raw_ingestion(schema_name, table_name):
    """Start a streaming job that appends raw JSON files to a Delta table.

    Files are read as a stream from ``data/<table_name>`` (at most one file
    per trigger). Each row is tagged with ``meta_timestamp`` (current
    timestamp) and ``meta_filename`` (input file name) and appended to the
    table ``<schema_name>.<table_name>``.

    Args:
        schema_name: Schema that holds the target table.
        table_name: Name of the source sub-directory under ``data/`` and of
            the target table.

    Returns:
        Whatever ``DataStreamWriter.toTable`` returns for the started stream.
    """
    source_path = f"data/{table_name}"
    qualified_table = f"{schema_name}.{table_name}"
    # One checkpoint directory per target table.
    checkpoint_dir = f"spark-warehouse/_checkpoints/{schema_name}.{table_name}"

    json_stream = (
        spark.readStream
        .format("json")
        .option("maxFilesPerTrigger", 1)
        .load(source_path)
    )

    # Add the two lineage columns before writing.
    tagged_stream = (
        json_stream
        .withColumn("meta_timestamp", current_timestamp())
        .withColumn("meta_filename", input_file_name())
    )

    delta_writer = (
        tagged_stream.writeStream
        .outputMode("append")
        .format("delta")
        .option("checkpointLocation", checkpoint_dir)
    )
    return delta_writer.toTable(qualified_table)

# Start one bronze ingestion stream per source entity.
query1 = raw_ingestion("bronze", "inventory")
query2 = raw_ingestion("bronze", "product")
query3 = raw_ingestion("bronze", "purchase")

# To block this cell until one of the streams terminates, uncomment:
# spark.streams.awaitAnyTermination()


In [None]:
# Preview up to 15 rows of the bronze product table.
(
    spark
    .sql("SELECT * FROM bronze.product")
    .show(15)
)

In [None]:
# Show the current schema, the bronze schema description and the tables it
# contains, in that order.
for statement in (
    "SELECT current_schema()",
    "DESCRIBE SCHEMA EXTENDED bronze;",
    "SHOW TABLES IN bronze;",
):
    spark.sql(statement).show()

In [None]:
# Show up to five rows of the bronze product table.
(
    spark
    .sql("select * from bronze.product")
    .show(5)
)

# SLOWLY CHANGING DIMENSIONS (SCD) - TYPE 2

In [55]:

from pyspark.sql.functions import coalesce, col, concat_ws, lit, md5
from pyspark.sql.types import BooleanType, TimestampType

df = spark.sql("select * from bronze.product")

# Columns that feed the change hash: everything that is not pipeline metadata.
# Match on the "meta_" PREFIX; a substring test would also drop a business
# column that merely contains "meta_" somewhere in its name.
hashed_columns = [c for c in df.columns if not c.startswith("meta_")]

# concat_ws skips NULL inputs, so (NULL, 'x') and ('x', NULL) would otherwise
# produce the same hash and a real change would go undetected. Casting every
# column to string and replacing NULL with '' keeps each value in its own
# position of the concatenated string.
hash_inputs = [coalesce(col(c).cast("string"), lit("")) for c in hashed_columns]

df = (
    df
    .withColumn("meta_hashdiff", md5(concat_ws("||", *hash_inputs)))
    # Typed NULL placeholders: they give the SCD2 bookkeeping columns a
    # concrete data type in the schema; no values are assigned here.
    .withColumn("meta_is_current", lit(None).cast(BooleanType()))
    .withColumn("meta_valid_from", lit(None).cast(TimestampType()))
    .withColumn("meta_valid_to", lit(None).cast(TimestampType()))
)

# Expose the extended DataFrame to SQL so its schema can be reused below.
df.createOrReplaceTempView("tempView")

df.show(5, truncate=False)

# Create bronze.product_new with the same columns as tempView (only if the
# table does not exist yet).
# NOTE(review): no USING clause is given, so the table presumably gets the
# session's default provider - confirm it is a Delta table before using it as
# a MERGE target.
spark.sql("CREATE TABLE IF NOT EXISTS bronze.product_new LIKE tempView").show()


+----------------------+----+-----------------+--------------+-------------+----------------+--------------------------+----------------+-----+----------+------+-----------------------+-----------------------------------------------------+--------------------------------+---------------+---------------+-------------+
|category              |cogs|contains_caffeine|contains_fruit|contains_nuts|contains_veggies|event_time                |item            |price|product_id|size  |meta_timestamp         |meta_filename                                        |meta_hashdiff                   |meta_is_current|meta_valid_from|meta_valid_to|
+----------------------+----+-----------------+--------------+-------------+----------------+--------------------------+----------------+-----+----------+------+-----------------------+-----------------------------------------------------+--------------------------------+---------------+---------------+-------------+
|Supercharged Smoothies|2.7 |false         

In [60]:
# List the columns of the bronze.product_new table.
(
    spark
    .sql("SHOW COLUMNS FROM bronze.product_new")
    .show()
)

+--------+----+-----------------+--------------+-------------+----------------+----------+----+-----+----------+----+--------------+-------------+-------------+---------------+---------------+-------------+
|category|cogs|contains_caffeine|contains_fruit|contains_nuts|contains_veggies|event_time|item|price|product_id|size|meta_timestamp|meta_filename|meta_hashdiff|meta_is_current|meta_valid_from|meta_valid_to|
+--------+----+-----------------+--------------+-------------+----------------+----------+----+-----+----------+----+--------------+-------------+-------------+---------------+---------------+-------------+
+--------+----+-----------------+--------------+-------------+----------------+----------+----+-----+----------+----+--------------+-------------+-------------+---------------+---------------+-------------+



In [None]:

# --- SCD Type 2 merge statement for the product dimension --------------------
# Source: the raw, append-only bronze.product table.
# Target: bronze.product_new, the table this notebook creates from tempView
# (a table named "bronze.product_scd2" is never created here).
source_table = "bronze.product"
target_table = "bronze.product_new"
merge_key = "product_id"

# Business columns of the product feed, in the order of the schema printed by
# the SCD2 preparation cell. Keep this list in sync with bronze.product.
product_business_columns = [
    "category",
    "cogs",
    "contains_caffeine",
    "contains_fruit",
    "contains_nuts",
    "contains_veggies",
    "event_time",
    "item",
    "price",
    "product_id",
    "size",
]
# Lineage columns written by raw_ingestion; copied to the target unchanged.
product_lineage_columns = ["meta_timestamp", "meta_filename"]

# bronze.product has no meta_hashdiff column, so the change hash is computed
# inside the statement. NULLs are replaced by '' because concat_ws would
# otherwise skip them and let different rows collapse to the same hash.
hashdiff_sql = "md5(concat_ws('||', {}))".format(
    ", ".join(
        "coalesce(cast({} AS string), '')".format(c) for c in product_business_columns
    )
)

source_columns = product_business_columns + product_lineage_columns

# Latest version of every product in the source. The source is append-only and
# can hold several rows per key; MERGE needs at most one source row per target
# row, so only the most recently ingested row per key is staged.
# NOTE(review): ordering by meta_timestamp assumes ingestion order reflects
# business order - confirm, or order by event_time instead.
latest_source_sql = """
    SELECT {source_columns}, {hashdiff} AS meta_hashdiff
    FROM (
      SELECT src.*,
             row_number() OVER (
               PARTITION BY src.{merge_key} ORDER BY src.meta_timestamp DESC
             ) AS meta_row_rank
      FROM {source_table} AS src
    ) AS ranked
    WHERE meta_row_rank = 1
""".format(
    source_columns=", ".join(source_columns),
    hashdiff=hashdiff_sql,
    merge_key=merge_key,
    source_table=source_table,
)

# Columns taken from the staged row as-is, followed by the SCD2 bookkeeping
# columns whose values are set by the merge itself.
carried_columns = source_columns + ["meta_hashdiff"]
insert_columns = carried_columns + ["meta_is_current", "meta_valid_from", "meta_valid_to"]
insert_values = ["staged_updates.{}".format(c) for c in carried_columns] + [
    "true",                           # the new row is the current version
    "staged_updates.meta_timestamp",  # validity starts at ingestion time
    "null",                           # open-ended until a newer version arrives
]

# The staged set has two branches:
#   1. every latest source row under its real key - closes the current target
#      row when the hash changed, or is inserted when the key is new;
#   2. changed rows again under a NULL key - a NULL key never matches, so these
#      rows always hit WHEN NOT MATCHED and become the new current version.
merge_query = """
MERGE INTO {target_table} AS tgt
USING (
  SELECT latest.{merge_key} AS meta_merge_key, latest.*
  FROM ({latest_source}) AS latest
  UNION ALL
  SELECT NULL AS meta_merge_key, latest.*
  FROM ({latest_source}) AS latest
  JOIN {target_table} AS cur
    ON latest.{merge_key} = cur.{merge_key}
  WHERE cur.meta_is_current = true AND latest.meta_hashdiff <> cur.meta_hashdiff
) AS staged_updates
ON tgt.{merge_key} = staged_updates.meta_merge_key
WHEN MATCHED AND tgt.meta_is_current = true AND tgt.meta_hashdiff <> staged_updates.meta_hashdiff THEN
  UPDATE SET meta_is_current = false, meta_valid_to = staged_updates.meta_timestamp
WHEN NOT MATCHED THEN
  INSERT ({insert_columns})
  VALUES ({insert_values})
""".format(
    target_table=target_table,
    merge_key=merge_key,
    latest_source=latest_source_sql,
    insert_columns=", ".join(insert_columns),
    insert_values=", ".join(insert_values),
)

print(merge_query)

# Uncomment to run the merge and inspect the resulting history:
# spark.sql(merge_query)
# spark.table(target_table).orderBy(merge_key, "meta_valid_from").show()
