# INTRODUCTION

In [None]:

# Import SparkSession
import pyspark
from delta import configure_spark_with_delta_pip

# Session builder for the "STREAMING_DWH" app with the Delta SQL extension and
# the Delta catalog registered as the session catalog.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("STREAMING_DWH")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# Pass the builder through configure_spark_with_delta_pip, then create (or
# reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [None]:
# Enable schema inference for streaming reads so no explicit schema is needed.
spark.conf.set("spark.sql.streaming.schemaInference", True)

# Streaming DataFrame over the product JSON directory, limited to one file per
# trigger.
streaming_df = (
    spark.readStream
    .format("json")
    .option("maxFilesPerTrigger", 1)
    .load("data/product/")
)

# Show the schema of the streaming DataFrame.
streaming_df.printSchema()

In [None]:
# Create the three layer schemas (bronze, silver, gold) if they do not exist yet.
for layer in ("bronze", "silver", "gold"):
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {layer};")

# Switch the session to the bronze schema.
spark.sql("USE SCHEMA bronze;")

In [None]:
from pyspark.sql.functions import current_timestamp, input_file_name

def raw_ingestion(schema_name, table_name):
    """Start a streaming write of JSON files into a Delta table.

    Reads ``data/<table_name>`` as a JSON stream (one file per trigger), adds
    ``meta_timestamp`` (current timestamp) and ``meta_filename`` (input file
    name) columns, and appends the rows to ``<schema_name>.<table_name>``.
    The checkpoint is kept under ``spark-warehouse/_checkpoints/``.

    Args:
        schema_name: Schema of the destination table.
        table_name: Destination table name; also the sub-directory of
            ``data/`` that is read.

    Returns:
        Whatever ``toTable`` returns for the started streaming write.
    """
    qualified_name = f"{schema_name}.{table_name}"

    # Source side: JSON stream enriched with the two metadata columns.
    enriched_stream = (
        spark.readStream
        .format("json")
        .option("maxFilesPerTrigger", 1)
        .load(f"data/{table_name}")
        .withColumn("meta_timestamp", current_timestamp())
        .withColumn("meta_filename", input_file_name())
    )

    # Sink side: append-only Delta table with a per-table checkpoint directory.
    delta_writer = (
        enriched_stream.writeStream
        .outputMode("append")
        .format("delta")
        .option("checkpointLocation", f"spark-warehouse/_checkpoints/{qualified_name}")
    )

    query = delta_writer.toTable(qualified_name)
    return query

# Start one bronze ingestion stream per source directory.
query1 = raw_ingestion(schema_name="bronze", table_name="inventory")
query2 = raw_ingestion(schema_name="bronze", table_name="product")
query3 = raw_ingestion(schema_name="bronze", table_name="purchase")

# The streams run in the background, so the cells that follow could read the
# bronze tables before any file has been committed. Block until each query has
# processed the files currently available in its source directory.
# NOTE: if new files keep arriving while this runs, the wait gets longer.
for running_query in (query1, query2, query3):
    running_query.processAllAvailable()

# To keep the notebook blocked until one of the streams terminates, use:
# spark.streams.awaitAnyTermination()


In [None]:
# Show up to 15 rows of the bronze product table.
(
    spark
    .sql("SELECT * FROM bronze.product")
    .show(15)
)

In [None]:
# Display, in order: the current schema, the bronze schema description, and
# the tables registered in bronze.
for inspection_sql in (
    "SELECT current_schema()",
    "DESCRIBE SCHEMA EXTENDED bronze;",
    "SHOW TABLES IN bronze;",
):
    spark.sql(inspection_sql).show()

In [None]:
# Show up to 5 rows of the bronze product table.
(
    spark
    .sql("select * from bronze.product")
    .show(5)
)

# SLOWLY CHANGING DIMENSIONS (SCD) - TYPE 2

In [None]:

# Names interpolated into the SCD2 MERGE statements further down.
target_table = "bronze.product_scd2"  # table named in MERGE INTO
source_table = "tempView"             # view named in the USING clause
merge_key = "product_id"              # column compared in the ON clause
timestamp = "event_time"              # source column written to meta_valid_to

In [None]:

from pyspark.sql.functions import md5, concat_ws, lit, row_number
from pyspark.sql.types import BooleanType, TimestampType
from pyspark.sql.window import Window

# Load the bronze product rows and pick the columns to hash: every column
# whose name does not contain "meta_".
bronze_product = spark.sql("select * from bronze.product")
tracked_columns = [c for c in bronze_product.columns if "meta_" not in c]

# Add the SCD2 bookkeeping columns: a hash of the tracked columns, a "current"
# flag (true), the validity start taken from the timestamp column, and a NULL
# validity end.
# NOTE(review): concat_ws is documented to skip NULL inputs, so two rows that
# differ only in which column is NULL may produce the same hash - confirm this
# is acceptable for change detection.
df = (
    bronze_product
    .withColumn("meta_hashdiff", md5(concat_ws("||", *tracked_columns)))
    .withColumn("meta_is_current", lit(1).cast(BooleanType()))
    .withColumn("meta_valid_from", bronze_product[timestamp])
    .withColumn("meta_valid_to", lit(None).cast(TimestampType()))
)

# Create the target Delta table from the view's definition. This happens
# before the helper row_number column is added, so the target does not get it.
# The names come from the SCD2 config variables rather than repeated literals,
# so this cell and the MERGE statements cannot drift apart.
df.createOrReplaceTempView(source_table)
spark.sql(f"CREATE TABLE IF NOT EXISTS {target_table} LIKE {source_table} USING DELTA")

# Number the rows of each merge key in timestamp order (1 = earliest) and
# republish the view so the MERGE source can filter on row_number.
window_spec = Window.partitionBy(merge_key).orderBy(timestamp)
df = df.withColumn("row_number", row_number().over(window_spec))
df.createOrReplaceTempView(source_table)

df.show(truncate=False)

In [None]:
# spark.sql("DROP TABLE bronze.product_scd2").show()

# List the SCD2 table's columns.
(
    spark
    .sql("SHOW COLUMNS FROM bronze.product_scd2")
    .show()
)

# Print the SCD2 table's schema.
(
    spark
    .sql("select * from bronze.product_scd2")
    .printSchema()
)

In [None]:
# Distinct row_number values present in the staged rows, in ascending order.
# De-duplicate first and sort on the driver, rather than relying on an
# orderBy() placed before distinct() to fix the order of the collected rows.
list_of_partitions = sorted(
    df.select("row_number").distinct().collect(),
    key=lambda row: row["row_number"],
)

# NOTE(review): a source row whose merge key already exists in the target
# counts as matched, so the INSERT branch never fires for it. As written, the
# MERGE closes the current version of a changed key but does not add the new
# version - confirm whether that row should also be inserted.
merge_template = """
        MERGE INTO {target_table} AS target
        USING (
            SELECT * FROM {source_table}
            WHERE row_number = {partition}
        ) AS source ON target.{merge_key} = source.{merge_key}
        WHEN MATCHED AND target.meta_is_current = true AND target.meta_hashdiff <> source.meta_hashdiff
        THEN UPDATE SET meta_is_current = false, meta_valid_to = source.{timestamp}
        WHEN NOT MATCHED BY target THEN INSERT *
    """

# Run one MERGE per row_number, lowest first, covering every value found in
# the data instead of a hard-coded [1, 2].
for partition_row in list_of_partitions:
    row_num = partition_row["row_number"]

    merge_query = merge_template.format(
        source_table=source_table,
        target_table=target_table,
        merge_key=merge_key,
        timestamp=timestamp,
        partition=row_num,
    )

    print(merge_query)
    spark.sql(merge_query).show()



In [None]:
# Show up to 100 untruncated rows flagged as current in the SCD2 table.
(
    spark
    .sql("select * from bronze.product_scd2 where meta_is_current = true")
    .show(100, truncate=False)
)

In [None]:


# MERGE the whole source view into the target in a single statement (no
# row_number filter): matched current rows with a different hash are closed,
# unmatched source rows are inserted.
# NOTE(review): the source view can hold several rows per merge key - confirm
# the MERGE tolerates more than one source row matching the same target row.
merge_query = f"""
  MERGE INTO {target_table} AS target
  USING {source_table} AS source ON target.{merge_key} = source.{merge_key}
  WHEN MATCHED AND target.meta_is_current = true AND target.meta_hashdiff <> source.meta_hashdiff 
  THEN UPDATE SET meta_is_current = false, meta_valid_to = source.{timestamp}
  WHEN NOT MATCHED THEN INSERT *
"""

spark.sql(merge_query)


In [None]:
# Show the SCD2 table without truncating column values.
(
    spark
    .sql("select * from bronze.product_scd2")
    .show(truncate=False)
)