# Set Up

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql import Window

# Notebook parameters - both values are read from widgets, which the calling pipeline or job supplies
catalog, schema = (dbutils.widgets.get(widget_name) for widget_name in ("catalog", "schema"))

# Customer

In [0]:
# Customer
# Build the customer silver table from the bronze change records:
#   1. If the silver table does not exist yet, seed it with the rows of file "00.json".
#   2. Merge every bronze file into silver, one file per MERGE statement, so a single
#      MERGE never sees more than one file's rows for the same customer_id.

customer_silver_table = f"{catalog}.{schema}.customers_silver_pyspark"
# Despite the "temp_vw" prefix this is a real table that is overwritten for each file.
customer_merge_staging_table = f"{catalog}.{schema}.temp_vw_bronze_customer_merge"

df_customer_bronze_pyspark = spark.read.table(f"{catalog}.{schema}.customer_bronze_pyspark")

if not spark.catalog.tableExists(customer_silver_table):
    # Filter before projecting: "file_name" is not one of the selected columns, so
    # filtering after the select only works via the analyzer's missing-reference resolution.
    # NOTE(review): the seed load takes last_updated from processing_time, while the MERGE
    # below uses to_timestamp(source.timestamp) - confirm the two are meant to differ.
    (df_customer_bronze_pyspark
     .filter(F.col("file_name") == "00.json")
     .select(
         "customer_id",
         "name",
         "email",
         "address",
         "city",
         "state",
         "zip_code",
         F.col("processing_time").alias("last_updated")
     )
     .write.saveAsTable(customer_silver_table))
    print("Created customer silver table")

# Change files must be applied in a deterministic sequence: distinct() alone returns rows in
# arbitrary order, which could apply a later UPDATE/DELETE before the NEW record it refers to.
# Assumes file names sort chronologically (e.g. "00.json", "01.json", ...) - TODO confirm.
customer_file_rows = (
    df_customer_bronze_pyspark
    .select("file_name")
    .distinct()
    .orderBy("file_name")
    .collect()
)

try:
    for customer_file_row in customer_file_rows:
        # Stage a single file's rows so the MERGE source holds only that file.
        (df_customer_bronze_pyspark
         .filter(F.col("file_name") == customer_file_row["file_name"])
         .write.mode("overwrite")
         .saveAsTable(customer_merge_staging_table))

        # Apply the staged file: DELETE removes the customer, UPDATE overwrites the
        # attributes, NEW inserts a customer that is not in silver yet.
        spark.sql(f'''MERGE INTO {customer_silver_table} target
            USING {customer_merge_staging_table} source
            ON target.customer_id = source.customer_id
            WHEN MATCHED AND source.operation = 'DELETE' THEN
            DELETE
            WHEN MATCHED AND source.operation = 'UPDATE' THEN
            UPDATE SET
            target.name = source.name,
            target.email = source.email,
            target.address = source.address,
            target.city = source.city,
            target.state = source.state,
            target.zip_code = source.zip_code,
            target.last_updated = to_timestamp(source.timestamp)
            WHEN NOT MATCHED AND source.operation = 'NEW' THEN
            INSERT (
                target.customer_id,
                target.name,
                target.email,
                target.address,
                target.city,
                target.state,
                target.zip_code,
                target.last_updated
            ) VALUES (
                source.customer_id,
                source.name,
                source.email,
                source.address,
                source.city,
                source.state,
                source.zip_code,
                to_timestamp(source.timestamp)
            )
        ''')
finally:
    # Drop the staging table even when a MERGE fails, so a failed run leaves nothing behind.
    spark.sql(f'''DROP TABLE IF EXISTS {customer_merge_staging_table}''')

# Orders

In [0]:
# Orders silver: keep the order columns from bronze and cast order_timestamp to a timestamp
df_orders_silver_pyspark = spark.read.table(f"{catalog}.{schema}.orders_bronze_pyspark").select(
    F.col("order_id"),
    F.col("customer_id"),
    F.col("order_timestamp").cast("timestamp").alias("order_timestamp"),
    F.col("notifications"),
)

# Replace the contents of the silver table with the projected rows
(
    df_orders_silver_pyspark.write
    .mode("overwrite")
    .saveAsTable(f"{catalog}.{schema}.orders_silver_pyspark")
)

# Status

In [0]:
# Status silver: keep the status columns from bronze and cast status_timestamp to a timestamp
df_status_silver_pyspark = spark.read.table(f"{catalog}.{schema}.status_bronze_pyspark").select(
    F.col("order_id"),
    F.col("order_status"),
    F.col("status_timestamp").cast("timestamp").alias("status_timestamp"),
)

# Replace the contents of the silver table with the projected rows
(
    df_status_silver_pyspark.write
    .mode("overwrite")
    .saveAsTable(f"{catalog}.{schema}.status_silver_pyspark")
)