# Set Up

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Target catalog and schema come from notebook widgets, which are inherited
# from the pipeline or job that runs this notebook.
catalog, schema = (
    dbutils.widgets.get(widget_name) for widget_name in ("catalog", "schema")
)

# Customer

In [0]:
# Bronze ingest for customers: read the raw JSON files from the volume, add
# load metadata columns, and overwrite the bronze table.
#
# No "inferSchema" option is set: that is a CSV reader option, and the JSON
# reader already infers the schema whenever no explicit schema is supplied.
df_customer_bronze_pyspark = (
    spark.read
    .format("json")
    .load(f"/Volumes/{catalog}/{schema}/customer")
    .select(
        "*",
        # Timestamp evaluated when this load runs.
        F.current_timestamp().alias("processing_time"),
        # Source file name from the file metadata column; the output column
        # is named file_name.
        "_metadata.file_name",
    )
)

# mode("overwrite") replaces any existing table contents on each run.
(
    df_customer_bronze_pyspark.write
    .mode("overwrite")
    .saveAsTable(f"{catalog}.{schema}.customer_bronze_pyspark")
)

# Orders

In [0]:
# Bronze ingest for orders: read the raw JSON files from the volume, add
# load metadata columns, and overwrite the bronze table.
#
# No "inferSchema" option is set: that is a CSV reader option, and the JSON
# reader already infers the schema whenever no explicit schema is supplied.
df_orders_bronze_pyspark = (
    spark.read
    .format("json")
    .load(f"/Volumes/{catalog}/{schema}/orders")
    .select(
        "*",
        # Timestamp evaluated when this load runs.
        F.current_timestamp().alias("processing_time"),
        # Source file name from the file metadata column; the output column
        # is named file_name.
        "_metadata.file_name",
    )
)

# mode("overwrite") replaces any existing table contents on each run.
(
    df_orders_bronze_pyspark.write
    .mode("overwrite")
    .saveAsTable(f"{catalog}.{schema}.orders_bronze_pyspark")
)

# Status

In [0]:
# Bronze ingest for order statuses: read the raw JSON files from the volume,
# add load metadata columns, and overwrite the bronze table.
#
# No "inferSchema" option is set: that is a CSV reader option, and the JSON
# reader already infers the schema whenever no explicit schema is supplied.
df_status_bronze_pyspark = (
    spark.read
    .format("json")
    .load(f"/Volumes/{catalog}/{schema}/status")
    .select(
        "*",
        # Timestamp evaluated when this load runs.
        F.current_timestamp().alias("processing_time"),
        # Source file name from the file metadata column; the output column
        # is named file_name.
        "_metadata.file_name",
    )
)

# mode("overwrite") replaces any existing table contents on each run.
(
    df_status_bronze_pyspark.write
    .mode("overwrite")
    .saveAsTable(f"{catalog}.{schema}.status_bronze_pyspark")
)

# Status Rank
Creates a reference table of order status rankings, used when creating or updating the `gold_current_orders_pyspark` table. For a given order, the highest-ranked status is treated as the most current.

In [0]:
# (order_status, rank) pairs for the status ranking reference table.
# 'reported shipping error' is listed last, but its rank (4) falls between
# 'on the way' (3) and 'delivered' (5).
data = [
    ("placed", 1),
    ("preparing", 2),
    ("on the way", 3),
    ("delivered", 5),
    ("return requested", 6),
    ("return picked up", 7),
    ("return processed", 8),
    ("return canceled", 9),
    ("canceled", 10),
    ("reported shipping error", 4),
]

columns = ["order_status", "rank"]

# Column types are inferred from the Python values in `data`.
df_status_ranking = spark.createDataFrame(data, schema=columns)

# mode("overwrite") replaces any existing table contents on each run.
(
    df_status_ranking.write
    .mode("overwrite")
    .saveAsTable(f"{catalog}.{schema}.order_status_ranking")
)