In [0]:
# Load the silver-layer customers/orders table for each state (CA, NY, VA).
df_ca, df_ny, df_va = (
    spark.read.table(silver_table)
    for silver_table in (
        "ecommerce.retail_customer.customers_orders_ca_silver",
        "ecommerce.retail_customer.customers_orders_ny_silver",
        "ecommerce.retail_customer.customers_orders_va_silver",
    )
)

In [0]:
from pyspark.sql import functions as F

## -- COMMON CLEANING AND STANDARDIZATION --
def clean_common(df):
    """Apply the cleaning steps shared by every state's DataFrame.

    Steps:
      1. Strip leading/trailing whitespace from every column name.
      2. Trim ``customer_name`` and convert it to initial-capital form.

    Args:
        df: Input Spark DataFrame. After column names are stripped it must
            contain a ``customer_name`` column.

    Returns:
        A new DataFrame with cleaned column names and a standardized
        ``customer_name`` column.
    """
    # toDF renames all columns positionally, so the order is preserved.
    df = df.toDF(*[c.strip() for c in df.columns])
    df = df.withColumn("customer_name", F.initcap(F.trim(F.col("customer_name"))))
    return df

In [0]:
# -- CALIFORNIA TRANSFORMATION --
def prep_ca(df_ca):
    """Prepare the California customers/orders DataFrame.

    Steps:
      1. Apply ``clean_common``.
      2. Replace null ``city`` values with ``"Unknown"``.
      3. Add ``order_month``: ``order_date`` formatted as ``yyyy-MM``.
      4. Add boolean ``is_southern_ca``: true when ``city`` matches, ignoring
         case, one of a fixed list of Southern California cities.

    Args:
        df_ca: Spark DataFrame with at least ``customer_name``, ``city`` and
            ``order_date`` columns.

    Returns:
        The transformed DataFrame.
    """
    df = clean_common(df_ca)
    # Fill missing cities with 'Unknown'
    df = df.withColumn(
        "city",
        F.when(F.col("city").isNull(), F.lit("Unknown")).otherwise(F.col("city")),
    )
    # Extract order month
    df = df.withColumn("order_month", F.date_format("order_date", "yyyy-MM"))
    # Custom: add a marker column for Southern CA cities.
    so_cal_cities = ["Los Angeles", "San Diego", "Bakersfield", "Anaheim", "Long Beach"]
    # Compare in lower case on both sides so the match is case-insensitive.
    df = df.withColumn(
        "is_southern_ca",
        F.lower(F.col("city")).isin([c.lower() for c in so_cal_cities]),
    )

    return df


In [0]:
#--NEW YORK TRANSFORMATION--
def prep_ny(df_ny):
    """Prepare the New York customers/orders DataFrame.

    Applies ``clean_common``, replaces null ``city`` values with
    ``"Unspecified"``, adds a region label column (``my_region``) derived
    from ``city``, and adds ``order_week`` (week of year of ``order_date``).

    Args:
        df_ny: Spark DataFrame with at least ``customer_name``, ``city`` and
            ``order_date`` columns.

    Returns:
        The transformed DataFrame.
    """
    upstate = ["Buffalo", "Rochester", "Albany", "Syracuse"]
    downstate = ["Yonkers", "White Plains"]

    city = F.col("city")
    # Keep the city when present; otherwise fall back to "Unspecified".
    city_filled = F.when(city.isNotNull(), city).otherwise(F.lit("Unspecified"))
    # Exact (case-sensitive) membership decides the region; anything else is "Other".
    region = (
        F.when(city.isin(upstate), F.lit("Upstate"))
        .when(city.isin(downstate), F.lit("Downstate"))
        .otherwise(F.lit("Other"))
    )

    # NOTE(review): the column is named "my_region" (possibly meant "ny_region");
    # left unchanged because it is part of the output schema.
    return (
        clean_common(df_ny)
        .withColumn("city", city_filled)
        .withColumn("my_region", region)
        .withColumn("order_week", F.weekofyear(F.col("order_date")))
    )

In [0]:
#-- VIRGINIA TRANSFORMATION -- 
def prep_va(df_va):
    """Prepare the Virginia customers/orders DataFrame.

    Applies ``clean_common``, replaces null ``city`` values with ``"Other"``,
    labels each order as ``"Large"`` or ``"Regular"`` from ``is_large_order``,
    and adds ``order_day_of_week`` (``order_date`` formatted with pattern ``E``).

    Args:
        df_va: Spark DataFrame with at least ``customer_name``, ``city``,
            ``is_large_order`` and ``order_date`` columns.

    Returns:
        The transformed DataFrame.
    """
    city = F.col("city")
    # Keep the city when present; otherwise fall back to "Other".
    city_filled = F.when(city.isNotNull(), city).otherwise(F.lit("Other"))
    # Rows where the comparison is false or null fall through to "Regular".
    size_label = F.when(
        F.col("is_large_order") == "true", F.lit("Large")
    ).otherwise(F.lit("Regular"))
    weekday = F.date_format("order_date", "E")

    return (
        clean_common(df_va)
        .withColumn("city", city_filled)
        .withColumn("order_size_label", size_label)
        .withColumn("order_day_of_week", weekday)
    )

In [0]:
# -- APPLY TRANSFORMATION --
# Run each state's preparation function on its silver DataFrame.
df_ca_clean, df_ny_clean, df_va_clean = prep_ca(df_ca), prep_ny(df_ny), prep_va(df_va)

# -- Write To tables --
# Each prepared DataFrame overwrites its corresponding gold table.
for prepared_df, gold_table in (
    (df_ca_clean, "ecommerce.retail_customer.customers_orders_ca_gold"),
    (df_ny_clean, "ecommerce.retail_customer.customers_orders_ny_gold"),
    (df_va_clean, "ecommerce.retail_customer.customers_orders_va_gold"),
):
    prepared_df.write.mode("overwrite").saveAsTable(gold_table)


In [0]:
%sql
-- List the tables in the system.lakeflow schema.
SHOW TABLES IN system.lakeflow

In [0]:
%sql
-- Return every column and row of system.lakeflow.jobs.
-- NOTE(review): no LIMIT or filter is applied; confirm this ad-hoc query belongs in this notebook.
SELECT * FROM system.lakeflow.jobs
