In [0]:
import pandas as pd

In [0]:
### --- Data Cleaning ---

# Read every column of the customers_sales_silver table, then rename each
# column to its whitespace-stripped form. The comprehension variable is
# named `name` rather than `col` so it cannot be mistaken for pyspark's
# `col` function, which later cells import.
raw_sales = spark.sql("select * from ecommerce.retail_customer.customers_sales_silver")
cleaned_names = [name.strip() for name in raw_sales.columns]
df = raw_sales.toDF(*cleaned_names)

In [0]:
from pyspark.sql.functions import trim, initcap, col, to_date

# Standardize customer names: trim surrounding whitespace, then title-case.
df = df.withColumn("customer_name", initcap(trim(col("customer_name"))))

# Convert data types for IDs, numeric columns and dates.
# order_date is converted explicitly with to_date() so that the dropna below
# also removes rows whose date could not be parsed, instead of letting them
# flow into the year/month derivations.
# NOTE(review): to_date() without a format follows Spark's default string->date
# cast rules (ISO-style yyyy-MM-dd). If the source uses another layout, pass an
# explicit format, e.g. to_date(col("order_date"), "MM/dd/yyyy") -- confirm
# against the silver table.
df = (df.withColumn("customer_id", col("customer_id").cast("long"))
       .withColumn("units_purchased", col("units_purchased").cast("double"))
       .withColumn("total_price", col("total_price").cast("double"))
       .withColumn("order_date", to_date(col("order_date"))))

# Drop rows where a required field is missing or became NULL during conversion.
df = df.dropna(subset=["customer_id", "units_purchased", "total_price", "order_date"])

In [0]:
from pyspark.sql.functions import year, month, date_format, round as spark_round, when

# Derive calendar fields from order_date.
# order_year comes from year(); order_month comes from date_format(..., "MM"),
# i.e. a zero-padded two-character string such as "03", not an integer.
df = (df.withColumn("order_year", year(col("order_date")))
       .withColumn("order_month", date_format(col("order_date"), "MM")))

# Average price per unit, rounded to 2 decimal places.
# A zero units_purchased is turned into NULL before dividing, so the result is
# NULL for those rows regardless of the cluster's ANSI setting (with ANSI mode
# enabled, a plain division by zero raises an error and aborts the job).
nonzero_units = when(col("units_purchased") != 0, col("units_purchased"))
df = df.withColumn(
    "avg_price_per_unit",
    spark_round(col("total_price") / nonzero_units, 2),
)


In [0]:
# Show the transformed DataFrame in the notebook output for a visual check.
# NOTE(review): `display` is not imported in the visible cells; presumably the
# notebook runtime (e.g. Databricks) provides it as a built-in -- confirm.
display(df)

In [0]:
# Persist the cleaned and enriched DataFrame as a managed table.
# mode("overwrite") replaces any existing contents, and overwriteSchema=true
# allows the stored schema to be replaced along with the data.
gold_table = "ecommerce.retail_customer.customers_sales_gold"
(
    df.write
      .mode("overwrite")
      .option("overwriteSchema", "true")
      .saveAsTable(gold_table)
)