In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col, to_date, coalesce, trim

# Silver

## Reading from bronze

In [0]:
# Load the raw gross-price table from the bronze layer.
# Reading the table directly returns all of its columns, like `SELECT *`,
# and avoids the placeholder-less f-string and the trailing ';' in the SQL text.
df_bronze = spark.table("main_fmcg.bronze.gross_price")
df_bronze.show(10)

## Standardizing the date format of month

In [0]:
from pyspark.sql import functions as F

# Date patterns attempted for the "month" column, in priority order.
month_formats = [
    "yyyy-MM-dd",
    "yyyy/MM/dd",
    "dd/MM/yyyy",
    "dd-MM-yyyy",
    "yyyy-MM",
    "MM/yyyy",
    "MMM-yyyy",
    "yyyyMM",
]

# One parse attempt per pattern; coalesce keeps the first non-NULL result,
# so a value that matches none of the patterns ends up as NULL.
parsed_month = F.coalesce(
    *(F.try_to_date("month", fmt) for fmt in month_formats)
)

# Replace the original "month" column with its parsed date value.
df_bronze = df_bronze.withColumn("month", parsed_month)

df_bronze.show(10)



In [0]:
# Find rows whose month did not parse (month is NULL after standardization).
# The "month" column now holds the parsed date, so selecting only that column
# would print nothing but NULLs; show the full rows instead so the affected
# records can be identified from their other columns.
df_bronze.filter(F.col("month").isNull()).show(truncate=False)


## Standardizing gross_price (negative values; bad values such as unknown, not_available)

In [0]:


# Clean the gross_price column:
#   * values that look like a plain, optionally negative, decimal number are
#     cast to a numeric type; anything else (e.g. "unknown") becomes 0
#   * negative values are turned positive with abs()
# The cast targets double rather than int: the pattern below accepts a
# fractional part, and an int cast would discard it (or may raise when ANSI
# mode is enabled).
# NOTE(review): this changes the resulting column type from int to double;
# confirm downstream consumers accept that.
is_numeric = F.col("gross_price").rlike(r'^-?\d+(\.\d+)?$')

df_bronze = df_bronze.withColumn(
    "gross_price",
    F.abs(
        F.when(
            is_numeric,
            F.col("gross_price").cast("double")
        ).otherwise(F.lit(0.0))
    )
)

df_bronze.show(10)

In [0]:
# List the rows whose gross_price equals 0 after cleaning.
# NOTE(review): this also matches rows whose source price was genuinely 0,
# not only the bad values that were replaced with 0 - confirm if that matters.
zero_price_rows = df_bronze.where(F.col("gross_price") == 0)
zero_price_rows.show(truncate=False)


## Saving to silver

In [0]:
# Write the cleaned frame to the silver table in Delta format, using
# overwrite mode with schema overwrite and the change-data-feed option set.
silver_writer = (
    df_bronze.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .option("overwriteSchema", "true")
    .mode("overwrite")
)
silver_writer.saveAsTable("main_fmcg.silver.gross_price")