In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import StructField, StructType, TimestampType, ArrayType, IntegerType, StringType

In [0]:
# Root DBFS directory for the inventory-analytics data; the cells below build
# their input and output locations as sub-paths of this prefix.
schema_path = "dbfs:/FileStore/inventory_analytics"

In [0]:
def _base_event_fields():
    """Build fresh field definitions for the four columns every event type carries.

    A new list of new ``StructField`` objects is returned on each call so that
    the schemas built from it do not share field instances. All fields are
    declared with ``nullable=False``.
    """
    return [
        StructField(field_name, field_type, False)
        for field_name, field_type in (
            ("timestamp", TimestampType()),
            ("user_id", IntegerType()),
            ("product_id", StringType()),
            ("pin_code", IntegerType()),
        )
    ]


# Click events: only the four common fields. The payload is parsed as an array
# of such structs, hence the ArrayType wrapper.
click_temp_schema = StructType(_base_event_fields())
click_schema = ArrayType(click_temp_schema)

# Purchase events: the common fields plus the purchased quantity and an event label.
purchase_temp_schema = StructType(
    _base_event_fields()
    + [
        StructField("item_count", IntegerType(), False),
        StructField("event", StringType(), False),
    ]
)
purchase_schema = ArrayType(purchase_temp_schema)

# Cart events: same shape as click events.
cart_temp_schema = StructType(_base_event_fields())
cart_schema = ArrayType(cart_temp_schema)

In [0]:

# Silver layer: product click events.
# The bronze `value` column is cast to a string, parsed as a JSON array of click
# structs (`click_schema`), exploded to one row per event, flattened into
# top-level columns, and rows lacking a timestamp or product_id are dropped.
#
# Fix: the bronze folder was misspelled "broze"; the cart and purchase cells
# read from "bronze".
click_df = spark.read.format("parquet").load(f"{schema_path}/bronze/product_click/")

cleaned_click_df = (
    click_df.selectExpr("cast(value as string) as click_event")
    # withColumn already names the result, so no extra alias is needed.
    .withColumn("click_event", F.from_json("click_event", click_schema))
    .withColumn("click_event", F.explode("click_event"))
    .selectExpr("click_event.*")
    .dropna(subset=["timestamp", "product_id"])
)

# The whole bronze folder is re-read on every run, so the silver output is a
# full refresh: overwrite instead of the default error-if-exists mode, which
# would make this cell fail whenever the notebook is re-run.
cleaned_click_df.write.format("delta").mode("overwrite").save(
    f"{schema_path}/silver/product_click/"
)

In [0]:

# Silver layer: product cart events.
# The bronze `value` column is cast to a string, parsed as a JSON array of cart
# structs (`cart_schema`), exploded to one row per event, flattened into
# top-level columns, and rows lacking a timestamp or product_id are dropped.
cart_df = spark.read.format("parquet").load(f"{schema_path}/bronze/product_cart/")

cleaned_cart_df = (
    cart_df.selectExpr("cast(value as string) as cart_event")
    # withColumn already names the result, so no extra alias is needed.
    .withColumn("cart_event", F.from_json("cart_event", cart_schema))
    .withColumn("cart_event", F.explode("cart_event"))
    .selectExpr("cart_event.*")
    .dropna(subset=["timestamp", "product_id"])
)

# Fix: the save mode is set with .mode("overwrite"). The previous
# .option("overwrite", True) did not change the save mode, so the write still
# used the default error-if-exists behaviour and failed on re-runs.
cleaned_cart_df.write.format("delta").mode("overwrite").save(
    f"{schema_path}/silver/product_cart/"
)
    

In [0]:

# Silver layer: product purchase events.
# The bronze `value` column is cast to a string, parsed as a JSON array of
# purchase structs (`purchase_schema`), exploded to one row per event, flattened
# into top-level columns, and rows lacking a timestamp or product_id are dropped.
purchase_df = spark.read.format("parquet").load(f"{schema_path}/bronze/product_purchase/")

cleaned_purchase_df = (
    purchase_df.selectExpr("cast(value as string) as purchase_event")
    # withColumn already names the result, so no extra alias is needed.
    .withColumn("purchase_event", F.from_json("purchase_event", purchase_schema))
    .withColumn("purchase_event", F.explode("purchase_event"))
    .selectExpr("purchase_event.*")
    .dropna(subset=["timestamp", "product_id"])
)

# The whole bronze folder is re-read on every run, so the silver output is a
# full refresh: overwrite instead of the default error-if-exists mode, which
# would make this cell fail whenever the notebook is re-run.
cleaned_purchase_df.write.format("delta").mode("overwrite").save(
    f"{schema_path}/silver/product_purchase/"
)
    