In [0]:
# Import required libraries
from pyspark.sql import functions as F
path = "/Volumes/workspace/advecom/advecom_data/2019-Oct.csv"
bronze_path = "/Volumes/workspace/advecom/advecom_data/delta/bronze/events"

# BRONZE layer
# Ingest raw data
raw = spark.read.csv(path, header=True, inferSchema=True)
raw.withColumn("ingestion_ts", F.current_timestamp()) \
   .write.format("delta").mode("overwrite").save(bronze_path)

raw = spark.read.format("delta").load(bronze_path)

display(raw.limit(5))

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,ingestion_ts
2019-10-10T10:15:26.000Z,view,12707254,2053013553559896355,,,39.64,550663441,b69f9a29-3ffa-44c4-8ad8-911082f2d575,2026-02-21T18:25:13.346Z
2019-10-10T10:15:26.000Z,view,4804056,2053013554658804075,electronics.audio.headphone,apple,161.88,537721718,656e85d5-5fb3-4675-93e7-4f45eb3486a7,2026-02-21T18:25:13.346Z
2019-10-10T10:15:26.000Z,view,10700055,2053013561277416167,,,35.52,542384688,96fe9aa3-a564-47e1-9d8f-dbfa86df8a68,2026-02-21T18:25:13.346Z
2019-10-10T10:15:26.000Z,view,44900106,2105319819401232597,,yamaha,719.35,519229425,4f6c5ffe-b07e-4b25-8a64-15bac3ab8591,2026-02-21T18:25:13.346Z
2019-10-10T10:15:26.000Z,view,1005157,2053013555631882655,electronics.smartphone,xiaomi,282.89,518662178,c95712e2-3d2d-4264-bb76-785f088aec51,2026-02-21T18:25:13.346Z


In [0]:
# Import required libraries
from pyspark.sql import functions as F

# Load saved bronze layer data
bronze_df = spark.read.format("delta").load(bronze_path)

# Clean data: silver layer
silver_df = bronze_df.filter(F.col("price") > 0) \
    .filter("user_id IS NOT NULL") \
    .dropDuplicates(["user_id"]) \
    .groupBy("user_id") \
    .agg(F.count("*").alias("total_events"),
    F.sum(
    F.when(F.col("event_type")=="view",1)
    .otherwise(0)
    ).alias("total_views"),

    F.sum(
    F.when(F.col("event_type")=="cart",1)
    .otherwise(0)
    ).alias("total_cart"),

    F.sum(
    F.when(F.col("event_type")=="purchase",1)
    .otherwise(0)
    ).alias("total_purchases"),

    F.sum("price").alias("total_spent"))
    
# Save silver layer data   
silver_df.write.format("delta").mode("overwrite").save("/Volumes/workspace/advecom/advecom_data/delta/silver/events")
display(silver_df.limit(5))

user_id,total_events,total_views,total_cart,total_purchases,total_spent
515993713,1,1,0,0,111.7
526855580,1,1,0,0,154.42
513017380,1,1,0,0,241.71
513218277,1,1,0,0,98.46
513387588,1,1,0,0,118.66


In [0]:
# Load user features table
user_features = spark.read.format("delta").load("/Volumes/workspace/advecom/advecom_data/delta/silver/events")

# Feature validity checks
print("Total users:",user_features.count())
print("Distinct users:",user_features.select("user_id").distinct().count())
print("Number of users having null total spent:",user_features.filter("total_spent IS NULL").count())
print("Number of users having negative total spent:",user_features.filter("total_spent < 0").count())

Total users: 3021435
Distinct users: 3021435
Number of users having null total spent: 0
Number of users having negative total spent: 0


In [0]:
# Numerical summary of user features table
display(
user_features.describe()
)

summary,user_id,total_events,total_views,total_cart,total_purchases,total_spent
count,3021435.0,3021435.0,3021435.0,3021435.0,3021435.0,3021435.0
mean,540466832.8584484,1.0,0.9994307340717242,0.00031011754348513206,0.00025914838479067063,312.5374076784962
stddev,19468757.72137334,0.0,0.0238525062180191,0.0176074266490511,0.0160960030023947,381.7059849298083
min,33869381.0,1.0,0.0,0.0,0.0,0.77
max,566280860.0,1.0,1.0,1.0,1.0,2574.07
