In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Load the events table that every later cell in this notebook reads from
# (columns used below: price, event_type, event_time, user_id, product_id, brand).
# NOTE(review): `spark` is not created in the visible cells — presumably the session
# object injected by the notebook runtime; confirm before running elsewhere.
events = spark.table("ecommerce.silver.silver_events")

In [0]:
# Summary statistics (count, mean, stddev, min, max) of the price column.
price_summary = events.select("price").describe()
price_summary.show()

+-------+------------------+
|summary|             price|
+-------+------------------+
|  count|         109819992|
|   mean|291.64427142266754|
| stddev|  356.728931686955|
|    min|               0.0|
|    max|           2574.07|
+-------+------------------+



In [0]:
from pyspark.sql.functions import *

# Event volume plus price level and spread for each event type.
# Spark functions are reached through the `F` alias imported in the first cell, so this
# cell does not depend on the wildcard import above, which rebinds Python built-ins
# such as `round` and `sum` to their Spark column counterparts.
(
    events
    .groupBy("event_type")
    .agg(
        F.count("*").alias("event_count"),
        F.round(F.avg("price"), 2).alias("avg_price"),
        F.round(F.stddev("price"), 2).alias("stddev_price"),
    )
    .show()
)

+----------+-----------+---------+------------+
|event_type|event_count|avg_price|stddev_price|
+----------+-----------+---------+------------+
|  purchase|    1659703|   304.35|      344.79|
|      cart|    3828449|    300.8|      340.34|
|      view|  104331840|   291.11|      357.49|
+----------+-----------+---------+------------+



In [0]:
# Add a boolean weekend flag to every event.
# Spark's dayofweek() numbers days from 1 (Sunday) to 7 (Saturday), so 1 and 7 are the weekend.
events2 = events.withColumn("is_weekend", F.dayofweek(F.col("event_time")).isin(1, 7))

In [0]:
# Compare purchase behaviour on weekends vs. weekdays.
# Spark functions are reached through the `F` alias imported in the first cell, so this
# cell does not depend on the wildcard import, which rebinds Python built-ins such as
# `round` and `sum` to their Spark column counterparts.
(
    events2
    .filter(F.col("event_type") == "purchase")
    .groupBy("is_weekend")
    .agg(
        F.count("*").alias("total_purchases"),
        F.round(F.avg("price"), 2).alias("avg_price"),
        # Summed price is divided by one million before rounding, so this column
        # is expressed in millions.
        F.round(F.sum("price") / 1_000_000, 2).alias("total_revenue"),
        F.countDistinct("user_id").alias("unique_customers"),
        # Distinct users are counted within each is_weekend group.
        F.round(
            F.sum("price") / F.countDistinct("user_id"), 2
        ).alias("avg_revenue_per_customer"),
    )
    .show()
)

+----------+---------------+---------+-------------+----------------+------------------------+
|is_weekend|total_purchases|avg_price|total_revenue|unique_customers|avg_revenue_per_customer|
+----------+---------------+---------+-------------+----------------+------------------------+
|      true|         613081|   305.82|        187.5|          340234|                  551.08|
|     false|        1046622|   303.48|       317.63|          478221|                  664.19|
+----------+---------------+---------+-------------+----------------+------------------------+



In [0]:
# One row per purchasing user: number of purchase events, summed price and mean price.
user_metrics = (
    events
    .where(F.col("event_type") == "purchase")
    .groupBy("user_id")
    .agg(
        F.count("*").alias("purchase_count"),
        F.sum("price").alias("total_spent"),
        F.avg("price").alias("avg_price"),
    )
)

In [0]:
# Pearson correlation between a user's purchase count and their total spend.
user_metrics.stat.corr(col1="purchase_count", col2="total_spent", method="pearson")

0.7548923730410833

In [0]:
# Pearson correlation between a user's mean purchase price and their purchase count.
user_metrics.stat.corr(col1="avg_price", col2="purchase_count", method="pearson")

0.048153625770798125

In [0]:
# Label each event by whether its brand column is populated.
# isNull() is never itself null, so testing for null first yields the same labels
# as testing for non-null.
brand_ab = events.withColumn(
    "has_brand",
    F.when(F.col("brand").isNull(), "Non-Branded").otherwise("Branded"),
)

In [0]:
# Share of events that are purchases, for branded vs. non-branded products.
# The denominator is every event in the group (all event types), not sessions or users.
(
    brand_ab
    .groupBy("has_brand")
    .agg(
        F.count("*").alias("events"),
        # 1 for a purchase row, 0 for anything else; the sum is the purchase count.
        F.sum(
            F.when(F.col("event_type") == "purchase", 1).otherwise(0)
        ).alias("purchases"),
    )
    .select(
        "has_brand",
        "events",
        "purchases",
        F.round(F.col("purchases") / F.col("events"), 4).alias("conversion_rate"),
    )
    .show()
)

+-----------+--------+---------+---------------+
|  has_brand|  events|purchases|conversion_rate|
+-----------+--------+---------+---------------+
|    Branded|94498691|  1528221|         0.0162|
|Non-Branded|15321301|   131482|         0.0086|
+-----------+--------+---------+---------------+



In [0]:
# Start the feature frame from the raw events and derive calendar parts of event_time.
features = (
    events
    .withColumn("hour", F.hour(F.col("event_time")))
    .withColumn("day_of_week", F.dayofweek(F.col("event_time")))
)


In [0]:
# Log-transform price as ln(1 + price); the +1 keeps a price of 0 mapped to 0.
# log1p is the dedicated function for this and stays accurate for very small prices,
# where adding 1 before taking the log would lose precision.
features = features.withColumn(
    "price_log",
    F.log1p(F.col("price"))
)


In [0]:
# Ordered per-user window. It is no longer used for the first-event lookup below but is
# kept under its original name in case other cells reference it.
window_spec = Window.partitionBy("user_id").orderBy("event_time")

# Partition-only window: every row sees all events of the same user.
user_window = Window.partitionBy("user_id")

# Earliest event timestamp per user. min() over the whole partition does not depend on
# row ordering or on the window's default frame, and it skips nulls; first() over an
# ascending order would instead return a null event_time if the user had one, because
# Spark sorts nulls first when ordering ascending.
features = features.withColumn(
    "first_event_time",
    F.min("event_time").over(user_window)
)

# Seconds elapsed between this event and the user's first event
# (unix_timestamp yields whole seconds).
features = features.withColumn(
    "time_since_first_event",
    F.unix_timestamp(F.col("event_time")) - F.unix_timestamp(F.col("first_event_time"))
)

In [0]:
# Keep only the identifier, engineered-feature and label columns for modelling.
ml_ready = features.select(
    [
        "user_id",
        "product_id",
        "price_log",
        "hour",
        "day_of_week",
        "time_since_first_event",
        "event_type",
    ]
)

In [0]:
# Summary statistics for every selected column, rendered with the notebook's display().
ml_ready_summary = ml_ready.describe()
ml_ready_summary.display()

summary,user_id,product_id,price_log,hour,day_of_week,time_since_first_event,event_type
count,109819992.0,109819992.0,109819992.0,109819992.0,109819992.0,109819992.0,109819992
mean,536661633.6912224,11761620.820934432,5.039776944563229,11.260400865809569,4.157798554565548,1481572.828841847,
stddev,21450016.78581021,15438424.080771811,1.2188362959637715,5.314890347639722,2.084552063221148,1471904.8301876974,
min,10300217.0,1000365.0,0.0,0.0,1.0,0.0,cart
max,579969851.0,100028554.0,7.853631997194365,23.0,7.0,5265514.0,view
