In [0]:
# Load data
events = spark.read.csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv", header=True, inferSchema=True)

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Top 5 products by revenue
revenue = (
    events.filter(F.col("event_type") == "purchase")
    .groupBy("product_id")
    .agg(F.sum("price").alias("revenue"))
    .orderBy(F.desc("revenue"))
    .limit(5)
)
display(revenue)

# Running total per user
window = Window.partitionBy("user_id").orderBy("event_time")
events_with_cumulative = events.withColumn(
    "cumulative_events", F.count("*").over(window)
)
display(events_with_cumulative)

# Conversion rate by category (manual pivot)
category_counts = (
    events.groupBy("category_code", "event_type")
    .count()
)

category_purchase = category_counts.filter(F.col("event_type") == "purchase").select(
    "category_code", F.col("count").alias("purchase_count")
)
category_view = category_counts.filter(F.col("event_type") == "view").select(
    "category_code", F.col("count").alias("view_count")
)

conversion = (
    category_view.join(category_purchase, "category_code", "outer")
    .withColumn(
        "conversion_rate",
        (F.col("purchase_count") / F.col("view_count")) * 100
    )
)
display(conversion)

product_id,revenue
1005115,12406807.350000003
1005105,10239248.679999996
1004249,6730112.920000011
1005135,5567806.640000007
1004767,5430723.430000007


event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,cumulative_events
2019-10-09T10:30:19.000Z,view,17301541,2053013553853497655,,,162.17,205053188,e1eadbc6-aef5-4cff-bb1f-07d2b983a26e,1
2019-10-09T10:30:44.000Z,view,17301541,2053013553853497655,,,162.17,205053188,e1eadbc6-aef5-4cff-bb1f-07d2b983a26e,2
2019-10-07T06:23:01.000Z,view,16200119,2053013556344914381,kids.fmcg.diapers,moony,18.47,222907508,cb653adc-46a2-4d90-9e34-5bdfb2be30ce,1
2019-10-07T06:26:23.000Z,view,16200162,2053013556344914381,kids.fmcg.diapers,moony,18.47,222907508,cb653adc-46a2-4d90-9e34-5bdfb2be30ce,2
2019-10-08T14:29:09.000Z,view,6200883,2053013552293216471,appliances.environment.air_heater,elenberg,46.31,244673419,e2f0524c-bfc4-4c69-b93a-56f983027af3,1
2019-10-12T10:15:48.000Z,view,17300355,2053013553853497655,,creed,240.16,257849716,71e76013-465a-4644-b82f-ab7fc64c9e95,1
2019-10-22T22:05:40.000Z,view,3900896,2053013552326770905,appliances.environment.water_heater,klima,77.2,266203246,c83d6f3d-2973-411f-8180-e476e65bc54c,1
2019-10-24T01:14:36.000Z,view,3900896,2053013552326770905,appliances.environment.water_heater,klima,77.2,266203246,56944410-059f-4f08-939b-867b3e060741,2
2019-10-06T11:29:22.000Z,view,22700574,2053013556168753601,,,88.81,278272605,e4cd7037-61d8-461c-ba6f-4314f0fb9a6f,1
2019-10-06T11:30:58.000Z,view,22700129,2053013556168753601,,stels,66.93,278272605,0a20874c-c88c-4628-a6eb-886784b61d19,2


category_code,view_count,purchase_count,conversion_rate
stationery.cartrige,7380.0,134.0,1.815718157181572
electronics.video.tv,1055961.0,21565.0,2.042215574249428
accessories.wallet,43282.0,310.0,0.7162330761055404
appliances.kitchen.juicer,46240.0,554.0,1.1980968858131489
,13236458.0,,
construction.tools.welding,91255.0,1081.0,1.1845926250616403
appliances.environment.air_heater,153390.0,2483.0,1.6187495925418869
country_yard.furniture.hammok,1218.0,,
apparel.shoes,759646.0,4255.0,0.560129323395371
electronics.audio.microphone,28394.0,430.0,1.5144044516447135
