In [0]:
# Day 1: Platform Setup & First Steps
# Goals:
# 1) Validate Spark is available
# 2) Load Day 0 dataset (Oct 2019)
# 3) Run basic PySpark DataFrame operations

In [0]:
# Sanity check: display the version string of the active Spark session.
# NOTE(review): assumes `spark` is pre-defined by the notebook runtime — it is
# not created anywhere in this notebook.
spark.version

In [0]:
# Read the October 2019 events CSV from the Day 0 Volume.
data_path_oct = "/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv"

# The first row holds the column names; column types are inferred from the data.
events = spark.read.options(header=True, inferSchema=True).csv(data_path_oct)

# Counting forces a full scan of the file, so do it once and reuse the number.
n_events = events.count()
print(f"Loaded events: {n_events:,}")
events.printSchema()

In [0]:
# Peek at the first five rows without truncating long cell values.
events.show(n=5, truncate=False)

In [0]:
# Project three columns of interest and show the first ten rows in full width.
events.select(["event_type", "brand", "price"]).show(n=10, truncate=False)

In [0]:
# Summary statistics (count, mean, stddev, min, max) for the price column.
events.describe("price").show()

In [0]:
from pyspark.sql.functions import col

# Top 10 brands by number of events, most frequent first.
events.groupBy("brand").count().sort(col("count").desc()).limit(10).show(truncate=False)


In [0]:
# Practice Block 1: Window functions
# Rank brands within each event_type by number of events and keep the top 5.
from pyspark.sql.window import Window
from pyspark.sql.functions import col, count, row_number  # `count` is kept for later cells

# Order by descending count, then by brand. Without the secondary key,
# row_number() assigns arbitrary ranks to brands with equal counts, so the
# top-5 cut could differ from run to run.
w = Window.partitionBy("event_type").orderBy(col("count").desc(), col("brand"))

brand_by_event = (
    events
    .groupBy("event_type", "brand")  # rows with a null brand form their own group
    .count()
    .withColumn("rn", row_number().over(w))
    .filter(col("rn") <= 5)
)

# Sort only for display so each event_type's ranking reads top to bottom.
brand_by_event.orderBy("event_type", "rn").show(truncate=False)

In [0]:
# Practice Block 2: Deduplication
# Compare the total row count with the number of distinct
# (event_time, user_id, product_id) combinations.
# `count` is imported here as well so this cell does not depend on an earlier
# cell having been run first.
from pyspark.sql.functions import count, countDistinct

display(
    events.agg(
        count("*").alias("total_rows"),
        # NOTE: a multi-column distinct count skips rows where any of the listed
        # columns is null, so it can be lower than a dropDuplicates() row count
        # on the same keys if nulls are present.
        countDistinct("event_time", "user_id", "product_id").alias("distinct_events")
    )
)

In [0]:
# Keep one row per (event_time, user_id, product_id) combination.
deduped = events.drop_duplicates(["event_time", "user_id", "product_id"])

n_deduped = deduped.count()
print(f"After deduplication: {n_deduped:,}")

In [0]:
# Practice Block 3: User behavior (business-relevant)
# Events per user: the 10 users with the most events of any type.
events_per_user = events.groupBy("user_id").count()
events_per_user.sort(col("count").desc()).show(n=10)


In [0]:
# Practice Block 3: User behavior (business-relevant)
# Purchases per user: the 10 users with the most "purchase" events.
(
    events
    .filter(col("event_type") == "purchase")  # keep purchase events only
    .groupBy("user_id")
    .count()
    .orderBy(col("count").desc())  # most purchases first
    .show(10)
)

In [0]:
# Total number of rows in the raw `events` DataFrame, shown as the cell output.
events.count()