In [67]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [68]:
spark = SparkSession.builder.appName("PySparkPrac").getOrCreate()

In [69]:
users = spark.read.option("header", True).csv("user_sessions.csv")
users = users.withColumn("event_time", to_timestamp("event_time")).withColumn("date", date_format("event_time", "yyyy-MM-dd"))

In [70]:
users.show()

+-------+-------------------+----------+-------+---------+-------+----------+----------+
|user_id|         event_time|event_type| device| location|browser|session_id|      date|
+-------+-------------------+----------+-------+---------+-------+----------+----------+
|    U21|2025-08-12 16:17:06|  purchase|desktop|Australia| Chrome|     S9440|2025-08-12|
|     U2|2025-08-12 04:31:21|  purchase| tablet|Australia|Firefox|     S9876|2025-08-12|
|    U18|2025-08-17 14:03:12|  purchase|desktop|    India|Firefox|     S7075|2025-08-17|
|    U45|2025-08-20 01:56:42|      view|desktop|Australia| Safari|     S7144|2025-08-20|
|    U21|2025-08-25 19:31:51|     login| mobile|   Canada| Safari|     S3814|2025-08-25|
|     U3|2025-08-13 12:16:10|     click|desktop|Australia|Firefox|     S6097|2025-08-13|
|     U5|2025-08-02 05:58:30|    logout| tablet|    India|  Opera|     S2232|2025-08-02|
|    U19|2025-08-21 04:21:33|  purchase| tablet|      USA| Safari|     S7053|2025-08-21|
|     U5|2025-08-21 1

In [72]:
main_metrics = users.groupBy("user_id", "date").agg(sum(when(col("event_type") == "login", 1).otherwise(0)).alias("num_logins"))
main_metrics = main_metrics.filter(main_metrics["num_logins"] == 1)
main_metrics.show()

+-------+----------+----------+
|user_id|      date|num_logins|
+-------+----------+----------+
|    U28|2025-08-08|         1|
|    U14|2025-08-23|         1|
|    U21|2025-08-28|         1|
|    U32|2025-08-19|         1|
|     U7|2025-08-06|         1|
|    U23|2025-08-17|         1|
|     U8|2025-08-26|         1|
|    U10|2025-08-11|         1|
|     U8|2025-08-10|         1|
|    U32|2025-08-20|         1|
|    U21|2025-08-20|         1|
|    U35|2025-08-21|         1|
|    U40|2025-08-15|         1|
|    U21|2025-08-10|         1|
|     U9|2025-08-22|         1|
|     U5|2025-08-05|         1|
|     U3|2025-08-21|         1|
|    U40|2025-08-28|         1|
|     U1|2025-08-26|         1|
|    U31|2025-08-28|         1|
+-------+----------+----------+
only showing top 20 rows



In [73]:
device_location_browser = users.groupBy("user_id", "date", "device", "location", "browser").agg(count("*").alias("event_count"))

In [74]:
device_location_browser.show(5)

+-------+----------+-------+--------+-------+-----------+
|user_id|      date| device|location|browser|event_count|
+-------+----------+-------+--------+-------+-----------+
|    U32|2025-08-08|desktop|      UK|  Opera|          1|
|    U40|2025-08-16| mobile|   India| Chrome|          1|
|     U5|2025-08-06| tablet|  Canada| Safari|          1|
|    U45|2025-08-21| mobile|   India|   Edge|          1|
|    U48|2025-08-12| mobile|   India|Firefox|          1|
+-------+----------+-------+--------+-------+-----------+
only showing top 5 rows



In [75]:
w = Window.partitionBy("user_id", "date").orderBy(col("event_count").desc())
ranked = device_location_browser.withColumn("rn", row_number().over(w))

In [76]:
most_device = ranked.filter(col("rn") == 1).select("user_id", "date", col("device").alias("most_device"))
top_location = ranked.filter(col("rn") == 1).select("user_id", "date", col("location").alias("top_location"))
unique_browsers = users.groupBy("user_id", "date").agg(countDistinct("browser").alias("unique_browsers"))

In [77]:
main_output = main_metrics.join(most_device, ["user_id", "date"]).join(top_location, ["user_id", "date"]).join(unique_browsers, ["user_id", "date"])
main_output.show()

+-------+----------+----------+-----------+------------+---------------+
|user_id|      date|num_logins|most_device|top_location|unique_browsers|
+-------+----------+----------+-----------+------------+---------------+
|     U1|2025-08-02|         1|     mobile|   Australia|              2|
|     U1|2025-08-08|         1|    desktop|   Australia|              1|
|     U1|2025-08-22|         1|     tablet|   Australia|              2|
|     U1|2025-08-25|         1|    desktop|          UK|              1|
|     U1|2025-08-26|         1|     tablet|   Australia|              1|
|    U10|2025-08-07|         1|    desktop|          UK|              2|
|    U10|2025-08-10|         1|     mobile|         USA|              2|
|    U10|2025-08-11|         1|     mobile|      Canada|              1|
|    U10|2025-08-19|         1|     tablet|         USA|              1|
|    U11|2025-08-28|         1|     tablet|         USA|              4|
|    U12|2025-08-16|         1|    desktop|   Austr

In [78]:
event_counts = users.groupBy("user_id", "date").pivot("event_type", ["login", "logout", "click", "view", "purchase"]).agg(count("*")).na.fill(0)

event_counts = event_counts \
    .withColumnRenamed("login", "login_count") \
    .withColumnRenamed("logout", "logout_count") \
    .withColumnRenamed("click", "click_count") \
    .withColumnRenamed("view", "view_count") \
    .withColumnRenamed("purchase", "purchase_count")

In [79]:
event_counts.orderBy("user_id").show()

+-------+----------+-----------+------------+-----------+----------+--------------+
|user_id|      date|login_count|logout_count|click_count|view_count|purchase_count|
+-------+----------+-----------+------------+-----------+----------+--------------+
|     U1|2025-08-24|          0|           0|          0|         0|             1|
|     U1|2025-08-26|          1|           0|          0|         0|             0|
|     U1|2025-08-25|          1|           0|          0|         0|             0|
|     U1|2025-08-07|          0|           0|          0|         0|             1|
|     U1|2025-08-22|          1|           0|          1|         0|             0|
|     U1|2025-08-12|          0|           0|          2|         0|             0|
|     U1|2025-08-05|          0|           0|          0|         1|             0|
|     U1|2025-08-01|          0|           0|          1|         0|             1|
|     U1|2025-08-15|          0|           1|          0|         0|        