In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [2]:
spark = SparkSession.builder.appName("PySparkPrac").getOrCreate()

25/09/02 14:12:13 WARN Utils: Your hostname, neosoft-Latitude-5420 resolves to a loopback address: 127.0.1.1; using 10.0.61.246 instead (on interface wlp0s20f3)
25/09/02 14:12:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/02 14:12:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
df = spark.read.option("header", True).option("inferSchema", True).csv("online_learning_activity.csv")

In [8]:
df = df.withColumn("activity_time", to_timestamp("activity_time")).withColumn("date", date_format("activity_time", "yyyy-MM-dd"))

In [9]:
df.show()

+-----------+----------+---------+---------------+-------------------+-------+---------+----------------+----------+
|activity_id|student_id|course_id|  activity_type|      activity_time| device| location|duration_minutes|      date|
+-----------+----------+---------+---------------+-------------------+-------+---------+----------------+----------+
|    A274850|       S27|      C13|Complete_Course|2025-08-21 08:20:00| Tablet|   Canada|            NULL|2025-08-21|
|    A110285|       S25|       C8|     Forum_Post|2025-08-21 06:09:00| Tablet|   Canada|            NULL|2025-08-21|
|    A490154|       S72|      C15|    Watch_Video|2025-08-26 13:36:00|Desktop|      USA|            33.0|2025-08-26|
|    A618340|      S100|      C18|          Login|2025-08-25 13:38:00| Mobile|       UK|            NULL|2025-08-25|
|    A245486|      S126|      C10|          Login|2025-08-25 10:14:00|Desktop|    India|            NULL|2025-08-25|
|    A847712|        S7|      C17|          Login|2025-08-26 15:

In [19]:
df.filter(col("device").isNull()).count()

0

In [20]:
df.filter(col("location").isNull()).count()

0

In [26]:
df.filter(col("duration_minutes").isNull()).count()

0

In [25]:
df = df.fillna({"device":"Unknown", "location":"Unknown", "duration_minutes":0})

In [None]:
daily_metrics = df.groupBy("date").agg(
    count("student_id").alias("Total_active_student_per_day"),
    count("activity_id").alias("Total_activities_per_day"),
    avg("duration_minutes").alias("Avg_session_duration_per_day")
)
daily_metrics.show()

+----------+----------------------------+------------------------+----------------------------+
|      date|Total_active_student_per_day|Total_activities_per_day|Avg_session_duration_per_day|
+----------+----------------------------+------------------------+----------------------------+
|2025-08-20|                         400|                     400|                     10.5475|
|2025-08-25|                         400|                     400|                      13.825|
|2025-08-22|                         400|                     400|                     11.1625|
|2025-08-21|                         400|                     400|                       12.49|
|2025-08-23|                         400|                     400|                       12.08|
|2025-08-27|                           1|                       1|                         9.0|
|2025-08-24|                         400|                     400|                      11.785|
|2025-08-26|                         399

In [33]:
device = df.groupBy("device").agg(count("*").alias("distribution_amt"))
device.show()

+-------+----------------+
| device|distribution_amt|
+-------+----------------+
|Unknown|              65|
| Mobile|             896|
| Tablet|             958|
|Desktop|             881|
+-------+----------------+



In [34]:
loc = df.groupBy("location").agg(count("*").alias("distribution_amt"))
loc.show()

+---------+----------------+
| location|distribution_amt|
+---------+----------------+
|    India|             545|
|  Unknown|              63|
|      USA|             560|
|       UK|             568|
|   Canada|             527|
|Australia|             537|
+---------+----------------+



In [51]:
course_metrics = df.groupBy("course_id").agg(
    (sum(when(col("activity_type") == "Complete_Course",1).otherwise(0)) / count("*") * 100).alias("Course_completion_rate"),
    (sum("duration_minutes").alias("engagement_time")))
course_metrics.orderBy(col("engagement_time").desc()).show()

+---------+----------------------+---------------+
|course_id|Course_completion_rate|engagement_time|
+---------+----------------------+---------------+
|      C10|    21.341463414634145|         2222.0|
|      C15|     22.07792207792208|         2086.0|
|       C5|     20.37037037037037|         2058.0|
|      C20|                  20.0|         1965.0|
|      C17|    17.006802721088434|         1886.0|
|      C18|     24.82758620689655|         1867.0|
|       C3|     23.78048780487805|         1860.0|
|       C7|    10.948905109489052|         1819.0|
|      C11|     20.27972027972028|         1773.0|
|       C6|    18.939393939393938|         1768.0|
|      C14|     18.84057971014493|         1767.0|
|      C13|    19.148936170212767|         1518.0|
|       C1|    19.696969696969695|         1482.0|
|       C4|     20.66115702479339|         1427.0|
|      C16|                  20.0|         1405.0|
|       C8|     21.21212121212121|         1404.0|
|       C2|     18.309859154929

In [55]:
unique_std = df.groupBy("course_id","date").agg(countDistinct("student_id").alias("Unique_Students"))
unique_std.orderBy("date","course_id").show()

+---------+----------+---------------+
|course_id|      date|Unique_Students|
+---------+----------+---------------+
|       C1|2025-08-20|             18|
|      C10|2025-08-20|             17|
|      C11|2025-08-20|             23|
|      C12|2025-08-20|             17|
|      C13|2025-08-20|             13|
|      C14|2025-08-20|             22|
|      C15|2025-08-20|             26|
|      C16|2025-08-20|             16|
|      C17|2025-08-20|             22|
|      C18|2025-08-20|             31|
|      C19|2025-08-20|             17|
|       C2|2025-08-20|             13|
|      C20|2025-08-20|             17|
|       C3|2025-08-20|             24|
|       C4|2025-08-20|             19|
|       C5|2025-08-20|             24|
|       C6|2025-08-20|             16|
|       C7|2025-08-20|             12|
|       C8|2025-08-20|             15|
|       C9|2025-08-20|             14|
+---------+----------+---------------+
only showing top 20 rows



In [57]:
std_behavior_metrics = df.groupBy("student_id","date").agg(
    avg("duration_minutes").alias("avg_std_per_std_day")
)
std_behavior_metrics.orderBy("date","student_id").show()

+----------+----------+-------------------+
|student_id|      date|avg_std_per_std_day|
+----------+----------+-------------------+
|        S1|2025-08-20|                0.0|
|       S10|2025-08-20|                0.0|
|      S100|2025-08-20|                0.0|
|      S101|2025-08-20|               13.8|
|      S103|2025-08-20|                0.0|
|      S105|2025-08-20| 15.666666666666666|
|      S106|2025-08-20|               56.0|
|      S107|2025-08-20|                0.0|
|      S108|2025-08-20|                0.0|
|      S109|2025-08-20|                0.0|
|       S11|2025-08-20|               9.75|
|      S110|2025-08-20| 17.333333333333332|
|      S111|2025-08-20|                3.0|
|      S112|2025-08-20|  6.571428571428571|
|      S113|2025-08-20|                0.0|
|      S114|2025-08-20|               23.0|
|      S115|2025-08-20|               18.0|
|      S116|2025-08-20|                3.5|
|      S117|2025-08-20|                4.0|
|      S118|2025-08-20|         

In [61]:
quizzes = df.groupBy("student_id").agg(count(when(col("activity_type") == "Attempt_Quiz", 1).otherwise(0)).alias("num_quizzes_per_student"))
quizzes.show()

+----------+-----------------------+
|student_id|num_quizzes_per_student|
+----------+-----------------------+
|       S68|                     14|
|       S39|                     21|
|       S60|                     17|
|      S149|                     19|
|      S105|                     18|
|       S10|                     20|
|      S148|                     24|
|       S27|                     15|
|        S6|                     23|
|      S106|                     18|
|      S102|                     16|
|       S94|                     22|
|        S8|                     17|
|       S32|                     18|
|        S7|                     19|
|       S30|                     18|
|      S104|                     16|
|      S120|                     15|
|       S56|                     28|
|       S76|                     20|
+----------+-----------------------+
only showing top 20 rows



In [66]:
df = df.withColumn("year", year("activity_time")).withColumn("week", weekofyear("activity_time"))

In [67]:
completed = df.filter(col("activity_type") == "Complete_Course")
student_weekly_completions = completed.groupBy("student_id", "year", "week").agg(
    countDistinct("course_id").alias("courses_completed"))

student_weekly_completions.filter(col("courses_completed") >= 1).show()

+----------+----+----+-----------------+
|student_id|year|week|courses_completed|
+----------+----+----+-----------------+
|      S101|2025|  35|                1|
|        S8|2025|  34|                4|
|       S95|2025|  34|                1|
|       S11|2025|  35|                1|
|       S18|2025|  35|                2|
|      S117|2025|  35|                2|
|       S28|2025|  34|                3|
|       S37|2025|  35|                1|
|       S41|2025|  34|                5|
|       S78|2025|  35|                1|
|      S144|2025|  35|                1|
|      S126|2025|  34|                2|
|      S130|2025|  35|                1|
|      S140|2025|  34|                1|
|      S110|2025|  35|                1|
|      S129|2025|  35|                1|
|      S142|2025|  34|                5|
|       S50|2025|  35|                1|
|      S104|2025|  34|                1|
|       S29|2025|  34|                3|
+----------+----+----+-----------------+
only showing top