In [22]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("OULAD Analysis") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()


In [23]:
data_courses = spark.read.csv("file:///home/studen/mickael/python_data_engineer/hadoop/spark/OULAD\ dataset/courses.csv", header=True, inferSchema=True)
data_assessments = spark.read.csv("file:///home/studen/mickael/python_data_engineer/hadoop/spark/OULAD\ dataset/assessments.csv", header=True, inferSchema=True)
data_studentAssessment = spark.read.csv("file:///home/studen/mickael/python_data_engineer/hadoop/spark/OULAD\ dataset/studentAssessment.csv", header=True, inferSchema=True)
data_studentInfo = spark.read.csv("file:///home/studen/mickael/python_data_engineer/hadoop/spark/OULAD\ dataset/studentInfo.csv", header=True, inferSchema=True)
data_studentRegistration = spark.read.csv("file:///home/studen/mickael/python_data_engineer/hadoop/spark/OULAD\ dataset/studentRegistration.csv", header=True, inferSchema=True)

In [24]:
data_courses.show(3)
data_assessments.show(15)

+-----------+-----------------+--------------------------+
|code_module|code_presentation|module_presentation_length|
+-----------+-----------------+--------------------------+
|        AAA|            2013J|                       268|
|        AAA|            2014J|                       269|
|        BBB|            2013J|                       268|
+-----------+-----------------+--------------------------+
only showing top 3 rows

+-----------+-----------------+-------------+---------------+----+------+
|code_module|code_presentation|id_assessment|assessment_type|date|weight|
+-----------+-----------------+-------------+---------------+----+------+
|        AAA|            2013J|         1752|            TMA|  19|  10.0|
|        AAA|            2013J|         1753|            TMA|  54|  20.0|
|        AAA|            2013J|         1754|            TMA| 117|  20.0|
|        AAA|            2013J|         1755|            TMA| 166|  20.0|
|        AAA|            2013J|         1756

In [25]:
data_studentAssessment.show(5)
data_studentInfo.show(5)

+-------------+----------+--------------+---------+-----+
|id_assessment|id_student|date_submitted|is_banked|score|
+-------------+----------+--------------+---------+-----+
|         1752|     11391|            18|        0|   78|
|         1752|     28400|            22|        0|   70|
|         1752|     31604|            17|        0|   72|
|         1752|     32885|            26|        0|   69|
|         1752|     38053|            19|        0|   79|
+-------------+----------+--------------+---------+-----+
only showing top 5 rows

+-----------+-----------------+----------+------+--------------------+--------------------+--------+--------+--------------------+---------------+----------+------------+
|code_module|code_presentation|id_student|gender|              region|   highest_education|imd_band|age_band|num_of_prev_attempts|studied_credits|disability|final_result|
+-----------+-----------------+----------+------+--------------------+--------------------+--------+--------+--

In [26]:
data_studentRegistration.show(5)


+-----------+-----------------+----------+-----------------+-------------------+
|code_module|code_presentation|id_student|date_registration|date_unregistration|
+-----------+-----------------+----------+-----------------+-------------------+
|        AAA|            2013J|     11391|             -159|               NULL|
|        AAA|            2013J|     28400|              -53|               NULL|
|        AAA|            2013J|     30268|              -92|                 12|
|        AAA|            2013J|     31604|              -52|               NULL|
|        AAA|            2013J|     32885|             -176|               NULL|
+-----------+-----------------+----------+-----------------+-------------------+
only showing top 5 rows



## **Basic** SQL Questions

### 2. Find the number of students enrolled in each course.


In [27]:
# - Find the number of students enrolled in each course.
data_studentInfo.createOrReplaceTempView("studentInfo")

spark.sql("select code_module, code_presentation, count(id_student) as count from studentInfo group by code_module, code_presentation").show(5)

+-----------+-----------------+-----+
|code_module|code_presentation|count|
+-----------+-----------------+-----+
|        FFF|            2013J| 2283|
|        BBB|            2013J| 2237|
|        BBB|            2014J| 2292|
|        EEE|            2014J| 1188|
|        BBB|            2014B| 1613|
+-----------+-----------------+-----+
only showing top 5 rows



In [28]:
data_studentInfo.select("code_module", "code_presentation").groupBy("code_module", "code_presentation").count().show(5)


+-----------+-----------------+-----+
|code_module|code_presentation|count|
+-----------+-----------------+-----+
|        FFF|            2013J| 2283|
|        BBB|            2013J| 2237|
|        BBB|            2014J| 2292|
|        EEE|            2014J| 1188|
|        BBB|            2014B| 1613|
+-----------+-----------------+-----+
only showing top 5 rows



### 4.Find the distinct modules available.


In [29]:
data_courses.select("code_module").distinct().show()

+-----------+
|code_module|
+-----------+
|        CCC|
|        BBB|
|        DDD|
|        FFF|
|        EEE|
|        AAA|
|        GGG|
+-----------+



In [30]:
data_courses.createOrReplaceTempView("courses")
spark.sql("select code_module from courses group by code_module ").show()

+-----------+
|code_module|
+-----------+
|        CCC|
|        BBB|
|        DDD|
|        FFF|
|        EEE|
|        AAA|
|        GGG|
+-----------+



In [31]:
data_studentInfo.show(5)

+-----------+-----------------+----------+------+--------------------+--------------------+--------+--------+--------------------+---------------+----------+------------+
|code_module|code_presentation|id_student|gender|              region|   highest_education|imd_band|age_band|num_of_prev_attempts|studied_credits|disability|final_result|
+-----------+-----------------+----------+------+--------------------+--------------------+--------+--------+--------------------+---------------+----------+------------+
|        AAA|            2013J|     11391|     M| East Anglian Region|    HE Qualification| 90-100%|    55<=|                   0|            240|         N|        Pass|
|        AAA|            2013J|     28400|     F|            Scotland|    HE Qualification|  20-30%|   35-55|                   0|             60|         N|        Pass|
|        AAA|            2013J|     30268|     F|North Western Region|A Level or Equiva...|  30-40%|   35-55|                   0|             60

### 6. Get all students who have withdrawn from their course


In [32]:
spark.sql("select code_module, code_presentation, id_student from studentInfo where final_result = 'Withdrawn'").show(5)

+-----------+-----------------+----------+
|code_module|code_presentation|id_student|
+-----------+-----------------+----------+
|        AAA|            2013J|     30268|
|        AAA|            2013J|     65002|
|        AAA|            2013J|     94961|
|        AAA|            2013J|    106247|
|        AAA|            2013J|    129955|
+-----------+-----------------+----------+
only showing top 5 rows



In [33]:
data_studentInfo.select("code_module", "code_presentation", "id_student").filter(data_studentInfo.final_result == 'Withdrawn').show(5)

+-----------+-----------------+----------+
|code_module|code_presentation|id_student|
+-----------+-----------------+----------+
|        AAA|            2013J|     30268|
|        AAA|            2013J|     65002|
|        AAA|            2013J|     94961|
|        AAA|            2013J|    106247|
|        AAA|            2013J|    129955|
+-----------+-----------------+----------+
only showing top 5 rows



### 8. List students who registered late (after course start date).


In [34]:
data_studentRegistration.select("code_module", "code_presentation", "id_student").filter(data_studentRegistration.date_registration > 0).show(5)

+-----------+-----------------+----------+
|code_module|code_presentation|id_student|
+-----------+-----------------+----------+
|        AAA|            2013J|    106247|
|        AAA|            2013J|    236284|
|        AAA|            2013J|    341872|
|        AAA|            2013J|   1472925|
|        AAA|            2014J|    199897|
+-----------+-----------------+----------+
only showing top 5 rows



In [35]:
data_studentRegistration.createOrReplaceTempView("studentRegistration")
spark.sql("select code_module, code_presentation, id_student from studentRegistration where date_registration > 0 ").show(5)

+-----------+-----------------+----------+
|code_module|code_presentation|id_student|
+-----------+-----------------+----------+
|        AAA|            2013J|    106247|
|        AAA|            2013J|    236284|
|        AAA|            2013J|    341872|
|        AAA|            2013J|   1472925|
|        AAA|            2014J|    199897|
+-----------+-----------------+----------+
only showing top 5 rows



### 10. Get the number of assessment submissions per student.


In [36]:
data_studentAssessment.select("id_student").groupby("id_student").count().show(5)

+----------+-----+
|id_student|count|
+----------+-----+
|    180753|    5|
|    324084|    5|
|   2057803|    5|
|    486656|   11|
|    502604|   11|
+----------+-----+
only showing top 5 rows



In [37]:
data_studentAssessment.createOrReplaceTempView("studentAssessment")
spark.sql("select id_student, count(*) as count from studentAssessment group by id_student").show(5)

+----------+-----+
|id_student|count|
+----------+-----+
|    180753|    5|
|    324084|    5|
|   2057803|    5|
|    486656|   11|
|    502604|   11|
+----------+-----+
only showing top 5 rows



### 12. Find all assessments of module ‘AAA’.


In [38]:
data_assessments.show()

+-----------+-----------------+-------------+---------------+----+------+
|code_module|code_presentation|id_assessment|assessment_type|date|weight|
+-----------+-----------------+-------------+---------------+----+------+
|        AAA|            2013J|         1752|            TMA|  19|  10.0|
|        AAA|            2013J|         1753|            TMA|  54|  20.0|
|        AAA|            2013J|         1754|            TMA| 117|  20.0|
|        AAA|            2013J|         1755|            TMA| 166|  20.0|
|        AAA|            2013J|         1756|            TMA| 215|  30.0|
|        AAA|            2013J|         1757|           Exam|NULL| 100.0|
|        AAA|            2014J|         1758|            TMA|  19|  10.0|
|        AAA|            2014J|         1759|            TMA|  54|  20.0|
|        AAA|            2014J|         1760|            TMA| 117|  20.0|
|        AAA|            2014J|         1761|            TMA| 166|  20.0|
|        AAA|            2014J|       

In [39]:
data_assessments.createOrReplaceTempView("assessments")
spark.sql("select * from assessments where code_module = 'AAA'").show()

+-----------+-----------------+-------------+---------------+----+------+
|code_module|code_presentation|id_assessment|assessment_type|date|weight|
+-----------+-----------------+-------------+---------------+----+------+
|        AAA|            2013J|         1752|            TMA|  19|  10.0|
|        AAA|            2013J|         1753|            TMA|  54|  20.0|
|        AAA|            2013J|         1754|            TMA| 117|  20.0|
|        AAA|            2013J|         1755|            TMA| 166|  20.0|
|        AAA|            2013J|         1756|            TMA| 215|  30.0|
|        AAA|            2013J|         1757|           Exam|NULL| 100.0|
|        AAA|            2014J|         1758|            TMA|  19|  10.0|
|        AAA|            2014J|         1759|            TMA|  54|  20.0|
|        AAA|            2014J|         1760|            TMA| 117|  20.0|
|        AAA|            2014J|         1761|            TMA| 166|  20.0|
|        AAA|            2014J|       

## **Intermediate** SQL Questions



### 1. Get the average score per module.


In [40]:
data_studentAssessment.createOrReplaceTempView("studentAssessment")
data_assessments.createOrReplaceTempView("assessments")


spark.sql("select a.code_module, AVG(sa.score) as avg from assessments a join studentAssessment sa on a.id_assessment = sa.id_assessment group by a.code_module").show()

+-----------+-----------------+
|code_module|              avg|
+-----------+-----------------+
|        CCC| 73.2613978551429|
|        BBB| 76.7063682263431|
|        DDD|70.09079993509654|
|        FFF|77.70759006007047|
|        EEE|81.18006593963987|
|        AAA|69.03051493960585|
|        GGG|79.70049293460401|
+-----------+-----------------+



In [41]:
data_studentAssessment.join(data_assessments, data_studentAssessment.id_assessment == data_assessments.id_assessment)\
    .groupby("code_module").agg({"score": "avg"}).show()

+-----------+-----------------+
|code_module|       avg(score)|
+-----------+-----------------+
|        CCC| 73.2613978551429|
|        BBB| 76.7063682263431|
|        DDD|70.09079993509654|
|        FFF|77.70759006007047|
|        EEE|81.18006593963987|
|        AAA|69.03051493960585|
|        GGG|79.70049293460401|
+-----------+-----------------+



### 2.Find the students who scored the highest in any assessment.


In [42]:
max_scores = data_studentAssessment.groupby("id_assessment") \
    .agg(F.max("score").alias("max_score"))

max_scores.show(5)

max_scores.join(data_studentAssessment, 
    (data_studentAssessment.id_assessment == max_scores.id_assessment)
    & (data_studentAssessment.score == max_scores.max_score)
    ).show() 

+-------------+---------+
|id_assessment|max_score|
+-------------+---------+
|        15003|      100|
|        15004|      100|
|        34878|      100|
|        14997|       96|
|        34870|      100|
+-------------+---------+
only showing top 5 rows

+-------------+---------+-------------+----------+--------------+---------+-----+
|id_assessment|max_score|id_assessment|id_student|date_submitted|is_banked|score|
+-------------+---------+-------------+----------+--------------+---------+-----+
|         1752|       94|         1752|   2555340|            18|        0|   94|
|         1753|       95|         1753|    296332|            52|        0|   95|
|         1754|       95|         1754|   2458355|           117|        0|   95|
|         1755|       94|         1755|    376439|           166|        0|   94|
|         1755|       94|         1755|   2694424|           162|        0|   94|
|         1756|       98|         1756|   2536991|           215|        0|   98|
|  

### 3. Calculate the overall average score of students who passed.


In [58]:
pass_student = data_studentInfo.select("code_module","code_presentation","id_student","final_result").filter(data_studentInfo.final_result.isin(["Pass", "Distinction"]))
pass_student.show(5)

+-----------+-----------------+----------+------------+
|code_module|code_presentation|id_student|final_result|
+-----------+-----------------+----------+------------+
|        AAA|            2013J|     11391|        Pass|
|        AAA|            2013J|     28400|        Pass|
|        AAA|            2013J|     31604|        Pass|
|        AAA|            2013J|     32885|        Pass|
|        AAA|            2013J|     38053|        Pass|
+-----------+-----------------+----------+------------+
only showing top 5 rows



In [51]:
course_ass = data_studentAssessment.join(data_assessments, data_studentAssessment.id_assessment == data_assessments.id_assessment)
course_ass.select("code_module","code_presentation",data_assessments.id_assessment,"id_student","score").show(5)

+-----------+-----------------+-------------+----------+-----+
|code_module|code_presentation|id_assessment|id_student|score|
+-----------+-----------------+-------------+----------+-----+
|        AAA|            2013J|         1752|     11391|   78|
|        AAA|            2013J|         1752|     28400|   70|
|        AAA|            2013J|         1752|     31604|   72|
|        AAA|            2013J|         1752|     32885|   69|
|        AAA|            2013J|         1752|     38053|   79|
+-----------+-----------------+-------------+----------+-----+
only showing top 5 rows



In [74]:
res = pass_student.join(course_ass, on=["code_module", "code_presentation", "id_student"])
res.show(5)

+-----------+-----------------+----------+------------+-------------+--------------+---------+-----+-------------+---------------+----+------+
|code_module|code_presentation|id_student|final_result|id_assessment|date_submitted|is_banked|score|id_assessment|assessment_type|date|weight|
+-----------+-----------------+----------+------------+-------------+--------------+---------+-----+-------------+---------------+----+------+
|        AAA|            2013J|     11391|        Pass|         1752|            18|        0|   78|         1752|            TMA|  19|  10.0|
|        AAA|            2013J|     28400|        Pass|         1752|            22|        0|   70|         1752|            TMA|  19|  10.0|
|        AAA|            2013J|     31604|        Pass|         1752|            17|        0|   72|         1752|            TMA|  19|  10.0|
|        AAA|            2013J|     32885|        Pass|         1752|            26|        0|   69|         1752|            TMA|  19|  10.0|

In [75]:
res.select(F.avg("score")).show()

+----------------+
|      avg(score)|
+----------------+
|79.1411761149361|
+----------------+



In [76]:
res.groupby("code_module", "code_presentation").agg({"score": "avg"}).show(5)

+-----------+-----------------+-----------------+
|code_module|code_presentation|       avg(score)|
+-----------+-----------------+-----------------+
|        FFF|            2013J| 79.5139337952271|
|        BBB|            2013J|80.76989093509745|
|        BBB|            2014J|67.92098377812664|
|        EEE|            2014J|84.20379323168464|
|        BBB|            2014B| 81.1468791500664|
+-----------+-----------------+-----------------+
only showing top 5 rows



### 4. Find courses where more than 50% of students passed.


In [94]:
pass_student = data_studentInfo.select("code_module","code_presentation","id_student","final_result")\
    .filter(data_studentInfo.final_result.isin(["Pass", "Distinction"]))\
    
pass_courses = pass_student.groupby("code_module","code_presentation").agg(F.count("*").alias("num_of_pass_student"))
pass_courses.show(5)



+-----------+-----------------+-------------------+
|code_module|code_presentation|num_of_pass_student|
+-----------+-----------------+-------------------+
|        FFF|            2013J|               1095|
|        BBB|            2013J|               1072|
|        BBB|            2014J|               1152|
|        EEE|            2014J|                684|
|        BBB|            2014B|                727|
+-----------+-----------------+-------------------+
only showing top 5 rows



In [95]:
all_cours = data_studentInfo.select("code_module","code_presentation","id_student","final_result")\
    .groupby("code_module","code_presentation")\
    .agg(F.count("*").alias("total_of_student"))
all_cours.show(5)

+-----------+-----------------+----------------+
|code_module|code_presentation|total_of_student|
+-----------+-----------------+----------------+
|        FFF|            2013J|            2283|
|        BBB|            2013J|            2237|
|        BBB|            2014J|            2292|
|        EEE|            2014J|            1188|
|        BBB|            2014B|            1613|
+-----------+-----------------+----------------+
only showing top 5 rows



In [119]:
from pyspark.sql.functions import col

# pass_courses.join(all_cours, on=["code_module","code_presentation"]).show()

result = pass_courses.join(all_cours, on=["code_module", "code_presentation"])\
    .withColumn("avg_pass", col("num_of_pass_student") / col("total_of_student") * 100)

result.filter(col("avg_pass")>50).show()   

+-----------+-----------------+-------------------+----------------+------------------+
|code_module|code_presentation|num_of_pass_student|total_of_student|          avg_pass|
+-----------+-----------------+-------------------+----------------+------------------+
|        BBB|            2014J|               1152|            2292| 50.26178010471204|
|        EEE|            2014J|                684|            1188| 57.57575757575758|
|        GGG|            2014B|                478|             833| 57.38295318127251|
|        GGG|            2014J|                444|             749|59.279038718291055|
|        AAA|            2014J|                253|             365| 69.31506849315069|
|        GGG|            2013J|                592|             952| 62.18487394957983|
|        AAA|            2013J|                278|             383| 72.58485639686684|
|        EEE|            2013J|                609|            1052| 57.88973384030418|
|        EEE|            2014B| 

### 5. Find students with a final result of ‘Distinction’ but never submitted an assessment.


In [115]:
distinct_stud = data_studentInfo.select("code_module","code_presentation","id_student","final_result")\
    .filter(col("final_result") == "Distinction")
    
distinct_stud.show(5)

+-----------+-----------------+----------+------------+
|code_module|code_presentation|id_student|final_result|
+-----------+-----------------+----------+------------+
|        AAA|            2013J|    134143| Distinction|
|        AAA|            2013J|    187100| Distinction|
|        AAA|            2013J|    227499| Distinction|
|        AAA|            2013J|    279572| Distinction|
|        AAA|            2013J|    291334| Distinction|
+-----------+-----------------+----------+------------+
only showing top 5 rows



In [117]:
distinct_stud.join(data_studentAssessment, on = "id_student", how="left").filter(col("date_submitted").isNull()).show()

+----------+-----------+-----------------+------------+-------------+--------------+---------+-----+
|id_student|code_module|code_presentation|final_result|id_assessment|date_submitted|is_banked|score|
+----------+-----------+-----------------+------------+-------------+--------------+---------+-----+
+----------+-----------+-----------------+------------+-------------+--------------+---------+-----+



### 6. Find assessments where more than 80% of students scored below 50.


In [129]:
data_studentAssessmentUpdate = data_studentAssessment.withColumn("Pass 50", col("score") >= 50)
bad_stud = data_studentAssessmentUpdate\
            .filter(col("Pass 50") == "false")
bad_stud.show(5)

+-------------+----------+--------------+---------+-----+-------+
|id_assessment|id_student|date_submitted|is_banked|score|Pass 50|
+-------------+----------+--------------+---------+-----+-------+
|         1752|    141377|            54|        0|   45|  false|
|         1752|    295741|            18|        0|   38|  false|
|         1752|    307866|            19|        0|   47|  false|
|         1752|    324282|            18|        0|   38|  false|
|         1752|    334333|            18|        0|   36|  false|
+-------------+----------+--------------+---------+-----+-------+
only showing top 5 rows



In [136]:
b = bad_stud.groupby("id_assessment").agg(F.count("*").alias("not_passed"))
b.show(5)

+-------------+----------+
|id_assessment|not_passed|
+-------------+----------+
|        15003|        14|
|        15004|        45|
|        34878|         5|
|        14997|       104|
|        34870|        37|
+-------------+----------+
only showing top 5 rows



In [137]:
all = data_studentAssessmentUpdate.groupby("id_assessment").agg(F.count("*").alias("all"))
all.show(5)

+-------------+----+
|id_assessment| all|
+-------------+----+
|        15003|1490|
|        15004|1350|
|        34878|1470|
|        14997|1524|
|        34870| 912|
+-------------+----+
only showing top 5 rows



In [141]:
j = b.join(all, on="id_assessment")
j = j.withColumn("avg", col("not_passed") / col("all") * 100)\
 .filter(col("avg") > 80)

j.show()

+-------------+----------+---+---+
|id_assessment|not_passed|all|avg|
+-------------+----------+---+---+
+-------------+----------+---+---+



### 7. Find students who submitted the same assessment multiple times.


In [147]:
df1 = data_studentAssessment.alias("df1")
df2 = data_studentAssessment.alias("df2")

s1 = df1.join(df2, (df1.id_assessment == df2.id_assessment) 
    & (df1.id_student == df2.id_student)
    & (df1.date_submitted != df2.date_submitted) 
)
s1.show()

+-------------+----------+--------------+---------+-----+-------------+----------+--------------+---------+-----+
|id_assessment|id_student|date_submitted|is_banked|score|id_assessment|id_student|date_submitted|is_banked|score|
+-------------+----------+--------------+---------+-----+-------------+----------+--------------+---------+-----+
+-------------+----------+--------------+---------+-----+-------------+----------+--------------+---------+-----+



In [150]:
s2 = data_studentAssessment.groupBy("id_assessment", "id_student").agg(F.count("date_submitted").alias("count"))\
    .filter(col("count") > 1)
s2.show()

+-------------+----------+-----+
|id_assessment|id_student|count|
+-------------+----------+-----+
+-------------+----------+-----+



                                                                                

In [148]:
s1.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- BroadcastHashJoin [id_assessment#919, id_student#920], [id_assessment#4758, id_student#4759], Inner, BuildRight, NOT (date_submitted#921 = date_submitted#4760), false
   :- Filter ((isnotnull(id_assessment#919) AND isnotnull(id_student#920)) AND isnotnull(date_submitted#921))
   :  +- FileScan csv [id_assessment#919,id_student#920,date_submitted#921,is_banked#922,score#923] Batched: false, DataFilters: [isnotnull(id_assessment#919), isnotnull(id_student#920), isnotnull(date_submitted#921)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/studen/mickael/python_data_engineer/hadoop/spark/OULAD data..., PartitionFilters: [], PushedFilters: [IsNotNull(id_assessment), IsNotNull(id_student), IsNotNull(date_submitted)], ReadSchema: struct<id_assessment:int,id_student:int,date_submitted:int,is_banked:int,score:int>
   +- BroadcastExchange HashedRelationBroadcastMode(List((shiftleft(cast(input[0, int, false] as bigint), 32)

In [151]:
s2.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Filter (count#4841L > 1)
   +- HashAggregate(keys=[id_assessment#919, id_student#920], functions=[count(date_submitted#921)])
      +- Exchange hashpartitioning(id_assessment#919, id_student#920, 200), ENSURE_REQUIREMENTS, [plan_id=8307]
         +- HashAggregate(keys=[id_assessment#919, id_student#920], functions=[partial_count(date_submitted#921)])
            +- FileScan csv [id_assessment#919,id_student#920,date_submitted#921] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/studen/mickael/python_data_engineer/hadoop/spark/OULAD data..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id_assessment:int,id_student:int,date_submitted:int>




### 8. Find courses with the highest dropout rate.


In [153]:
total_students = data_studentInfo.groupby("code_module", "code_presentation")\
    .agg(F.count("id_student").alias("total_students"))

dropouts = data_studentInfo.filter(data_studentInfo.final_result.isin(["Fail", "Withdrawn"]))\
    .groupby("code_module", "code_presentation")\
    .agg(F.count("id_student").alias("dropouts"))

dropout_rate = total_students.join(dropouts, on=["code_module", "code_presentation"], how="left")

dropout_rate = dropout_rate.withColumn("dropout_rate", 
                                       (F.col("dropouts") / F.col("total_students")) * 100)

dropout_rate.orderBy(F.col("dropout_rate").desc()).show(5)

+-----------+-----------------+--------------+--------+-----------------+
|code_module|code_presentation|total_students|dropouts|     dropout_rate|
+-----------+-----------------+--------------+--------+-----------------+
|        CCC|            2014B|          1936|    1273|65.75413223140497|
|        DDD|            2014B|          1228|     749|60.99348534201955|
|        DDD|            2013B|          1303|     793|60.85955487336915|
|        CCC|            2014J|          2498|    1483|59.36749399519615|
|        DDD|            2013J|          1938|    1109|57.22394220846233|
+-----------+-----------------+--------------+--------+-----------------+
only showing top 5 rows



### 9. Calculate the number of interactions (clicks) per student.


### 10. Find the median score per module.


### 11. Calculate submission delay patterns.

### 12. Compare performance across age bands.
