In [0]:
# ------------------------------
# Notebook: 01_explore_tables
# Purpose: Explore all project tables to understand structure and content
# ------------------------------

# Tables to profile; each name is resolved through spark.table() below.
tables = [
    "student_info",
    "student_registration",
    "student_assessment",
    "assessments",
    "student_vle",
    "vle",
    "courses",
    "online_classroom_data",
]

# Spark's aggregate `sum` is imported under an alias so that it does not
# replace the Python builtin `sum` for the rest of the notebook session.
from pyspark.sql.functions import col, sum as spark_sum

# Profile every table: schema, a small sample, summary statistics, null counts.
for table in tables:
    print(f"\n=== Table: {table} ===")

    # Load the table as a Spark DataFrame (`spark` is supplied by the notebook).
    df = spark.table(table)

    # 1. Schema: column names, data types and nullability.
    print("Schema:")
    df.printSchema()

    # 2. First 5 rows, rendered with the notebook's `display` helper.
    print("First 5 rows:")
    display(df.limit(5))

    # 3. Summary statistics. describe() covers numeric AND string columns:
    #    count/min/max are reported for both, while mean/stddev come back
    #    empty for string columns that cannot be read as numbers.
    print("Summary statistics:")
    display(df.describe())

    # 4. Null count per column: isNull() cast to int is 1 for a null and 0
    #    otherwise, so summing it counts the nulls. NaN values in double
    #    columns are not nulls and are therefore not included in this count.
    print("Missing values per column:")
    null_counts = [
        spark_sum(col(c).isNull().cast("int")).alias(c) for c in df.columns
    ]
    df.select(null_counts).show()

    print("-----------------------------------------------------------")



=== Table: student_info ===
Schema:
root
 |-- code_module: string (nullable = true)
 |-- code_presentation: string (nullable = true)
 |-- id_student: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- region: string (nullable = true)
 |-- highest_education: string (nullable = true)
 |-- imd_band: string (nullable = true)
 |-- age_band: string (nullable = true)
 |-- num_of_prev_attempts: long (nullable = true)
 |-- studied_credits: long (nullable = true)
 |-- disability: string (nullable = true)
 |-- final_result: string (nullable = true)

First 5 rows:


code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass
AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass
AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn
AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass
AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass


Summary statistics:


summary,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
count,32593,32593,32593.0,32593,32593,32593,31482,32593,32593.0,32593.0,32593,32593
mean,,,706687.6691314086,,,,,,0.1632252324118675,79.75869051636855,,
stddev,,,549167.3138552044,,,,,,0.4797584741629903,41.07190026123308,,
min,AAA,2013B,3733.0,F,East Anglian Region,A Level or Equivalent,0-10%,0-35,0.0,30.0,N,Distinction
max,GGG,2014J,2716795.0,M,Yorkshire Region,Post Graduate Qualification,90-100%,55<=,6.0,655.0,Y,Withdrawn


Missing values per column:
+-----------+-----------------+----------+------+------+-----------------+--------+--------+--------------------+---------------+----------+------------+
|code_module|code_presentation|id_student|gender|region|highest_education|imd_band|age_band|num_of_prev_attempts|studied_credits|disability|final_result|
+-----------+-----------------+----------+------+------+-----------------+--------+--------+--------------------+---------------+----------+------------+
|          0|                0|         0|     0|     0|                0|    1111|       0|                   0|              0|         0|           0|
+-----------+-----------------+----------+------+------+-----------------+--------+--------+--------------------+---------------+----------+------------+

-----------------------------------------------------------

=== Table: student_registration ===
Schema:
root
 |-- code_module: string (nullable = true)
 |-- code_presentation: string (nullable = true)


code_module,code_presentation,id_student,date_registration,date_unregistration
AAA,2013J,11391,-159,
AAA,2013J,28400,-53,
AAA,2013J,30268,-92,12.0
AAA,2013J,31604,-52,
AAA,2013J,32885,-176,


Summary statistics:


summary,code_module,code_presentation,id_student,date_registration,date_unregistration
count,32593,32593,32593.0,32548.0,10072.0
mean,,,706687.6691314086,-69.4113002335013,49.757644956314536
stddev,,,549167.3138552044,49.26052211501909,82.46088995813557
min,AAA,2013B,3733.0,-322.0,-365.0
max,GGG,2014J,2716795.0,167.0,444.0


Missing values per column:
+-----------+-----------------+----------+-----------------+-------------------+
|code_module|code_presentation|id_student|date_registration|date_unregistration|
+-----------+-----------------+----------+-----------------+-------------------+
|          0|                0|         0|               45|              22521|
+-----------+-----------------+----------+-----------------+-------------------+

-----------------------------------------------------------

=== Table: student_assessment ===
Schema:
root
 |-- id_assessment: long (nullable = true)
 |-- id_student: long (nullable = true)
 |-- date_submitted: long (nullable = true)
 |-- is_banked: long (nullable = true)
 |-- score: long (nullable = true)

First 5 rows:


id_assessment,id_student,date_submitted,is_banked,score
1752,11391,18,0,78
1752,28400,22,0,70
1752,31604,17,0,72
1752,32885,26,0,69
1752,38053,19,0,79


Summary statistics:


summary,id_assessment,id_student,date_submitted,is_banked,score
count,173912.0,173912.0,173912.0,173912.0,173739.0
mean,26553.80355582133,705150.7172248034,116.03294194765168,0.0109768158608951,75.7995729226023
stddev,8829.78425351934,552395.1909984987,71.48414778039503,0.1041939911849793,18.798107229735606
min,1752.0,6516.0,-11.0,0.0,0.0
max,37443.0,2698588.0,608.0,1.0,100.0


Missing values per column:
+-------------+----------+--------------+---------+-----+
|id_assessment|id_student|date_submitted|is_banked|score|
+-------------+----------+--------------+---------+-----+
|            0|         0|             0|        0|  173|
+-------------+----------+--------------+---------+-----+

-----------------------------------------------------------

=== Table: assessments ===
Schema:
root
 |-- code_module: string (nullable = true)
 |-- code_presentation: string (nullable = true)
 |-- id_assessment: long (nullable = true)
 |-- assessment_type: string (nullable = true)
 |-- date: long (nullable = true)
 |-- weight: double (nullable = true)

First 5 rows:


code_module,code_presentation,id_assessment,assessment_type,date,weight
AAA,2013J,1752,TMA,19,10.0
AAA,2013J,1753,TMA,54,20.0
AAA,2013J,1754,TMA,117,20.0
AAA,2013J,1755,TMA,166,20.0
AAA,2013J,1756,TMA,215,30.0


Summary statistics:


summary,code_module,code_presentation,id_assessment,assessment_type,date,weight
count,206,206,206.0,206,195.0,206.0
mean,,,26473.97572815534,,145.00512820512822,20.87378640776699
stddev,,,10098.625521273689,,76.00111891714978,30.38422395904127
min,AAA,2013B,1752.0,CMA,12.0,0.0
max,GGG,2014J,40088.0,TMA,261.0,100.0


Missing values per column:
+-----------+-----------------+-------------+---------------+----+------+
|code_module|code_presentation|id_assessment|assessment_type|date|weight|
+-----------+-----------------+-------------+---------------+----+------+
|          0|                0|            0|              0|  11|     0|
+-----------+-----------------+-------------+---------------+----+------+

-----------------------------------------------------------

=== Table: student_vle ===
Schema:
root
 |-- code_module: string (nullable = true)
 |-- code_presentation: string (nullable = true)
 |-- id_student: long (nullable = true)
 |-- id_site: long (nullable = true)
 |-- date: long (nullable = true)
 |-- sum_click: long (nullable = true)

First 5 rows:


code_module,code_presentation,id_student,id_site,date,sum_click
AAA,2013J,28400,546652,-10,4
AAA,2013J,28400,546652,-10,1
AAA,2013J,28400,546652,-10,1
AAA,2013J,28400,546614,-10,11
AAA,2013J,28400,546714,-10,1


Summary statistics:


summary,code_module,code_presentation,id_student,id_site,date,sum_click
count,10655280,10655280,10655280.0,10655280.0,10655280.0,10655280.0
mean,,,733333.5668717293,738323.416399569,95.17399955702712,3.716945870967258
stddev,,,582705.9825107501,131219.62216193098,76.0713008405041,8.84904665510146
min,AAA,2013B,6516.0,526721.0,-25.0,1.0
max,GGG,2014J,2698588.0,1049562.0,269.0,6977.0


Missing values per column:
+-----------+-----------------+----------+-------+----+---------+
|code_module|code_presentation|id_student|id_site|date|sum_click|
+-----------+-----------------+----------+-------+----+---------+
|          0|                0|         0|      0|   0|        0|
+-----------+-----------------+----------+-------+----+---------+

-----------------------------------------------------------

=== Table: vle ===
Schema:
root
 |-- id_site: long (nullable = true)
 |-- code_module: string (nullable = true)
 |-- code_presentation: string (nullable = true)
 |-- activity_type: string (nullable = true)
 |-- week_from: long (nullable = true)
 |-- week_to: long (nullable = true)

First 5 rows:


id_site,code_module,code_presentation,activity_type,week_from,week_to
546943,AAA,2013J,resource,,
546712,AAA,2013J,oucontent,,
546998,AAA,2013J,resource,,
546888,AAA,2013J,url,,
547035,AAA,2013J,resource,,


Summary statistics:


summary,id_site,code_module,code_presentation,activity_type,week_from,week_to
count,6364.0,6364,6364,6364,1121.0,1121.0
mean,726099.094123193,,,,15.2042818911686,15.214986619090098
stddev,128315.13747347445,,,,8.792865383311517,8.779805777261137
min,526721.0,AAA,2013B,dataplus,0.0,0.0
max,1077905.0,GGG,2014J,url,29.0,29.0


Missing values per column:
+-------+-----------+-----------------+-------------+---------+-------+
|id_site|code_module|code_presentation|activity_type|week_from|week_to|
+-------+-----------+-----------------+-------------+---------+-------+
|      0|          0|                0|            0|     5243|   5243|
+-------+-----------+-----------------+-------------+---------+-------+

-----------------------------------------------------------

=== Table: courses ===
Schema:
root
 |-- code_module: string (nullable = true)
 |-- code_presentation: string (nullable = true)
 |-- module_presentation_length: long (nullable = true)

First 5 rows:


code_module,code_presentation,module_presentation_length
AAA,2013J,268
AAA,2014J,269
BBB,2013J,268
BBB,2014J,262
BBB,2013B,240


Summary statistics:


summary,code_module,code_presentation,module_presentation_length
count,22,22,22.0
mean,,,255.54545454545453
stddev,,,13.654677456835678
min,AAA,2013B,234.0
max,GGG,2014J,269.0


Missing values per column:
+-----------+-----------------+--------------------------+
|code_module|code_presentation|module_presentation_length|
+-----------+-----------------+--------------------------+
|          0|                0|                         0|
+-----------+-----------------+--------------------------+

-----------------------------------------------------------

=== Table: online_classroom_data ===
Schema:
root
 |-- total_posts: double (nullable = true)
 |-- helpful_post: double (nullable = true)
 |-- nice_code_post: double (nullable = true)
 |-- collaborative_post: double (nullable = true)
 |-- confused_post: double (nullable = true)
 |-- creative_post: double (nullable = true)
 |-- bad_post: double (nullable = true)
 |-- amazing_post: double (nullable = true)
 |-- timeonline: double (nullable = true)
 |-- sk1_classroom: string (nullable = true)
 |-- sk2_classroom: string (nullable = true)
 |-- sk5_classroom: string (nullable = true)
 |-- sk3_classroom: string (null

total_posts,helpful_post,nice_code_post,collaborative_post,confused_post,creative_post,bad_post,amazing_post,timeonline,sk1_classroom,sk2_classroom,sk5_classroom,sk3_classroom,sk4_classroom,Approved
1.0,0.0,0.0,0.0,0.0,6.0,0.0,1.0,1600.0,21,24,35,36,17,0
1.0,0.0,0.0,1.0,0.0,2.0,0.0,3.0,592.0,3,3,0,1,2,0
2.0,4.0,3.0,9.0,0.0,16.0,1.0,8.0,1110.0,8,5,5,7,5,1
5.0,1.0,3.0,9.0,2.0,11.0,0.0,8.0,8651.0,6,5,4,6,4,1
14.0,6.0,15.0,28.0,0.0,50.0,0.0,45.0,34172.0,87,9,65,10,88,1


Summary statistics:


summary,total_posts,helpful_post,nice_code_post,collaborative_post,confused_post,creative_post,bad_post,amazing_post,timeonline,sk1_classroom,sk2_classroom,sk5_classroom,sk3_classroom,sk4_classroom,Approved
count,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0
mean,5.507042253521127,4.028169014084507,7.47887323943662,11.366197183098592,0.5492957746478874,18.309859154929576,0.056338028169014,18.95774647887324,10279.12676056338,5.307692307692308,5.108108108108108,4.721311475409836,5.777777777777778,5.909090909090909,0.704225352112676
stddev,6.480461269818507,6.843081659573966,12.746830591988155,13.970621661819717,1.3500130411472346,21.93079475421052,0.2872193757720974,22.086477194181388,10176.24337215932,3.749763825221433,3.64983853359588,3.60615745423012,3.66536773529214,4.488621979263487,0.4596385597769218
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,46.0,53.0,90.0,89.0,7.0,150.0,2.0,155.0,43612.0,93.0,93.0,9.0,97.0,98.0,1.0


Missing values per column:
+-----------+------------+--------------+------------------+-------------+-------------+--------+------------+----------+-------------+-------------+-------------+-------------+-------------+--------+
|total_posts|helpful_post|nice_code_post|collaborative_post|confused_post|creative_post|bad_post|amazing_post|timeonline|sk1_classroom|sk2_classroom|sk5_classroom|sk3_classroom|sk4_classroom|Approved|
+-----------+------------+--------------+------------------+-------------+-------------+--------+------------+----------+-------------+-------------+-------------+-------------+-------------+--------+
|          0|           0|             0|                 0|            0|            0|       0|           0|         0|            0|            0|            0|            0|            0|       0|
+-----------+------------+--------------+------------------+-------------+-------------+--------+------------+----------+-------------+-------------+-------------+------