In [0]:
from pyspark.sql.functions import col, sum, avg, date_add, lit
from pyspark.sql.types import IntegerType

# ------------------------------
# Notebook: 02_clean_transform
# Purpose: Clean and transform all tables, create detailed master datasets
# ------------------------------

# 0️ Load tables
# Names of the source tables cleaned by this notebook; the same names are
# used as the keys of `dfs`.
tables = [
    "student_info",
    "student_registration",
    "student_assessment",
    "assessments",
    "student_vle",
    "vle",
    "courses",
    "online_classroom_data",
]

# Read every table through the active Spark session and remove exact
# duplicate rows before any further cleaning.
dfs = {name: spark.table(name).dropDuplicates() for name in tables}

# 1️ Handle missing values
# Drop rows in which any of the listed key columns is null. Each entry is
# (table name, key columns that must be present).
dfs.update(
    (name, dfs[name].dropna(subset=key_columns))
    for name, key_columns in (
        ("student_info", ["id_student", "code_module"]),
        ("student_registration", ["id_student", "code_module"]),
        ("student_assessment", ["id_student", "id_assessment"]),
        ("assessments", ["id_assessment"]),
        ("vle", ["id_site"]),
        ("courses", ["code_module"]),
    )
)

# A missing click count is replaced with 0 instead of dropping the row.
dfs["student_vle"] = dfs["student_vle"].na.fill({"sum_click": 0})

# Replace nulls with 0 across the classroom data; no column subset is given,
# so the fill is applied table-wide.
dfs["online_classroom_data"] = dfs["online_classroom_data"].na.fill(0)

# 2️ Fix data types
# Cast columns in place (same column name, new type). Each entry is
# (table name, column, target Spark SQL type name).
dfs.update(
    (name, dfs[name].withColumn(column, col(column).cast(spark_type)))
    for name, column, spark_type in (
        ("student_vle", "sum_click", "int"),
        ("student_assessment", "score", "float"),
    )
)

# 3️ Fix date columns
# Anchor date that the integer day offsets in student_vle.`date` are added to.
# NOTE(review): the same fixed anchor is applied to every row regardless of
# presentation — confirm that this is intended.
reference_date = "2020-01-01"

# vle_date = reference_date + `date` days; the raw offset column is dropped
# once the calendar date has been derived.
dfs["student_vle"] = (
    dfs["student_vle"]
    .withColumn(
        "vle_date",
        date_add(lit(reference_date), col("date").cast("int")),
    )
    .drop("date")
)

# For assessments only the column name changes: `date` -> `assessment_date`.
dfs["assessments"] = dfs["assessments"].withColumnRenamed(
    "date", "assessment_date"
)

# 4️ Rename overlapping columns
# Give the module/presentation columns of each table a table-specific prefix
# so they do not share a name once the tables are joined together.
dfs["student_vle"] = (
    dfs["student_vle"]
    .withColumnRenamed("code_module", "vle_code_module")
    .withColumnRenamed("code_presentation", "vle_code_presentation")
)

dfs["assessments"] = (
    dfs["assessments"]
    .withColumnRenamed("code_module", "assessment_code_module")
    .withColumnRenamed("code_presentation", "assessment_code_presentation")
)

dfs["courses"] = (
    dfs["courses"]
    .withColumnRenamed("code_module", "course_code_module")
    .withColumnRenamed("code_presentation", "course_code_presentation")
)

# 5️ Create master table
# Build the student-level master table.
#
# The previous chain joined student_vle and student_assessment on id_student
# alone, so each student_info row picked up that student's activity from
# every module/presentation. The joins below also match on module and
# presentation, mirroring the keys the courses join already uses.
# NOTE(review): VLE rows and assessment rows are still paired with each other
# per student/module/presentation, so the row count grows multiplicatively —
# confirm that this level of detail is intended.
enrolment_keys = ["id_student", "code_module", "code_presentation"]

# student_vle carries its module keys under vle_* names; expose them under the
# student_info names as well so a name-based join can use them. The vle_*
# columns themselves stay in the output.
vle_activity = (
    dfs["student_vle"]
    .withColumn("code_module", col("vle_code_module"))
    .withColumn("code_presentation", col("vle_code_presentation"))
)

# Join student_assessment to assessments first so that every result row
# carries the module/presentation of its assessment. Results whose
# id_assessment has no row in assessments end up with null keys and therefore
# no longer attach to any student.
assessment_results = (
    dfs["student_assessment"]
    .join(dfs["assessments"], on="id_assessment", how="left")
    .withColumn("code_module", col("assessment_code_module"))
    .withColumn("code_presentation", col("assessment_code_presentation"))
)

df_master = (
    dfs["student_info"]
    # Name-based joins keep a single copy of each key column (the left one).
    .join(vle_activity, on=enrolment_keys, how="left")
    .join(assessment_results, on=enrolment_keys, how="left")
    .join(
        dfs["courses"],
        (dfs["student_info"]["code_module"] == dfs["courses"]["course_code_module"])
        & (
            dfs["student_info"]["code_presentation"]
            == dfs["courses"]["course_code_presentation"]
        ),
        how="left",
    )
    .dropDuplicates()
)

# The earlier name-based join on id_assessment placed that column first;
# restore that position so the saved table keeps its column order.
df_master = df_master.select(
    "id_assessment",
    *[name for name in df_master.columns if name != "id_assessment"],
)

# 6️ Save cleaned tables and master table
def _overwrite_delta_table(frame, table_name):
    """Write `frame` to the Delta table `table_name` in overwrite mode.

    The `mergeSchema` writer option is set to "true" on every write.
    """
    (
        frame.write.format("delta")
        .mode("overwrite")
        .option("mergeSchema", "true")
        .saveAsTable(table_name)
    )


# NOTE(review): only three of the cleaned tables are persisted alongside the
# master table — confirm the remaining ones are intentionally not saved.
_overwrite_delta_table(dfs["student_vle"], "student_vle_clean")
_overwrite_delta_table(dfs["student_info"], "student_info_clean")
_overwrite_delta_table(dfs["online_classroom_data"], "online_classroom_data_clean")
_overwrite_delta_table(df_master, "student_master")

# 7. Display master table for verification (first 10 rows only)
display(df_master.limit(10))


id_assessment,id_student,code_module,code_presentation,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,vle_code_module,vle_code_presentation,id_site,sum_click,vle_date,date_submitted,is_banked,score,assessment_code_module,assessment_code_presentation,assessment_type,assessment_date,weight,course_code_module,course_code_presentation,module_presentation_length
34910,683902,FFF,2014J,M,East Midlands Region,A Level or Equivalent,0-10%,0-35,0,60,N,Pass,FFF,2014J,882549,3,2019-12-28,212,0,78.0,FFF,2014J,CMA,241,0.0,FFF,2014J,269
34910,683902,FFF,2014J,M,East Midlands Region,A Level or Equivalent,0-10%,0-35,0,60,N,Pass,FFF,2014J,883037,1,2019-12-28,212,0,78.0,FFF,2014J,CMA,241,0.0,FFF,2014J,269
34910,683902,FFF,2014J,M,East Midlands Region,A Level or Equivalent,0-10%,0-35,0,60,N,Pass,FFF,2014J,883281,2,2019-12-28,212,0,78.0,FFF,2014J,CMA,241,0.0,FFF,2014J,269
34910,683902,FFF,2014J,M,East Midlands Region,A Level or Equivalent,0-10%,0-35,0,60,N,Pass,FFF,2014J,882549,5,2019-12-28,212,0,78.0,FFF,2014J,CMA,241,0.0,FFF,2014J,269
34910,683902,FFF,2014J,M,East Midlands Region,A Level or Equivalent,0-10%,0-35,0,60,N,Pass,FFF,2014J,882537,10,2019-12-28,212,0,78.0,FFF,2014J,CMA,241,0.0,FFF,2014J,269
34910,683902,FFF,2014J,M,East Midlands Region,A Level or Equivalent,0-10%,0-35,0,60,N,Pass,FFF,2014J,883142,1,2019-12-28,212,0,78.0,FFF,2014J,CMA,241,0.0,FFF,2014J,269
34910,683902,FFF,2014J,M,East Midlands Region,A Level or Equivalent,0-10%,0-35,0,60,N,Pass,FFF,2014J,883092,1,2019-12-28,212,0,78.0,FFF,2014J,CMA,241,0.0,FFF,2014J,269
34910,683902,FFF,2014J,M,East Midlands Region,A Level or Equivalent,0-10%,0-35,0,60,N,Pass,FFF,2014J,883060,2,2019-12-28,212,0,78.0,FFF,2014J,CMA,241,0.0,FFF,2014J,269
34910,683902,FFF,2014J,M,East Midlands Region,A Level or Equivalent,0-10%,0-35,0,60,N,Pass,FFF,2014J,882549,1,2019-12-28,212,0,78.0,FFF,2014J,CMA,241,0.0,FFF,2014J,269
34910,683902,FFF,2014J,M,East Midlands Region,A Level or Equivalent,0-10%,0-35,0,60,N,Pass,FFF,2014J,882654,5,2019-12-31,212,0,78.0,FFF,2014J,CMA,241,0.0,FFF,2014J,269
