In [0]:
# ------------------------------
# Notebook: 04_data_warehouse_design
# Purpose: Create star schema data warehouse with facts and dimensions
# ------------------------------

from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from functools import reduce

print("=== STARTING DATA WAREHOUSE DESIGN ===")

# 0️ Load cleaned tables
print("Loading cleaned tables...")
student_info_clean = spark.table("student_info_clean")
student_vle_clean = spark.table("student_vle_clean")
courses_clean = spark.table("courses")
assessments_clean = spark.table("assessments")
vle_clean = spark.table("vle")
online_classroom_clean = spark.table("online_classroom_data_clean")
student_assessment_clean = spark.table("student_assessment")

print(" Tables loaded successfully")

# Vérifier la structure de online_classroom_data_clean
print("=== CHECKING ONLINE CLASSROOM DATA STRUCTURE ===")
print("online_classroom_data_clean columns:", online_classroom_clean.columns)
print("online_classroom_data_clean schema:")
online_classroom_clean.printSchema()
display(online_classroom_clean.limit(3))

# 1️ Create Dimension Tables
print("\n=== CREATING DIMENSION TABLES ===")

# dim_student
print("Creating dim_student...")
dim_student = student_info_clean.select(
    col("id_student"),
    col("gender"),
    col("region"),
    col("highest_education"),
    col("imd_band"),
    col("age_band"),
    col("disability"),
    col("num_of_prev_attempts"),
    col("studied_credits"),
    when(col("final_result") == "Pass", "Graduated")
      .when(col("final_result") == "Fail", "Failed")
      .when(col("final_result") == "Withdrawn", "Dropped Out")
      .when(col("final_result") == "Distinction", "Graduated")
      .otherwise("Unknown").alias("academic_status"),
    lit("Active").alias("student_status")
).distinct()

print(f"dim_student: approximately {dim_student.count():,} rows")
display(dim_student.limit(5))

# dim_course
print("Creating dim_course...")
dim_course = courses_clean.select(
    col("code_module"),
    col("code_presentation"),
    col("module_presentation_length").alias("length"),
    when(col("code_presentation").contains("B"), "February")
      .when(col("code_presentation").contains("J"), "October")
      .otherwise("Unknown").alias("start_month"),
    when(col("code_module").isin(["CCC", "EEE", "GGG"]), "Special")
      .otherwise("Regular").alias("module_category"),
    when(col("module_presentation_length") < 200, "Short")
      .when(col("module_presentation_length") < 300, "Medium")
      .otherwise("Long").alias("duration_category")
).distinct()

print(f"dim_course: approximately {dim_course.count():,} rows")
display(dim_course.limit(5))

# dim_time
print("Creating dim_time...")
min_date = student_vle_clean.agg(min("date").alias("min_date")).collect()[0]["min_date"]

dim_time = student_vle_clean.select(
    datediff(col("date"), lit(min_date)).alias("relative_day")
).distinct().filter(col("relative_day").isNotNull()).select(
    col("relative_day").alias("time_id"),
    col("relative_day").alias("date"),
    when(col("relative_day") < 0, "Before Start")
      .when(col("relative_day") == 0, "Start Day")
      .when(col("relative_day") <= 30, "First Month")
      .when(col("relative_day") <= 60, "Second Month")
      .when(col("relative_day") <= 90, "Third Month")
      .otherwise("Final Period").alias("academic_period"),
    when(col("relative_day") < 0, "Pre-Course")
      .when(col("relative_day") <= 100, "Early Course")
      .when(col("relative_day") <= 200, "Mid Course")
      .otherwise("Late Course").alias("course_phase"),
    when(col("relative_day") < 0, "Preparation")
      .when(col("relative_day") <= 30, "Beginning")
      .when(col("relative_day") <= 100, "Core Period")
      .otherwise("Final Stage").alias("learning_stage"),
    lit("Academic Day").alias("day_type")
).distinct().orderBy("time_id")

print(f"dim_time: approximately {dim_time.count():,} rows")
display(dim_time.limit(10))

# dim_activity
print("Creating dim_activity...")
dim_activity = vle_clean.select(
    col("id_site"),
    col("code_module"),
    col("code_presentation"),
    col("activity_type"),
    col("week_from"),
    col("week_to"),
    when(col("activity_type").isin(["forum", "discussion", "glossary"]), "Social")
      .when(col("activity_type").isin(["resource", "file", "url", "page"]), "Content")
      .when(col("activity_type").isin(["quiz", "assignment", "subpage", "question"]), "Assessment")
      .when(col("activity_type").isin(["homepage", "folder"]), "Navigation")
      .otherwise("Other").alias("activity_category"),
    when((col("week_to").cast("int") - col("week_from").cast("int")) <= 1, "Short-term")
      .when((col("week_to").cast("int") - col("week_from").cast("int")) <= 4, "Medium-term")
      .otherwise("Long-term").alias("availability_duration")
).distinct()

print(f"dim_activity: approximately {dim_activity.count():,} rows")
display(dim_activity.limit(5))

# dim_assessment
print("Creating dim_assessment...")
dim_assessment = assessments_clean.select(
    col("id_assessment"),
    col("code_module"),
    col("code_presentation"),
    col("assessment_type"),
    col("date").alias("assessment_date"),
    col("weight"),
    when(col("assessment_type") == "Exam", "Final")
      .when(col("assessment_type") == "TMA", "Tutor Marked")
      .when(col("assessment_type") == "CMA", "Computer Marked")
      .otherwise("Other").alias("assessment_category"),
    when(col("weight") == 100, "High Stakes")
      .when((col("weight") >= 50) & (col("weight") < 100), "Medium Stakes")
      .otherwise("Low Stakes").alias("importance_level")
).distinct()

print(f"dim_assessment: approximately {dim_assessment.count():,} rows")
display(dim_assessment.limit(5))

# 2️ Create Fact Tables
print("\n=== CREATING FACT TABLES ===")

# fact_student_performance
print("Creating fact_student_performance...")
fact_performance = student_info_clean.join(
    dim_course,
    on=["code_module", "code_presentation"],
    how="left"
).select(
    col("id_student"),
    col("code_module"),
    col("code_presentation"),
    col("final_result"),
    when(col("final_result").isin("Pass", "Distinction"), 1).otherwise(0).alias("is_passed"),
    when(col("final_result") == "Fail", 1).otherwise(0).alias("is_failed"),
    when(col("final_result") == "Withdrawn", 1).otherwise(0).alias("is_withdrawn"),
    when(col("final_result") == "Distinction", 1).otherwise(0).alias("is_distinction"),
    coalesce(col("num_of_prev_attempts").cast("int"), lit(0)).alias("num_of_prev_attempts"),
    coalesce(col("studied_credits").cast("int"), lit(0)).alias("studied_credits"),
    current_timestamp().alias("load_timestamp")
)

print(f"fact_performance: approximately {fact_performance.count():,} rows")
display(fact_performance.limit(5))

# fact_student_engagement
print("Creating fact_student_engagement...")
min_date = student_vle_clean.agg(min("date").alias("min_date")).collect()[0]["min_date"]

eng_joined = student_vle_clean.select(
    "id_student", "code_module", "code_presentation", "id_site", "sum_click", "date",
    datediff(col("date"), lit(min_date)).alias("relative_day")
).join(
    dim_activity.select("id_site", "activity_category"),
    on="id_site",
    how="left"
)

fact_engagement_agg = eng_joined.groupBy(
    "id_student", "code_module", "code_presentation", "relative_day"
).agg(
    sum(col("sum_click").cast("double")).alias("daily_total_clicks"),
    countDistinct("id_site").alias("daily_resources_accessed"),
    sum(when(col("activity_category") == "Social", col("sum_click").cast("double")).otherwise(0.0)).alias("social_clicks"),
    sum(when(col("activity_category") == "Content", col("sum_click").cast("double")).otherwise(0.0)).alias("content_clicks"),
    sum(when(col("activity_category") == "Assessment", col("sum_click").cast("double")).otherwise(0.0)).alias("assessment_clicks")
)

fact_engagement = fact_engagement_agg.withColumn(
    "daily_engagement_level",
    when(col("daily_total_clicks") > 100, lit("High"))
      .when(col("daily_total_clicks") > 50, lit("Medium"))
      .otherwise(lit("Low"))
).withColumn("load_timestamp", current_timestamp())

print(f"fact_engagement: approximately {fact_engagement.count():,} rows")
display(fact_engagement.limit(5))

# fact_assessment_scores
print("Creating fact_assessment_scores...")
sa_clean = student_assessment_clean.withColumn("score_num", regexp_replace(col("score"), ",", ".").cast("double"))

fact_assessment_scores = sa_clean.join(
    dim_assessment.select("id_assessment", "assessment_type", "assessment_category", "importance_level"),
    on="id_assessment",
    how="left"
).select(
    col("id_student"),
    col("id_assessment"),
    col("score_num").alias("score"),
    col("date_submitted"),
    col("assessment_type"),
    col("assessment_category"),
    col("importance_level"),
    when(col("score_num") >= 40, 1).otherwise(0).alias("is_passed_assessment"),
    when(col("score_num") >= 70, lit("High"))
      .when(col("score_num") >= 40, lit("Medium"))
      .otherwise(lit("Low")).alias("performance_level"),
    current_timestamp().alias("load_timestamp")
)

print(f"fact_assessment_scores: approximately {fact_assessment_scores.count():,} rows")
display(fact_assessment_scores.limit(5))

# ---------------------------
# FIXED: fact_21st_century_skills
# ---------------------------
print("Creating fact_21st_century_skills...")

skill_columns = [c for c in online_classroom_clean.columns if 'sk' in c.lower() or 'skill' in c.lower()]

if skill_columns:
    online_classroom_numeric = online_classroom_clean.select(
        [regexp_replace(col(c), ",", ".").cast("double").alias(c) for c in skill_columns]
    )
    fact_skills = online_classroom_numeric.withColumn("row_id", monotonically_increasing_id())
    
    sum_expr = reduce(lambda a, b: a + b, [coalesce(col(c), lit(0.0)) for c in skill_columns])
    
    fact_skills = fact_skills.withColumn(
        "overall_skills_score", sum_expr / len(skill_columns)
    ).withColumn(
        "skills_level",
        when(col("overall_skills_score") >= 8, "Excellent")
        .when(col("overall_skills_score") >= 6, "Good")
        .otherwise("Needs Improvement")
    ).withColumn("load_timestamp", current_timestamp())
else:
    fact_skills = spark.createDataFrame([], StructType([
        StructField("row_id", LongType(), True),
        StructField("overall_skills_score", DoubleType(), True),
        StructField("skills_level", StringType(), True),
        StructField("load_timestamp", TimestampType(), True)
    ]))

print(f"fact_skills: approximately {fact_skills.count():,} rows")
display(fact_skills.limit(5))


=== STARTING DATA WAREHOUSE DESIGN ===
Loading cleaned tables...
 Tables loaded successfully
=== CHECKING ONLINE CLASSROOM DATA STRUCTURE ===
online_classroom_data_clean columns: ['total_posts', 'helpful_post', 'nice_code_post', 'collaborative_post', 'confused_post', 'creative_post', 'bad_post', 'amazing_post', 'timeonline', 'sk1_classroom', 'sk2_classroom', 'sk5_classroom', 'sk3_classroom', 'sk4_classroom', 'Approved']
online_classroom_data_clean schema:
root
 |-- total_posts: double (nullable = true)
 |-- helpful_post: double (nullable = true)
 |-- nice_code_post: double (nullable = true)
 |-- collaborative_post: double (nullable = true)
 |-- confused_post: double (nullable = true)
 |-- creative_post: double (nullable = true)
 |-- bad_post: double (nullable = true)
 |-- amazing_post: double (nullable = true)
 |-- timeonline: double (nullable = true)
 |-- sk1_classroom: string (nullable = true)
 |-- sk2_classroom: string (nullable = true)
 |-- sk5_classroom: string (nullable = true)
 

total_posts,helpful_post,nice_code_post,collaborative_post,confused_post,creative_post,bad_post,amazing_post,timeonline,sk1_classroom,sk2_classroom,sk5_classroom,sk3_classroom,sk4_classroom,Approved
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0
8.0,4.0,19.0,8.0,0.0,26.0,0.0,25.0,3377.0,7,8,8,7,10,1
2.0,5.0,1.0,7.0,2.0,4.0,0.0,2.0,1655.0,8,8,9,8,88,1



=== CREATING DIMENSION TABLES ===
Creating dim_student...
dim_student: approximately 30,145 rows


id_student,gender,region,highest_education,imd_band,age_band,disability,num_of_prev_attempts,studied_credits,academic_status,student_status
155984,F,East Anglian Region,Lower Than A Level,70-80%,0-35,N,0,60,Dropped Out,Active
551938,F,North Region,Lower Than A Level,10-20,0-35,N,0,60,Graduated,Active
498755,F,London Region,A Level or Equivalent,0-10%,0-35,N,0,120,Graduated,Active
2271644,M,South Region,A Level or Equivalent,40-50%,35-55,N,0,180,Graduated,Active
627388,F,South East Region,A Level or Equivalent,50-60%,0-35,N,0,60,Graduated,Active


Creating dim_course...
dim_course: approximately 22 rows


code_module,code_presentation,length,start_month,module_category,duration_category
AAA,2013J,268,October,Regular,Medium
AAA,2014J,269,October,Regular,Medium
BBB,2013J,268,October,Regular,Medium
BBB,2014J,262,October,Regular,Medium
BBB,2013B,240,February,Regular,Medium


Creating dim_time...
dim_time: approximately 0 rows


time_id,date,academic_period,course_phase,learning_stage,day_type


Creating dim_activity...
dim_activity: approximately 6,364 rows


id_site,code_module,code_presentation,activity_type,week_from,week_to,activity_category,availability_duration
546943,AAA,2013J,resource,,,Content,Long-term
546712,AAA,2013J,oucontent,,,Other,Long-term
546998,AAA,2013J,resource,,,Content,Long-term
546888,AAA,2013J,url,,,Content,Long-term
547035,AAA,2013J,resource,,,Content,Long-term


Creating dim_assessment...
dim_assessment: approximately 206 rows


id_assessment,code_module,code_presentation,assessment_type,assessment_date,weight,assessment_category,importance_level
1752,AAA,2013J,TMA,19,10.0,Tutor Marked,Low Stakes
1753,AAA,2013J,TMA,54,20.0,Tutor Marked,Low Stakes
1754,AAA,2013J,TMA,117,20.0,Tutor Marked,Low Stakes
1755,AAA,2013J,TMA,166,20.0,Tutor Marked,Low Stakes
1756,AAA,2013J,TMA,215,30.0,Tutor Marked,Low Stakes



=== CREATING FACT TABLES ===
Creating fact_student_performance...
fact_performance: approximately 31,284 rows


id_student,code_module,code_presentation,final_result,is_passed,is_failed,is_withdrawn,is_distinction,num_of_prev_attempts,studied_credits,load_timestamp
155984,AAA,2013J,Withdrawn,0,0,1,0,0,60,2025-11-11T00:07:57.448Z
551938,BBB,2013B,Pass,1,0,0,0,0,60,2025-11-11T00:07:57.448Z
498755,BBB,2013J,Pass,1,0,0,0,0,120,2025-11-11T00:07:57.448Z
2271644,BBB,2013J,Pass,1,0,0,0,0,180,2025-11-11T00:07:57.448Z
627388,BBB,2014B,Distinction,1,0,0,1,0,60,2025-11-11T00:07:57.448Z


Creating fact_student_engagement...
fact_engagement: approximately 26,074 rows


id_student,code_module,code_presentation,relative_day,daily_total_clicks,daily_resources_accessed,social_clicks,content_clicks,assessment_clicks,daily_engagement_level,load_timestamp
555132,,,,597.0,104,9.0,67.0,125.0,High,2025-11-11T00:07:59.711Z
569737,,,,4004.0,188,4.0,221.0,515.0,High,2025-11-11T00:07:59.711Z
68376,,,,1049.0,159,0.0,118.0,234.0,High,2025-11-11T00:07:59.711Z
2031252,,,,2503.0,140,0.0,72.0,856.0,High,2025-11-11T00:07:59.711Z
600273,,,,161.0,14,0.0,15.0,8.0,High,2025-11-11T00:07:59.711Z


Creating fact_assessment_scores...
fact_assessment_scores: approximately 173,912 rows


id_student,id_assessment,score,date_submitted,assessment_type,assessment_category,importance_level,is_passed_assessment,performance_level,load_timestamp
11391,1752,78.0,18,TMA,Tutor Marked,Low Stakes,1,High,2025-11-11T00:08:03.145Z
28400,1752,70.0,22,TMA,Tutor Marked,Low Stakes,1,High,2025-11-11T00:08:03.145Z
31604,1752,72.0,17,TMA,Tutor Marked,Low Stakes,1,High,2025-11-11T00:08:03.145Z
32885,1752,69.0,26,TMA,Tutor Marked,Low Stakes,1,Medium,2025-11-11T00:08:03.145Z
38053,1752,79.0,19,TMA,Tutor Marked,Low Stakes,1,High,2025-11-11T00:08:03.145Z


Creating fact_21st_century_skills...
fact_skills: approximately 63 rows


sk1_classroom,sk2_classroom,sk5_classroom,sk3_classroom,sk4_classroom,row_id,overall_skills_score,skills_level,load_timestamp
0.0,0.0,0.0,0.0,0.0,0,0.0,Needs Improvement,2025-11-11T00:08:04.411Z
7.0,8.0,8.0,7.0,10.0,1,8.0,Excellent,2025-11-11T00:08:04.411Z
8.0,8.0,9.0,8.0,8.8,2,8.36,Excellent,2025-11-11T00:08:04.411Z
4.3,4.9,4.0,6.7,9.8,3,5.94,Needs Improvement,2025-11-11T00:08:04.411Z
9.0,9.0,9.0,9.3,8.6,4,8.98,Excellent,2025-11-11T00:08:04.411Z
