In [0]:
"""
Problem Statement:
Write a solution to find the students who meet the following criteria:

1. Have taken all mandatory courses and at least two elective courses offered in their major.
2. Achieved a grade of A in all mandatory courses and at least B in elective courses.
3. Maintained an average GPA of at least 2.5 across all their courses (including those outside their major).

Return the result table ordered by student_id in ascending order.


+----------+-------+----------------+
|student_id|   name|           major|
+----------+-------+----------------+
|         1|  Alice|Computer Science|
|         2|    Bob|Computer Science|
|         3|Charlie|     Mathematics|
|         4|  David|     Mathematics|
+----------+-------+----------------+

+---------+-----------------+-------+----------------+---------+
|course_id|             name|credits|           major|mandatory|
+---------+-----------------+-------+----------------+---------+
|      101|       Algorithms|      3|Computer Science|     true|
|      102|  Data Structures|      3|Computer Science|     true|
|      103|         Calculus|      4|     Mathematics|     true|
|      104|   Linear Algebra|      4|     Mathematics|     true|
|      105| Machine Learning|      3|Computer Science|    false|
|      106|      Probability|      3|     Mathematics|    false|
|      107|Operating Systems|      3|Computer Science|    false|
|      108|       Statistics|      3|     Mathematics|    false|
+---------+-----------------+-------+----------------+---------+

+----------+---------+-----------+-----+---+
|student_id|course_id|   semester|grade|gpa|
+----------+---------+-----------+-----+---+
|         1|      101|  Fall 2023|    A|4.0|
|         1|      102|Spring 2023|    A|4.0|
|         1|      105|Spring 2023|    A|4.0|
|         1|      107|  Fall 2023|    B|3.5|
|         2|      101|  Fall 2023|    A|4.0|
|         2|      102|Spring 2023|    B|3.0|
|         3|      103|  Fall 2023|    A|4.0|
|         3|      104|Spring 2023|    A|4.0|
|         3|      106|Spring 2023|    A|4.0|
|         3|      108|  Fall 2023|    B|3.5|
|         4|      103|  Fall 2023|    B|3.0|
|         4|      104|Spring 2023|    B|3.0|
+----------+---------+-----------+-----+---+


Output
+----------+
|student_id|
+----------+
|         1|
|         3|
+----------+
"""
# Sample data for the problem. Each DataFrame is created from literal rows,
# with only the column names supplied as the schema.

# One row per student, with the major the student is enrolled in.
students_df = spark.createDataFrame(
    data=[
        (1, "Alice", "Computer Science"),
        (2, "Bob", "Computer Science"),
        (3, "Charlie", "Mathematics"),
        (4, "David", "Mathematics"),
    ],
    schema=["student_id", "name", "major"],
)

# One row per course; `mandatory` marks whether the course is required
# for the major it belongs to.
courses_df = spark.createDataFrame(
    data=[
        (101, "Algorithms", 3, "Computer Science", True),
        (102, "Data Structures", 3, "Computer Science", True),
        (103, "Calculus", 4, "Mathematics", True),
        (104, "Linear Algebra", 4, "Mathematics", True),
        (105, "Machine Learning", 3, "Computer Science", False),
        (106, "Probability", 3, "Mathematics", False),
        (107, "Operating Systems", 3, "Computer Science", False),
        (108, "Statistics", 3, "Mathematics", False),
    ],
    schema=["course_id", "name", "credits", "major", "mandatory"],
)

# One row per (student, course) enrollment with the letter grade and GPA earned.
enrollments_df = spark.createDataFrame(
    data=[
        (1, 101, "Fall 2023", "A", 4.0),
        (1, 102, "Spring 2023", "A", 4.0),
        (1, 105, "Spring 2023", "A", 4.0),
        (1, 107, "Fall 2023", "B", 3.5),
        (2, 101, "Fall 2023", "A", 4.0),
        (2, 102, "Spring 2023", "B", 3.0),
        (3, 103, "Fall 2023", "A", 4.0),
        (3, 104, "Spring 2023", "A", 4.0),
        (3, 106, "Spring 2023", "A", 4.0),
        (3, 108, "Fall 2023", "B", 3.5),
        (4, 103, "Fall 2023", "B", 3.0),
        (4, 104, "Spring 2023", "B", 3.0),
    ],
    schema=["student_id", "course_id", "semester", "grade", "gpa"],
)

# Print the three input tables so they can be checked against the problem statement.
students_df.show()
courses_df.show()
enrollments_df.show()


+----------+-------+----------------+
|student_id|   name|           major|
+----------+-------+----------------+
|         1|  Alice|Computer Science|
|         2|    Bob|Computer Science|
|         3|Charlie|     Mathematics|
|         4|  David|     Mathematics|
+----------+-------+----------------+

+---------+-----------------+-------+----------------+---------+
|course_id|             name|credits|           major|mandatory|
+---------+-----------------+-------+----------------+---------+
|      101|       Algorithms|      3|Computer Science|     true|
|      102|  Data Structures|      3|Computer Science|     true|
|      103|         Calculus|      4|     Mathematics|     true|
|      104|   Linear Algebra|      4|     Mathematics|     true|
|      105| Machine Learning|      3|Computer Science|    false|
|      106|      Probability|      3|     Mathematics|    false|
|      107|Operating Systems|      3|Computer Science|    false|
|      108|       Statistics|      3|     Mat

In [0]:
# Register the DataFrames as temp views so the SQL below can refer to them
# by table name.
students_df.createOrReplaceTempView("students")
courses_df.createOrReplaceTempView("courses")
enrollments_df.createOrReplaceTempView("enrollments")

# Find the students who satisfy all three criteria of the problem statement.
#
# Query outline:
#   1. valid_students              - students whose average GPA over ALL of their
#                                    enrollments is at least 2.5.
#   2. status_students             - every course offered in the student's major,
#                                    left-joined to the student's enrollment in it;
#                                    `grade` is NULL when the course was not taken.
#   3. aggregated_reports_students - per-student counts of mandatory / elective
#                                    courses and of those with an acceptable grade.
#   4. final select                - applies the criteria and orders the result by
#                                    student_id, as the problem statement requires.
#
# Changes from the earlier version of this query:
#   * The ORDER BY used to sit inside the status_students CTE, which is aggregated
#     afterwards, so it did not determine the order of the final rows. It is now
#     on the outermost SELECT.
#   * `mandatory` is a boolean column, so it is tested directly instead of being
#     compared with the string literals 'true' / 'false'.
spark.sql("""
    with valid_students as (
        -- Criterion 3: average GPA across all courses, inside or outside the major.
        select student_id
        from enrollments
        group by student_id
        having avg(gpa) >= 2.5
    ),
    status_students as (
        -- One row per (student, course in the student's major). The left join keeps
        -- courses the student never enrolled in, with a NULL grade.
        select
            vs.student_id,
            c.mandatory,
            e.grade
        from valid_students vs
        inner join students s
            on vs.student_id = s.student_id
        inner join courses c
            on s.major = c.major
        left join enrollments e
            on vs.student_id = e.student_id
           and c.course_id = e.course_id
    ),
    aggregated_reports_students as (
        select
            student_id,
            -- Mandatory courses offered in the major (taken or not).
            sum(case when mandatory then 1 else 0 end) as total_mandatory_courses,
            -- Mandatory courses completed with grade A.
            sum(case when mandatory and grade = 'A' then 1 else 0 end) as total_grade_A_mandatory_courses,
            -- Elective courses the student actually took.
            sum(case when not mandatory and grade is not null then 1 else 0 end) as total_optional_courses,
            -- Elective courses taken with grade A or B.
            sum(case when not mandatory and grade in ('A', 'B') then 1 else 0 end) as total_optional_good_grades
        from status_students
        group by student_id
    )
    select
        student_id
    from aggregated_reports_students
    where total_mandatory_courses = total_grade_A_mandatory_courses  -- all mandatory taken, all with A
      and total_optional_courses >= 2                                -- at least two electives taken
      and total_optional_courses = total_optional_good_grades        -- every elective taken is A or B
    order by student_id
""").show()


+----------+
|student_id|
+----------+
|         1|
|         3|
+----------+

