In [0]:
"""
/*
EPAM

Write a SQL query to count, for each experience level, the number of candidates who got a perfect score
in every task category they were requested to solve (NULL means the candidate was not requested to solve
the tasks in that category).

100 -> Perfect score
NULL -> as good as perfect score since candidate was not requested to solve the tasks

Input for dataset 1:
+------+------------+------+------+------------+
| id   | experience | sql_ | algo | bug_fixing |
+------+------------+------+------+------------+
|    1 |          3 |  100 | NULL |         50 |
|    2 |          5 | NULL |  100 |        100 |
|    3 |          1 |  100 |  100 |        100 |
|    4 |          5 |  100 |   50 |       NULL |
|    5 |          5 |  100 |  100 |        100 |
+------+------------+------+------+------------+

Output for dataset 1:
+------------+------------------+-----------+
| experience | total_candidates | max_score |
+------------+------------------+-----------+
|          3 |                1 |         0 |
|          5 |                3 |         2 |
|          1 |                1 |         1 |
+------------+------------------+-----------+


Input for dataset 2:
+------+------------+------+------+------------+
| id   | experience | sql_ | algo | bug_fixing |
+------+------------+------+------+------------+
|    1 |          2 | NULL | NULL |       NULL |
|    2 |         20 | NULL | NULL |         20 |
|    3 |          7 |  100 | NULL |        100 |
|    4 |          3 |  100 |   50 |       NULL |
|    5 |          2 |   40 |  100 |        100 |
+------+------------+------+------+------------+
Output for dataset 2:
+------------+------------------+-----------+
| experience | total_candidates | max_score |
+------------+------------------+-----------+
|          2 |                2 |         1 |
|         20 |                1 |         0 |
|          7 |                1 |         1 |
|          3 |                1 |         0 |
+------------+------------------+-----------+
*/
"""

from pyspark.sql.functions import *

# Dataset 1: one row per candidate. None marks a task category the candidate
# was not requested to solve.
assessment_columns = ["id", "experience", "sql_", "algo", "bug_fixing"]
assessment_rows = [
    (1, 3, 100, None, 50),
    (2, 5, None, 100, 100),
    (3, 1, 100, 100, 100),
    (4, 5, 100, 50, None),
    (5, 5, 100, 100, 100),
]
assessments_df = spark.createDataFrame(assessment_rows, assessment_columns)

assessments_df.show()

+---+----------+----+----+----------+
| id|experience|sql_|algo|bug_fixing|
+---+----------+----+----+----------+
|  1|         3| 100|NULL|        50|
|  2|         5|NULL| 100|       100|
|  3|         1| 100| 100|       100|
|  4|         5| 100|  50|      NULL|
|  5|         5| 100| 100|       100|
+---+----------+----+----+----------+



In [0]:
# Count, per experience level, how many candidates there are and how many of
# them have a perfect result in every category.
#
# Each category gets a 0/1 flag: 1 when the score is 100 or NULL (task not
# requested), 0 otherwise. The flags are written to dedicated *_score columns
# so the raw "bug_fixing" scores are not overwritten by their own flag.
# Note: `sum` below is the Spark aggregate from the star import, not the
# Python builtin.
(
    assessments_df
    .withColumn(
        "sql_score",
        when(col("sql_").isNull() | (col("sql_") == lit(100)), lit(1)).otherwise(lit(0)),
    )
    .withColumn(
        "algo_score",
        when(col("algo").isNull() | (col("algo") == lit(100)), lit(1)).otherwise(lit(0)),
    )
    .withColumn(
        "bug_fixing_score",
        when(col("bug_fixing").isNull() | (col("bug_fixing") == lit(100)), lit(1)).otherwise(lit(0)),
    )
    # A candidate is counted only when all three category flags are set.
    .withColumn(
        "total_score",
        when(
            col("sql_score") + col("algo_score") + col("bug_fixing_score") == lit(3),
            lit(1),
        ).otherwise(lit(0)),
    )
    .groupBy("experience")
    .agg(
        count("*").alias("total_candidates"),
        sum(col("total_score")).alias("max_score_per_experience"),
    )
    .show()
)

+----------+----------------+------------------------+
|experience|total_candidates|max_score_per_experience|
+----------+----------------+------------------------+
|         3|               1|                       0|
|         5|               3|                       2|
|         1|               1|                       1|
+----------+----------------+------------------------+



In [0]:
# For input 2

from pyspark.sql.functions import *

# Dataset 2: one row per candidate. None marks a task category the candidate
# was not requested to solve.
assessment_columns = ["id", "experience", "sql_", "algo", "bug_fixing"]
assessment_rows = [
    (1, 2, None, None, None),
    (2, 20, None, None, 20),
    (3, 7, 100, None, 100),
    (4, 3, 100, 50, None),
    (5, 2, 40, 100, 100),
]
assessments_df = spark.createDataFrame(assessment_rows, assessment_columns)

assessments_df.show()

+---+----------+----+----+----------+
| id|experience|sql_|algo|bug_fixing|
+---+----------+----+----+----------+
|  1|         2|NULL|NULL|      NULL|
|  2|        20|NULL|NULL|        20|
|  3|         7| 100|NULL|       100|
|  4|         3| 100|  50|      NULL|
|  5|         2|  40| 100|       100|
+---+----------+----+----+----------+



In [0]:
# Count, per experience level, how many candidates there are and how many of
# them have a perfect result in every category.
#
# Each category gets a 0/1 flag: 1 when the score is 100 or NULL (task not
# requested), 0 otherwise. The flags are written to dedicated *_score columns
# so the raw "bug_fixing" scores are not overwritten by their own flag.
# Note: `sum` below is the Spark aggregate from the star import, not the
# Python builtin.
(
    assessments_df
    .withColumn(
        "sql_score",
        when(col("sql_").isNull() | (col("sql_") == lit(100)), lit(1)).otherwise(lit(0)),
    )
    .withColumn(
        "algo_score",
        when(col("algo").isNull() | (col("algo") == lit(100)), lit(1)).otherwise(lit(0)),
    )
    .withColumn(
        "bug_fixing_score",
        when(col("bug_fixing").isNull() | (col("bug_fixing") == lit(100)), lit(1)).otherwise(lit(0)),
    )
    # A candidate is counted only when all three category flags are set.
    .withColumn(
        "total_score",
        when(
            col("sql_score") + col("algo_score") + col("bug_fixing_score") == lit(3),
            lit(1),
        ).otherwise(lit(0)),
    )
    .groupBy("experience")
    .agg(
        count("*").alias("total_candidates"),
        sum(col("total_score")).alias("max_score_per_experience"),
    )
    .show()
)

+----------+----------------+------------------------+
|experience|total_candidates|max_score_per_experience|
+----------+----------------+------------------------+
|         2|               2|                       1|
|        20|               1|                       0|
|         7|               1|                       1|
|         3|               1|                       0|
+----------+----------------+------------------------+

