In [13]:
// Given a table of candidates and their skills, you're tasked with finding the candidates best suited for an open Data Science job. 
// You want to find candidates who are proficient in Python, Tableau, and PostgreSQL.
// Write a query to list the candidates who possess all of the required skills for the job. Sort the output by candidate ID in ascending order.
// Assumption: There are no duplicates in the candidates table.
// Example Output:
// candidate_id
// 123

// Explanation
// Candidate 123 is displayed because they have Python, Tableau, and PostgreSQL skills. 
// 345 isn't included in the output because they're missing one of the required skills: PostgreSQL.



import org.apache.spark.sql.types.{IntegerType, StringType, StructType, StructField}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.Row


val schema = StructType(Array(
                StructField("candidate_id", IntegerType),
                StructField("skills", StringType)
            ))

val data = Seq(
  Row(123,"Python"),
  Row(123,"Tableau"),
  Row(123,"PostgreSQL"),
  Row(234,"R"),
  Row(234,"PowerBI"),
  Row(234,"SQL Server"),
  Row(345,"Python"),
  Row(345,"Tableau")
)

val rdd = spark.sparkContext.parallelize(data)
val df = spark.createDataFrame(rdd, schema)

df.show(false)

println("Using Dataframes -------- ")
val df1 = df.where($"skills".isin("Python","Tableau","PostgreSQL")
        ).groupBy($"candidate_id").agg(count($"candidate_id").as("total")
                                      ).select("candidate_id").where($"total"===3
                                                                    ).orderBy($"candidate_id".asc_nulls_last)
df1.explain()
df1.show(false)



+------------+----------+
|candidate_id|skills    |
+------------+----------+
|123         |Python    |
|123         |Tableau   |
|123         |PostgreSQL|
|234         |R         |
|234         |PowerBI   |
|234         |SQL Server|
|345         |Python    |
|345         |Tableau   |
+------------+----------+

Using Dataframes -------- 
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [candidate_id#185 ASC NULLS LAST], true, 0
   +- Exchange rangepartitioning(candidate_id#185 ASC NULLS LAST, 200), ENSURE_REQUIREMENTS, [plan_id=447]
      +- Project [candidate_id#185]
         +- Filter (total#201L = 3)
            +- HashAggregate(keys=[candidate_id#185], functions=[count(candidate_id#185)])
               +- Exchange hashpartitioning(candidate_id#185, 200), ENSURE_REQUIREMENTS, [plan_id=442]
                  +- HashAggregate(keys=[candidate_id#185], functions=[partial_count(candidate_id#185)])
                     +- Project [candidate_id#185]
                        

import org.apache.spark.sql.types.{IntegerType, StringType, StructType, StructField}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.Row
schema: org.apache.spark.sql.types.StructType = StructType(StructField(candidate_id,IntegerType,true),StructField(skills,StringType,true))
data: Seq[org.apache.spark.sql.Row] = List([123,Python], [123,Tableau], [123,PostgreSQL], [234,R], [234,PowerBI], [234,SQL Server], [345,Python], [345,Tableau])
rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = ParallelCollectionRDD[53] at parallelize at <console>:98
df: org.apache.spark.sql.DataFrame = [candidate_id: int, skills: string]
df1: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [candidate_id: int]


In [16]:
println("Using Spark SQL -------- ")

df.createOrReplaceTempView("candidates")

val df2 = spark.sql("""
    SELECT
        candidate_id
    FROM candidates
    WHERE skills IN("Python","Tableau","PostgreSQL")
    GROUP BY candidate_id
    HAVING COUNT(skills)=3
    ORDER BY candidate_id
""")

df2.explain()
df2.show(false)



Using Spark SQL -------- 
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [candidate_id#185 ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(candidate_id#185 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [plan_id=643]
      +- Project [candidate_id#185]
         +- Filter (count(skills#186)#225L = 3)
            +- HashAggregate(keys=[candidate_id#185], functions=[count(skills#186)])
               +- Exchange hashpartitioning(candidate_id#185, 200), ENSURE_REQUIREMENTS, [plan_id=638]
                  +- HashAggregate(keys=[candidate_id#185], functions=[partial_count(skills#186)])
                     +- Filter skills#186 IN (Python,Tableau,PostgreSQL)
                        +- Scan ExistingRDD[candidate_id#185,skills#186]


+------------+
|candidate_id|
+------------+
|123         |
+------------+



df2: org.apache.spark.sql.DataFrame = [candidate_id: int]
