In [20]:
// Assume you're given two tables containing data about Facebook Pages and their respective likes (as in "Like a Facebook Page").

// Write a query to return the IDs of the Facebook pages that have zero likes. The output should be sorted in ascending order based on the page IDs.

// Example Output:
// page_id
// 20701

import org.apache.spark.sql.functions._
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructType, StructField, IntegerType, StringType}


val pages_data = Seq(
  Row(20001,"SQL Solutions"),
  Row(20045,"Brain Exercises"),
  Row(20701,"Tips for Data Analysts")
)

val pages_schema = StructType(Array(
    StructField("page_id", IntegerType),
    StructField("page_name", StringType)
  )
)

val page_likes_data = Seq(
  Row(111,20001,"04/08/2022 00:00:00"),
  Row(121,20045,"03/12/2022 00:00:00"),
  Row(156,20001,"07/25/2022 00:00:00")
)

val page_likes_schema = StructType(Array(
  StructField("user_id", IntegerType),
  StructField("page_id", IntegerType),
  StructField("liked_date", StringType)
))

val pages_rdd = spark.sparkContext.parallelize(pages_data)
val pages_df = spark.createDataFrame(pages_rdd, pages_schema)

val likes_rdd = spark.sparkContext.parallelize(page_likes_data)
val likes_df = spark.createDataFrame(likes_rdd, page_likes_schema)


pages_df.show(false)
likes_df.show(false)


println("Using Dataframes -------- ")

val df1 = pages_df.as("p").join(likes_df.as("l"), $"l.page_id" === $"p.page_id", "left"
                               ).where($"l.page_id".isNull).select("p.page_id")


df1.explain()
df1.show(false)


+-------+----------------------+
|page_id|page_name             |
+-------+----------------------+
|20001  |SQL Solutions         |
|20045  |Brain Exercises       |
|20701  |Tips for Data Analysts|
+-------+----------------------+

+-------+-------+-------------------+
|user_id|page_id|liked_date         |
+-------+-------+-------------------+
|111    |20001  |04/08/2022 00:00:00|
|121    |20045  |03/12/2022 00:00:00|
|156    |20001  |07/25/2022 00:00:00|
+-------+-------+-------------------+

Using Dataframes -------- 
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [page_id#648]
   +- Filter isnull(page_id#656)
      +- SortMergeJoin [page_id#648], [page_id#656], LeftOuter
         :- Sort [page_id#648 ASC NULLS FIRST], false, 0
         :  +- Exchange hashpartitioning(page_id#648, 200), ENSURE_REQUIREMENTS, [plan_id=1057]
         :     +- Project [page_id#648]
         :        +- Scan ExistingRDD[page_id#648,page_name#649]
         +- Sort [page_id#656 ASC NULLS

import org.apache.spark.sql.functions._
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructType, StructField, IntegerType, StringType}
pages_data: Seq[org.apache.spark.sql.Row] = List([20001,SQL Solutions], [20045,Brain Exercises], [20701,Tips for Data Analysts])
pages_schema: org.apache.spark.sql.types.StructType = StructType(StructField(page_id,IntegerType,true),StructField(page_name,StringType,true))
page_likes_data: Seq[org.apache.spark.sql.Row] = List([111,20001,04/08/2022 00:00:00], [121,20045,03/12/2022 00:00:00], [156,20001,07/25/2022 00:00:00])
page_likes_schema: org.apache.spark.sql.types.StructType = StructType(StructField(user_id,IntegerType,true),StructField(page_id,IntegerType,true),StructField(liked_date,StringType,true))
pages_rdd: org.apache.spark....


In [24]:
println("Using Spark SQL -------- ")

pages_df.createOrReplaceTempView("pages")
likes_df.createOrReplaceTempView("likes")

val df2 = spark.sql("""
    SELECT
        p.page_id
    FROM pages p
    LEFT JOIN likes l
    ON p.page_id=l.page_id
    WHERE l.page_id IS NULL
""")

df2.explain()

df2.show(false)



Using Spark SQL -------- 
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [page_id#648]
   +- Filter isnull(page_id#656)
      +- SortMergeJoin [page_id#648], [page_id#656], LeftOuter
         :- Sort [page_id#648 ASC NULLS FIRST], false, 0
         :  +- Exchange hashpartitioning(page_id#648, 200), ENSURE_REQUIREMENTS, [plan_id=1695]
         :     +- Project [page_id#648]
         :        +- Scan ExistingRDD[page_id#648,page_name#649]
         +- Sort [page_id#656 ASC NULLS FIRST], false, 0
            +- Exchange hashpartitioning(page_id#656, 200), ENSURE_REQUIREMENTS, [plan_id=1696]
               +- Project [page_id#656]
                  +- Filter isnotnull(page_id#656)
                     +- Scan ExistingRDD[user_id#655,page_id#656,liked_date#657]


+-------+
|page_id|
+-------+
|20701  |
+-------+



df2: org.apache.spark.sql.DataFrame = [page_id: int]
