In [7]:
import spark.implicits._

// Friendship pairs as (user_id, friend_id). Every friendship in this sample is listed in
// both directions, e.g. (1, 2) and (2, 1).
val friends_df = Seq(
    1 -> 2, 1 -> 3, 1 -> 4,
    2 -> 1,
    3 -> 1, 3 -> 4,
    4 -> 1, 4 -> 3
).toDF("user_id", "friend_id")

// Page likes as (user_id, page_id).
val likes_df = Seq(
    1 -> "A", 1 -> "B", 1 -> "C",
    2 -> "A",
    3 -> "B", 3 -> "C",
    4 -> "B"
).toDF("user_id", "page_id")


// Print both tables in full, without truncating cell contents.
friends_df.show(truncate = false)
likes_df.show(truncate = false)


+-------+---------+
|user_id|friend_id|
+-------+---------+
|1      |2        |
|1      |3        |
|1      |4        |
|2      |1        |
|3      |1        |
|3      |4        |
|4      |1        |
|4      |3        |
+-------+---------+

+-------+-------+
|user_id|page_id|
+-------+-------+
|1      |A      |
|1      |B      |
|1      |C      |
|2      |A      |
|3      |B      |
|3      |C      |
|4      |B      |
+-------+-------+



import spark.implicits._
friends_df: org.apache.spark.sql.DataFrame = [user_id: int, friend_id: int]
likes_df: org.apache.spark.sql.DataFrame = [user_id: int, page_id: string]


In [8]:

// Expose both DataFrames to spark.sql under the view names "friends" and "likes".
Seq("friends" -> friends_df, "likes" -> likes_df).foreach {
    case (viewName, frame) => frame.createOrReplaceTempView(viewName)
}


In [15]:
// Page recommendations: for each user, the pages liked by at least one of their friends
// that the user has not liked themselves.
//  - `distinct` collapses a page reached through several friends into a single row
//    (without it, (4, C) is reported twice because friends 1 and 3 both like C).
//  - Ordering by (user_id, page_id) makes the printed order deterministic; ordering by
//    user_id alone leaves rows of the same user in an unspecified order.
spark.sql("""
    with common_likes as (
        select 
            f.user_id,
            l.page_id
        from friends f inner join likes l
        on f.friend_id=l.user_id
    )
    select distinct
        user_id,
        page_id
    from common_likes
    where (user_id, page_id) not in (select user_id, page_id from likes)
    order by user_id, page_id
""").show(false)

+-------+-------+
|user_id|page_id|
+-------+-------+
|2      |B      |
|2      |C      |
|3      |A      |
|4      |A      |
|4      |C      |
+-------+-------+



In [22]:
// Every (user, page) pair where a friend of the user likes the page. A pair appears once
// per friend who likes that page, so duplicates are possible at this stage.
val common_likes_df = (
    friends_df.as("f")
        .join(likes_df.as("l"), $"f.friend_id" === $"l.user_id", "inner")
        .select("f.user_id", "l.page_id")
)

// A left_anti join keeps only the left-side rows that have no match on the right. It yields
// the same rows as the SQL `NOT IN (subquery)` filter only while the compared columns hold
// no NULLs (true for this sample data): NOT IN is null-aware, left_anti is not.
// distinct() removes pages reached through more than one friend, and ordering by both
// columns makes the printed order deterministic.
(
    common_likes_df.as("cl")
        .join(
            likes_df.as("l"),
            $"cl.user_id" === $"l.user_id" && $"cl.page_id" === $"l.page_id",
            "left_anti"
        )
        .distinct()
        .orderBy($"user_id", $"page_id")
        .show(false)
)


+-------+-------+
|user_id|page_id|
+-------+-------+
|2      |B      |
|2      |C      |
|3      |A      |
|4      |A      |
|4      |C      |
+-------+-------+



common_likes_df: org.apache.spark.sql.DataFrame = [user_id: int, page_id: string]
