In [1]:
// Assume you're given a table containing job postings from various companies on the LinkedIn platform. Write a query to retrieve the count of companies that have posted duplicate job listings.

// Definition:

// Duplicate job listings are defined as two job listings within the same company that share identical titles and descriptions.

// Example Output:
// duplicate_companies
// --------------------
// 1

val df = Seq(
  (248,827,"Business Analyst","Business analyst evaluates past and current business data with the primary goal of improving decision-making processes within organizations."),
  (149,845,"Business Analyst","Business analyst evaluates past and current business data with the primary goal of improving decision-making processes within organizations."),
  (945,345,"Data Analyst","Data analyst reviews data to identify key insights into a business's customers and ways the data can be used to solve problems."),
  (164,345,"Data Analyst","Data analyst reviews data to identify key insights into a business's customers and ways the data can be used to solve problems."),
  (172,244,"Data Engineer","Data engineer works in a variety of settings to build systems that collect, manage, and convert raw data into usable information for data scientists and business analysts to interpret.")
).toDF("job_id","company_id","title","description")


df.show(false)


Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.1.5:4040
SparkContext available as 'sc' (version = 3.4.1, master = local[*], app id = local-1703499162287)
SparkSession available as 'spark'


+------+----------+----------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|job_id|company_id|title           |description                                                                                                                                                                             |
+------+----------+----------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|248   |827       |Business Analyst|Business analyst evaluates past and current business data with the primary goal of improving decision-making processes within organizations.                                            |
|149   |845       |Business Analyst|Business analyst evaluates past and current business data with the primary g

df: org.apache.spark.sql.DataFrame = [job_id: int, company_id: int ... 2 more fields]


In [17]:
// Using Dataframe API

val df1 = df.groupBy($"company_id", $"title", $"description").count().where($"count" > 1)

df1.explain()
df1.show(false)


== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Filter (count#312L > 1)
   +- HashAggregate(keys=[company_id#14, title#15, description#16], functions=[count(1)])
      +- Exchange hashpartitioning(company_id#14, title#15, description#16, 200), ENSURE_REQUIREMENTS, [plan_id=859]
         +- HashAggregate(keys=[company_id#14, title#15, description#16], functions=[partial_count(1)])
            +- LocalTableScan [company_id#14, title#15, description#16]


+----------+------------+-------------------------------------------------------------------------------------------------------------------------------+-----+
|company_id|title       |description                                                                                                                    |count|
+----------+------------+-------------------------------------------------------------------------------------------------------------------------------+-----+
|345       |Data Analyst|Data analyst reviews data t

df1: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [company_id: int, title: string ... 2 more fields]


In [14]:
// Using Spark SQL

df.createOrReplaceTempView("jobs")

val df2 = spark.sql("""
    SELECT 
        COUNT(company_id) AS duplicate_companies
    FROM
    (
        SELECT
            company_id,
            title,
            description,
            COUNT(job_id) AS job_count
        FROM jobs
        GROUP BY company_id,title,description
    ) tmp
    WHERE job_count > 1
""")

df2.explain()
df2.show(false)


== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[], functions=[count(1)])
   +- Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=731]
      +- HashAggregate(keys=[], functions=[partial_count(1)])
         +- Project
            +- Filter (job_count#289L > 1)
               +- HashAggregate(keys=[company_id#14, title#15, description#16], functions=[count(1)])
                  +- Exchange hashpartitioning(company_id#14, title#15, description#16, 200), ENSURE_REQUIREMENTS, [plan_id=725]
                     +- HashAggregate(keys=[company_id#14, title#15, description#16], functions=[partial_count(1)])
                        +- LocalTableScan [company_id#14, title#15, description#16]


+-------------------+
|duplicate_companies|
+-------------------+
|1                  |
+-------------------+



df2: org.apache.spark.sql.DataFrame = [duplicate_companies: bigint]
