In [7]:
/* Problem Statement:
- For pairs of brands in the same year (e.g. apple/samsung/2020 and samsung/apple/2020) 
    - if custom1 = custom3 and custom2 = custom4 : then keep only one pair

- For pairs of brands in the same year 
    - if custom1 != custom3 OR custom2 != custom4 : then keep both pairs

- For brands that do not have pairs in the same year : keep those rows as well
*/

import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions._

val data = Seq(
  ("apple","samsung",2020,1,2,1,2),
  ("samsung","apple",2020,1,2,1,2),
  ("apple","samsung",2021,1,2,5,3),
  ("samsung","apple",2021,5,3,1,2),
  ("google",null.asInstanceOf[String],2020,5,9,null.asInstanceOf[Int],null.asInstanceOf[Int]),
  ("oneplus","nothing",2020,5,9,6,3)
).toDF("brand1","brand2","year","custom1","custom2","custom3","custom4")

data.show(false)


+-------+-------+----+-------+-------+-------+-------+
|brand1 |brand2 |year|custom1|custom2|custom3|custom4|
+-------+-------+----+-------+-------+-------+-------+
|apple  |samsung|2020|1      |2      |1      |2      |
|samsung|apple  |2020|1      |2      |1      |2      |
|apple  |samsung|2021|1      |2      |5      |3      |
|samsung|apple  |2021|5      |3      |1      |2      |
|google |null   |2020|5      |9      |0      |0      |
|oneplus|nothing|2020|5      |9      |6      |3      |
+-------+-------+----+-------+-------+-------+-------+



import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions._
data: org.apache.spark.sql.DataFrame = [brand1: string, brand2: string ... 5 more fields]


In [10]:

data.createOrReplaceTempView("brand_tbl")

val df1 = spark.sql("""
WITH CTE1 AS (
    SELECT 
        *,
        CASE WHEN brand1<brand2 THEN CONCAT(brand1,brand2,year) ELSE CONCAT(brand2,brand1,year) END pairs
    FROM brand_tbl  
), 
CTE2 AS (
    SELECT 
        *,
        ROW_NUMBER() OVER(PARTITION BY pairs order by pairs) rn
    FROM CTE1
)  
SELECT 
    brand1,brand2,year,custom1,custom2,custom3,custom4
FROM CTE2 WHERE rn=1
OR custom1 <> custom3 and custom2 <> custom4
ORDER BY brand1, year
""")

df1.show(false)
df1.explain()

+-------+-------+----+-------+-------+-------+-------+
|brand1 |brand2 |year|custom1|custom2|custom3|custom4|
+-------+-------+----+-------+-------+-------+-------+
|apple  |samsung|2020|1      |2      |1      |2      |
|apple  |samsung|2021|1      |2      |5      |3      |
|google |null   |2020|5      |9      |0      |0      |
|oneplus|nothing|2020|5      |9      |6      |3      |
|samsung|apple  |2021|5      |3      |1      |2      |
+-------+-------+----+-------+-------+-------+-------+

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [brand1#87 ASC NULLS FIRST, year#89 ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(brand1#87 ASC NULLS FIRST, year#89 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [plan_id=293]
      +- Project [brand1#87, brand2#88, year#89, custom1#90, custom2#91, custom3#92, custom4#93]
         +- Filter ((rn#183 = 1) OR (NOT (custom1#90 = custom3#92) AND NOT (custom2#91 = custom4#93)))
            +- Window [row_number() windowspecdefini

df1: org.apache.spark.sql.DataFrame = [brand1: string, brand2: string ... 5 more fields]


In [17]:
val df2 = data.withColumn("pairs", when($"brand1" < $"brand2", concat($"brand1",$"brand2",$"year")
                             ).otherwise(concat($"brand2",$"brand1",$"year")) 
    ).withColumn("rn",row_number().over(Window.partitionBy($"pairs").orderBy($"pairs"))
    ).filter($"rn" === 1 || ($"custom1" =!= $"custom3" && $"custom2" =!= $"custom4")
    ).select("brand1","brand2","year","custom1","custom2","custom3","custom4"
    ).orderBy($"brand1",$"year")


df2.show(false)
df2.explain()

+-------+-------+----+-------+-------+-------+-------+
|brand1 |brand2 |year|custom1|custom2|custom3|custom4|
+-------+-------+----+-------+-------+-------+-------+
|apple  |samsung|2020|1      |2      |1      |2      |
|apple  |samsung|2021|1      |2      |5      |3      |
|google |null   |2020|5      |9      |0      |0      |
|oneplus|nothing|2020|5      |9      |6      |3      |
|samsung|apple  |2021|5      |3      |1      |2      |
+-------+-------+----+-------+-------+-------+-------+

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [brand1#87 ASC NULLS FIRST, year#89 ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(brand1#87 ASC NULLS FIRST, year#89 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [plan_id=533]
      +- Project [brand1#87, brand2#88, year#89, custom1#90, custom2#91, custom3#92, custom4#93]
         +- Filter ((rn#361 = 1) OR (NOT (custom1#90 = custom3#92) AND NOT (custom2#91 = custom4#93)))
            +- Window [row_number() windowspecdefini

df2: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [brand1: string, brand2: string ... 5 more fields]


In [None]:
// SQL stmts

// CREATE TABLE brands 
// (
//     brand1      VARCHAR(20),
//     brand2      VARCHAR(20),
//     year        INT,
//     custom1     INT,
//     custom2     INT,
//     custom3     INT,
//     custom4     INT
// );
// INSERT INTO brands VALUES ('apple', 'samsung', 2020, 1, 2, 1, 2);
// INSERT INTO brands VALUES ('samsung', 'apple', 2020, 1, 2, 1, 2);
// INSERT INTO brands VALUES ('apple', 'samsung', 2021, 1, 2, 5, 3);
// INSERT INTO brands VALUES ('samsung', 'apple', 2021, 5, 3, 1, 2);
// INSERT INTO brands VALUES ('google', NULL, 2020, 5, 9, NULL, NULL);
// INSERT INTO brands VALUES ('oneplus', 'nothing', 2020, 5, 9, 6, 3);