In [87]:
import spark.implicits._
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types._
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.Row


def renameColumns(df: DataFrame, prefix:String) = {
  val cols= df.columns.map(t=> col(t).as(prefix + "_" + t))
  df.select(cols: _*)
}

val seq1 = Seq(("1", "test", "value"), ("2", "test", "value3")).toDF("id", "key", "value")
val df1 = renameColumns(seq1, "left")

val seq2 = Seq(("1", "test", "value"), ("2", "test2", "value4")).toDF("id", "key", "value")
val df2 = renameColumns(seq2, "right")

val joined = df1.join(df2, df1.col("left_id") === df2.col("right_id"), "outer")

joined.show(false)


+-------+--------+----------+--------+---------+-----------+
|left_id|left_key|left_value|right_id|right_key|right_value|
+-------+--------+----------+--------+---------+-----------+
|1      |test    |value     |1       |test     |value      |
|2      |test    |value3    |2       |test2    |value4     |
+-------+--------+----------+--------+---------+-----------+



import spark.implicits._
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types._
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.Row
renameColumns: (df: org.apache.spark.sql.DataFrame, prefix: String)org.apache.spark.sql.DataFrame
seq1: org.apache.spark.sql.DataFrame = [id: string, key: string ... 1 more field]
df1: org.apache.spark.sql.DataFrame = [left_id: string, left_key: string ... 1 more field]
seq2: org.apache.spark.sql.DataFrame = [id: string, key: string ... 1 more field]
df2: org.apache.spark.sql.DataFrame = [right_id: string, right_key: string ... 1 more field]
joined: org.apache.spark.sql.DataFrame = [left_id: string, left_key: string ... 4 more fields]


In [129]:
implicit val enc = RowEncoder(joined.schema)

val diff = joined.map(eachRow => {
  val values = eachRow.getValuesMap(eachRow.schema.fieldNames)
  val left: Map[String, Any] = values.filterKeys(t => t.contains("left"))
  val right: Map[String, Any] = values.filterKeys(t => t.contains("right"))

  val row = eachRow.schema.fieldNames.map(fieldName => {
    if (fieldName.contains("left_")) {
      val leftVal = left.getOrElse(fieldName, "")
      val rightVal = right.getOrElse("right_" + fieldName.stripPrefix("left_"), "")
      if (leftVal == rightVal) "" else leftVal
    }
    else {
      val leftVal = left.getOrElse("left_" + fieldName.stripPrefix("right_"), "")
      val rightVal = right.getOrElse(fieldName, "")
      if (leftVal == rightVal) "" else rightVal
    }
  })
  // (eachRow.getAs[String]("left_id"), eachRow.getAs[String]("right_id"), eachRow.getAs[String]("right_key"))
  Row.fromSeq(row)
})

diff.show(false)

+-------+--------+----------+--------+---------+-----------+
|left_id|left_key|left_value|right_id|right_key|right_value|
+-------+--------+----------+--------+---------+-----------+
|       |        |          |        |         |           |
|       |test    |value3    |        |test2    |value4     |
+-------+--------+----------+--------+---------+-----------+



enc: org.apache.spark.sql.catalyst.encoders.ExpressionEncoder[org.apache.spark.sql.Row] = class[left_id[0]: string, left_key[0]: string, left_value[0]: string, right_id[0]: string, right_key[0]: string, right_value[0]: string]
diff: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [left_id: string, left_key: string ... 4 more fields]


In [130]:
joined.map(t => {
    val values = t.getValuesMap(t.schema.fieldNames)
    values
}).show(false)

+-----------------------------------------------------------------------------------------------------------------+
|value                                                                                                            |
+-----------------------------------------------------------------------------------------------------------------+
|{right_value -> null, left_key -> null, left_value -> null, right_key -> null, right_id -> null, left_id -> null}|
|{right_value -> null, left_key -> null, left_value -> null, right_key -> null, right_id -> null, left_id -> null}|
+-----------------------------------------------------------------------------------------------------------------+



In [131]:
val df = df1.join(df2, df1.col("left_id") === df2.col("right_id"), "outer")

df.map(t => {
    t.schema.fieldNames.map(t1 => {
      t.getAs[String](0)
  })
})

df: org.apache.spark.sql.DataFrame = [left_id: string, left_key: string ... 4 more fields]
res112: org.apache.spark.sql.Dataset[Array[String]] = [value: array<string>]


In [128]:
val df = df1.join(df2, df1.col("left_id") === df2.col("right_id"), "outer")

df.map(t => {
    t.schema.fieldNames.map(t1 => {
     t1
  })
}).show(false)

+-----------------------------------------------------------------+
|value                                                            |
+-----------------------------------------------------------------+
|[left_id, left_key, left_value, right_id, right_key, right_value]|
|[left_id, left_key, left_value, right_id, right_key, right_value]|
+-----------------------------------------------------------------+



df: org.apache.spark.sql.DataFrame = [left_id: string, left_key: string ... 4 more fields]
