In [8]:
// Schema merging is supported by Parquet, but not by text-based formats such as CSV or plain text.
// The mergeSchema option (spark.sql.parquet.mergeSchema) reconciles the columns into one schema,
// even when they are spread across different files.

import org.apache.spark.sql.SaveMode

// Build two small DataFrames with different column sets and write each one
// into its own partition directory (key=1, key=2) under the same table root.

// Partition key=1: the integers 1..5 paired with their squares.
val squaresDF = {
  val squareRows = spark.sparkContext.parallelize(1 to 5).map { n => (n, n * n) }
  squareRows.toDF("value", "square")
}
squaresDF.write.mode(SaveMode.Overwrite).parquet("data/test_table/key=1")

// Partition key=2: the integers 6..10 with an extra column and their cubes.
// This partition has no "square" column and adds "dummy_col" and "cube".
val cubesDF = {
  val cubeRows = spark.sparkContext.parallelize(6 to 10).map { n => (n, n + 1, n * n * n) }
  cubeRows.toDF("value", "dummy_col", "cube")
}
cubesDF.write.mode(SaveMode.Overwrite).parquet("data/test_table/key=2")


// Print the schema and the untruncated rows of each input DataFrame.
squaresDF.printSchema()
squaresDF.show(truncate = false)

cubesDF.printSchema()
cubesDF.show(truncate = false)

root
 |-- value: integer (nullable = false)
 |-- square: integer (nullable = false)

+-----+------+
|value|square|
+-----+------+
|1    |1     |
|2    |4     |
|3    |9     |
|4    |16    |
|5    |25    |
+-----+------+

root
 |-- value: integer (nullable = false)
 |-- dummy_col: integer (nullable = false)
 |-- cube: integer (nullable = false)

+-----+---------+----+
|value|dummy_col|cube|
+-----+---------+----+
|6    |7        |216 |
|7    |8        |343 |
|8    |9        |512 |
|9    |10       |729 |
|10   |11       |1000|
+-----+---------+----+



import org.apache.spark.sql.SaveMode
squaresDF: org.apache.spark.sql.DataFrame = [value: int, square: int]
cubesDF: org.apache.spark.sql.DataFrame = [value: int, dummy_col: int ... 1 more field]


In [9]:

// Read the table root with schema merging enabled, so the columns of both
// partition directories are combined into a single schema.
val mergedDF = {
  val mergingReader = spark.read.option("mergeSchema", "true")
  mergingReader.parquet("data/test_table")
}

// Show the untruncated rows first, then the merged schema.
mergedDF.show(truncate = false)
mergedDF.printSchema()


+-----+------+---------+----+---+
|value|square|dummy_col|cube|key|
+-----+------+---------+----+---+
|9    |null  |10       |729 |2  |
|10   |null  |11       |1000|2  |
|8    |null  |9        |512 |2  |
|6    |null  |7        |216 |2  |
|7    |null  |8        |343 |2  |
|4    |16    |null     |null|1  |
|5    |25    |null     |null|1  |
|2    |4     |null     |null|1  |
|1    |1     |null     |null|1  |
|3    |9     |null     |null|1  |
+-----+------+---------+----+---+

root
 |-- value: integer (nullable = true)
 |-- square: integer (nullable = true)
 |-- dummy_col: integer (nullable = true)
 |-- cube: integer (nullable = true)
 |-- key: integer (nullable = true)



mergedDF: org.apache.spark.sql.DataFrame = [value: int, square: int ... 3 more fields]
