In [1]:
# Enable PySpark: initialise findspark before the `pyspark` imports used in
# the following cells.
import findspark
findspark.init()

In [3]:
'''
Instantiate a local SparkSession with 8 worker threads.
'''
appName = "Parquet Schema Merge"
master = "local[8]"
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Build the configuration with both the application name and the master URL,
# then create (or reuse) the Spark session with Hive support enabled.
conf = SparkConf().setAppName(appName).setMaster(master)
spark = (
    SparkSession.builder
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)
# Log level (INFO/WARN/DEBUG). INFO is used here so that the
# ParquetWriteSupport schema messages are printed when DataFrames are written.
# https://kontext.tech/column/spark/457/tutorial-turn-off-info-logs-in-spark
spark.sparkContext.setLogLevel("INFO")

The code snippet below simply creates three DataFrames from lists of Python dictionaries. The schema of each DataFrame is inferred automatically, although the recommended approach is to specify the schema manually. 

In [4]:
# Three small datasets with overlapping but different columns:
#   data1 -> id, attr0
#   data2 -> id, attr0, attr1
#   data3 -> id, attr1
data1 = [
    {"id": 1, "attr0": "Attr 0"},
    {"id": 2, "attr0": "Attr 0"},
]
data2 = [
    {"id": 1, "attr0": "Attr 0", "attr1": "Attr 1"},
    {"id": 2, "attr0": "Attr 0", "attr1": "Attr 1"},
]
data3 = [
    {"id": 1, "attr1": "Attr 1"},
    {"id": 2, "attr1": "Attr 1"},
]

# One DataFrame per dataset; the schema is inferred from the dictionaries.
df1, df2, df3 = (spark.createDataFrame(rows) for rows in (data1, data2, data3))

# Write each DataFrame as Parquet into its own partition-date=... directory
# under data/, replacing whatever is already there.
for frame, target in (
    (df1, 'data/partition-date=2020-01-01'),
    (df2, 'data/partition-date=2020-01-02'),
    (df3, 'data/partition-date=2020-01-03'),
):
    frame.write.mode('overwrite').parquet(target)

### See info logs
Make sure the log level is set to INFO; the schema messages below are logged at INFO level.

- DataFrame1
<pre>
22/02/09 06:50:46 INFO ParquetWriteSupport: Initialized Parquet WriteSupport with Catalyst schema:
{
  "type" : "struct",
  "fields" : [ {
    "name" : "attr0",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "id",
    "type" : "long",
    "nullable" : true,
    "metadata" : { }
  } ]
}
and corresponding Parquet message type:
message spark_schema {
  optional binary attr0 (UTF8);
  optional int64 id;
}
</pre>

- DataFrame2
<pre>
22/02/09 06:50:56 INFO ParquetWriteSupport: Initialized Parquet WriteSupport with Catalyst schema:
{
  "type" : "struct",
  "fields" : [ {
    "name" : "attr0",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "attr1",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "id",
    "type" : "long",
    "nullable" : true,
    "metadata" : { }
  } ]
}
and corresponding Parquet message type:
message spark_schema {
  optional binary attr0 (UTF8);
  optional binary attr1 (UTF8);
  optional int64 id;
}
</pre>

- DataFrame3
<pre>
22/02/09 06:51:07 INFO ParquetWriteSupport: Initialized Parquet WriteSupport with Catalyst schema:
{
  "type" : "struct",
  "fields" : [ {
    "name" : "attr1",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "id",
    "type" : "long",
    "nullable" : true,
    "metadata" : { }
  } ]
}
and corresponding Parquet message type:
message spark_schema {
  optional binary attr1 (UTF8);
  optional int64 id;
}
</pre>

### Schema merge at time of Read
Read the data with Spark using the `mergeSchema` option.

In [31]:
# Enable Parquet schema merging for this read only, then load every
# partition directory under data/.
merging_reader = spark.read.options(mergeSchema="true")
df = merging_reader.parquet("data")
df.show()

+------+---+------+--------------+
| attr0| id| attr1|partition-date|
+------+---+------+--------------+
|Attr 0|  1|Attr 1|    2020-01-02|
|Attr 0|  2|Attr 1|    2020-01-02|
|Attr 0|  1|  null|    2020-01-01|
|Attr 0|  2|  null|    2020-01-01|
|  null|  1|Attr 1|    2020-01-03|
|  null|  2|Attr 1|    2020-01-03|
+------+---+------+--------------+



>If we don't specify the `mergeSchema` option, the new attributes will not be picked up.
Without schema merge, the schema will be decided randomly based on one of the 
partition files. 

In [32]:
# Plain read: no mergeSchema option is passed to the reader here.
plain_reader = spark.read
df = plain_reader.parquet("data")
df.show()

+------+---+--------------+
| attr0| id|partition-date|
+------+---+--------------+
|Attr 0|  1|    2020-01-02|
|Attr 0|  2|    2020-01-02|
|Attr 0|  1|    2020-01-01|
|Attr 0|  2|    2020-01-01|
|  null|  1|    2020-01-03|
|  null|  2|    2020-01-03|
+------+---+--------------+



### Use Spark SQL
- Alternatively, we can use a Spark SQL configuration option to enable schema merge.
- The result is the same as using the `mergeSchema` read option. The advantage of this approach is that it applies to the whole Spark session, instead of having to be specified in every read call like spark.read.option("mergeSchema","true").parquet("data")

In [34]:
# Turn on Parquet schema merging through the session configuration, so the
# read below needs no per-read option.
MERGE_SCHEMA_KEY = "spark.sql.parquet.mergeSchema"
spark.conf.set(MERGE_SCHEMA_KEY, "true")

df = spark.read.parquet("data")
df.show()
df.printSchema()

+------+---+------+--------------+
| attr0| id| attr1|partition-date|
+------+---+------+--------------+
|Attr 0|  1|Attr 1|    2020-01-02|
|Attr 0|  2|Attr 1|    2020-01-02|
|Attr 0|  1|  null|    2020-01-01|
|Attr 0|  2|  null|    2020-01-01|
|  null|  1|Attr 1|    2020-01-03|
|  null|  2|Attr 1|    2020-01-03|
+------+---+------+--------------+

root
 |-- attr0: string (nullable = true)
 |-- id: long (nullable = true)
 |-- attr1: string (nullable = true)
 |-- partition-date: date (nullable = true)



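### Schema merge at the time of write/create
- Schema merge at the time of write
- Schema change with no data type conflict
- Note: the `mergeSchema` write option is honored by Delta Lake tables. The plain Parquet writer does not merge schemas on write, so the merged result shown below comes from read-side schema merging (enabled earlier for the whole session).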

In [35]:
# NOTE: DataFrameWriter.parquet() always writes plain Parquet files, so a
# preceding .format("delta") would be overridden, and the Parquet writer does
# not act on a "mergeSchema" write option (that option belongs to Delta Lake).
# These writes therefore simply overwrite each partition directory with the
# DataFrame's own schema; any schema merging happens when the data is read.
for frame, target in (
    (df1, 'data/partition-date=2020-01-01'),
    (df2, 'data/partition-date=2020-01-02'),
    (df3, 'data/partition-date=2020-01-03'),
):
    frame.write.mode('overwrite').parquet(target)

In [36]:
# Request schema merging explicitly on this read so the merged columns shown
# below do not depend on the session-level spark.sql.parquet.mergeSchema
# setting still being in effect from an earlier cell.
df = spark.read.option("mergeSchema", "true").parquet("data")
df.show()
df.printSchema()

+------+---+------+--------------+
| attr0| id| attr1|partition-date|
+------+---+------+--------------+
|Attr 0|  1|Attr 1|    2020-01-02|
|Attr 0|  2|Attr 1|    2020-01-02|
|Attr 0|  1|  null|    2020-01-01|
|Attr 0|  2|  null|    2020-01-01|
|  null|  1|Attr 1|    2020-01-03|
|  null|  2|Attr 1|    2020-01-03|
+------+---+------+--------------+

root
 |-- attr0: string (nullable = true)
 |-- id: long (nullable = true)
 |-- attr1: string (nullable = true)
 |-- partition-date: date (nullable = true)

