In [1]:
# enable pyspark
import findspark
findspark.init()

In [3]:
'''
Scripts instantiates a SparkSession locally with 8 worker threads.
'''
appName = "Parquet Schema Merge"
master = "local[8]"
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Create Spark session
conf = SparkConf().setMaster(master)
spark = SparkSession.builder.config(conf=conf) \
    .enableHiveSupport() \
    .getOrCreate()
# INFO/WARN/DEBUG
# https://kontext.tech/column/spark/457/tutorial-turn-off-info-logs-in-spark
spark.sparkContext.setLogLevel("INFO")

Code snippet simply create three dataframes from Python dictionary list. The schema for the data frame will be inferred automatically though the recommended approach is to specify the schema manually. 

In [4]:
data1 = [{"id": 1, "attr0": "Attr 0"}, 
{"id": 2, "attr0": "Attr 0"}]
df1 = spark.createDataFrame(data1)

data2 = [{"id": 1, "attr0": "Attr 0", "attr1": "Attr 1"}, 
{"id": 2, "attr0": "Attr 0", "attr1": "Attr 1"}]
df2 = spark.createDataFrame(data2)

data3= [{"id": 1, "attr1": "Attr 1"}, 
{"id": 2, "attr1": "Attr 1"}]
df3 = spark.createDataFrame(data3)

df1.write.mode('overwrite').parquet('data/partition-date=2020-01-01')
df2.write.mode('overwrite').parquet('data/partition-date=2020-01-02')
df3.write.mode('overwrite').parquet('data/partition-date=2020-01-03')

### See info logs
Make sure you set log level to INFO

- Dataframe1
<pre>
22/02/09 06:50:46 INFO ParquetWriteSupport: Initialized Parquet WriteSupport with Catalyst schema:
{
  "type" : "struct",
  "fields" : [ {
    "name" : "attr0",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "id",
    "type" : "long",
    "nullable" : true,
    "metadata" : { }
  } ]
}
and corresponding Parquet message type:
message spark_schema {
  optional binary attr0 (UTF8);
  optional int64 id;
}
</pre>

- DataFrame2
<pre>
22/02/09 06:50:56 INFO ParquetWriteSupport: Initialized Parquet WriteSupport with Catalyst schema:
{
  "type" : "struct",
  "fields" : [ {
    "name" : "attr0",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "attr1",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "id",
    "type" : "long",
    "nullable" : true,
    "metadata" : { }
  } ]
}
and corresponding Parquet message type:
message spark_schema {
  optional binary attr0 (UTF8);
  optional binary attr1 (UTF8);
  optional int64 id;
}
</pre>

- DataFrame3
<pre>
22/02/09 06:51:07 INFO ParquetWriteSupport: Initialized Parquet WriteSupport with Catalyst schema:
{
  "type" : "struct",
  "fields" : [ {
    "name" : "attr1",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "id",
    "type" : "long",
    "nullable" : true,
    "metadata" : { }
  } ]
}
and corresponding Parquet message type:
message spark_schema {
  optional binary attr1 (UTF8);
  optional int64 id;
}
</pre>

### Schema merge at time of Read
Spark read with merge schema option

In [31]:
df = spark.read.option("mergeSchema","true").parquet("data")
df.show()

+------+---+------+--------------+
| attr0| id| attr1|partition-date|
+------+---+------+--------------+
|Attr 0|  1|Attr 1|    2020-01-02|
|Attr 0|  2|Attr 1|    2020-01-02|
|Attr 0|  1|  null|    2020-01-01|
|Attr 0|  2|  null|    2020-01-01|
|  null|  1|Attr 1|    2020-01-03|
|  null|  2|Attr 1|    2020-01-03|
+------+---+------+--------------+



>If we don't specify mergeSchema option, the new attributes will not be picked up.
Without schema merge, the schema will be decided randomly based on on of the 
partition files. 

In [32]:
df = spark.read.parquet("data")
df.show()

+------+---+--------------+
| attr0| id|partition-date|
+------+---+--------------+
|Attr 0|  1|    2020-01-02|
|Attr 0|  2|    2020-01-02|
|Attr 0|  1|    2020-01-01|
|Attr 0|  2|    2020-01-01|
|  null|  1|    2020-01-03|
|  null|  2|    2020-01-03|
+------+---+--------------+



### Use Spark SQL
- Alternatively, we can also use Spark SQL option to enable schema merge.
- The result is same as using mergeSchema option. The advantage of using this option is that it is effective in the whole Spark session instead of specifying it in all read functions like spark.read.option("mergeSchema","true").parquet("data")

In [34]:
spark.conf.set("spark.sql.parquet.mergeSchema", "true")
df = spark.read.parquet("data")
df.show()
df.printSchema()

+------+---+------+--------------+
| attr0| id| attr1|partition-date|
+------+---+------+--------------+
|Attr 0|  1|Attr 1|    2020-01-02|
|Attr 0|  2|Attr 1|    2020-01-02|
|Attr 0|  1|  null|    2020-01-01|
|Attr 0|  2|  null|    2020-01-01|
|  null|  1|Attr 1|    2020-01-03|
|  null|  2|Attr 1|    2020-01-03|
+------+---+------+--------------+

root
 |-- attr0: string (nullable = true)
 |-- id: long (nullable = true)
 |-- attr1: string (nullable = true)
 |-- partition-date: date (nullable = true)



### Schema merge at the time of  write/create
- Schema merge at the time of write
- Schema change with no data type conflict

In [35]:
df1.write.format("delta").mode('overwrite').option("mergeSchema", True).parquet('data/partition-date=2020-01-01')
df2.write.format("delta").mode('overwrite').option("mergeSchema", True).parquet('data/partition-date=2020-01-02')
df3.write.format("delta").mode('overwrite').option("mergeSchema", True).parquet('data/partition-date=2020-01-03')

In [36]:
df = spark.read.parquet("data")
df.show()
df.printSchema()

+------+---+------+--------------+
| attr0| id| attr1|partition-date|
+------+---+------+--------------+
|Attr 0|  1|Attr 1|    2020-01-02|
|Attr 0|  2|Attr 1|    2020-01-02|
|Attr 0|  1|  null|    2020-01-01|
|Attr 0|  2|  null|    2020-01-01|
|  null|  1|Attr 1|    2020-01-03|
|  null|  2|Attr 1|    2020-01-03|
+------+---+------+--------------+

root
 |-- attr0: string (nullable = true)
 |-- id: long (nullable = true)
 |-- attr1: string (nullable = true)
 |-- partition-date: date (nullable = true)



### Spark Merge Two DataFrames with Different Columns

In [39]:
data1 = [{"name": "James", "dept": "Sales", "age": 34}, 
         {"name": "Michael", "dept": "Sales", "age": 56},
         {"name": "Robert", "dept": "Sales", "age": 34},
         {"name": "Maria", "dept": "Finance", "age": 24}]
df1 = spark.createDataFrame(data1)
df1.printSchema()


#val data2=Seq(("James","Sales","NY",9000),("Maria","Finance","CA",9000),
#              ("Jen","Finance","NY",7900),("Jeff","Marketing","CA",8000))
# data2.toDF("name","dept","state","salary")

data2 = [{"name": "James", "dept": "Sales", "state": 'NY', "salary":9000}, 
         {"name": "Maria", "dept": "Finance", "state": 'CA', "salary":9000},
         {"name": "Jen", "dept": "Finance", "state": 'NY', "salary":7900},
         {"name": "Jeff", "dept": "Marketing", "state": 'CA', "salary":8000}]    


df2 = spark.createDataFrame(data2)
df2.printSchema()

root
 |-- age: long (nullable = true)
 |-- dept: string (nullable = true)
 |-- name: string (nullable = true)

root
 |-- dept: string (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- state: string (nullable = true)



### dataframe union vs unionByName

- union : this function resolves columns by position (not by name). That is the reason why when you union 2 dataframes the values may be swapped and one column from second dataframe is missing.

- unionByName : You should use unionByName, but this functions requires both dataframe to have the same structure.
See below, simple code to harmonize the structure of your dataframes and then do the union(ByName).

In [49]:
from pyspark.sql import DataFrame
from pyspark.sql import functions as F

# harmonize dataframes
def add_missing_columns(df: DataFrame, ref_df: DataFrame) -> DataFrame:
    """Add missing columns from ref_df to df

    Args:
        df (DataFrame): dataframe with missing columns
        ref_df (DataFrame): referential dataframe

    Returns:
        DataFrame: df with additionnal columns from ref_df
    """
    for col in ref_df.schema:
        if col.name not in df.columns:
            df = df.withColumn(col.name, F.lit(None).cast(col.dataType))

    return df



### Adding missing columns to dataframe1, df1 and dataframe2, df2

In [51]:
df1 = add_missing_columns(df1, df2)
df2 = add_missing_columns(df2, df1)

df_result = df1.unionByName(df2)
df_result.printSchema()
df_result.show()

root
 |-- age: long (nullable = true)
 |-- dept: string (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- state: string (nullable = true)

+----+---------+-------+------+-----+
| age|     dept|   name|salary|state|
+----+---------+-------+------+-----+
|  34|    Sales|  James|  null| null|
|  56|    Sales|Michael|  null| null|
|  34|    Sales| Robert|  null| null|
|  24|  Finance|  Maria|  null| null|
|null|    Sales|  James|  9000|   NY|
|null|  Finance|  Maria|  9000|   CA|
|null|  Finance|    Jen|  7900|   NY|
|null|Marketing|   Jeff|  8000|   CA|
+----+---------+-------+------+-----+



### PySpark Merge Two DataFrames with Different Columns
Harmonize dataframes

In [52]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

#Create DataFrame df1 with columns name,dept & age
data = [("James","Sales",34), ("Michael","Sales",56), \
    ("Robert","Sales",30), ("Maria","Finance",24) ]
columns= ["name","dept","age"]
df1 = spark.createDataFrame(data = data, schema = columns)
df1.printSchema()

#Create DataFrame df1 with columns name,dep,state & salary
data2=[("James","Sales","NY",9000),("Maria","Finance","CA",9000), \
    ("Jen","Finance","NY",7900),("Jeff","Marketing","CA",8000)]
columns2= ["name","dept","state","salary"]
df2 = spark.createDataFrame(data = data2, schema = columns2)
df2.printSchema()

root
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- age: long (nullable = true)

root
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)



Now **add missing columns** ‘state‘ and ‘salary‘ to df1 and ‘age‘ to df2 with null values.

In [53]:
#Add missing columns 'state' & 'salary' to df1
from pyspark.sql.functions import lit
for column in [column for column in df2.columns if column not in df1.columns]:
    df1 = df1.withColumn(column, lit(None))

df1.printSchema()    
    
#Add missing column 'age' to df2
for column in [column for column in df1.columns if column not in df2.columns]:
    df2 = df2.withColumn(column, lit(None))
df2.printSchema()  
    
#Finally join two dataframe's df1 & df2 by name
merged_df=df1.unionByName(df2)
merged_df.show()

root
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- age: long (nullable = true)
 |-- state: null (nullable = true)
 |-- salary: null (nullable = true)

root
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: null (nullable = true)

+-------+---------+----+-----+------+
|   name|     dept| age|state|salary|
+-------+---------+----+-----+------+
|  James|    Sales|  34| null|  null|
|Michael|    Sales|  56| null|  null|
| Robert|    Sales|  30| null|  null|
|  Maria|  Finance|  24| null|  null|
|  James|    Sales|null|   NY|  9000|
|  Maria|  Finance|null|   CA|  9000|
|    Jen|  Finance|null|   NY|  7900|
|   Jeff|Marketing|null|   CA|  8000|
+-------+---------+----+-----+------+

