In [40]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName('demo')
    .getOrCreate()
)

# Load the first comma-delimited CSV, treating its first row as column names.
df1 = spark.read.options(delimiter=",", header=True).csv("input1.csv")
df1.show()

# Load the second CSV the same way.
df2 = spark.read.options(delimiter=",", header=True).csv("input2.csv")
df2.show()

+-------+---+
|   Name|Age|
+-------+---+
| Naresh| 25|
|MichalC| 26|
|  Virat| 45|
|   Demo| 67|
|     Hi| 89|
+-------+---+

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
|  Naresh| 25|     M|
|  Michal| 27|     F|
|   Virat| 29|     M|
|    Demo| 30|     F|
|      Hi| 40|     M|
|    Jump| 51|     F|
|    Mabu| 39|     M|
|Shaheena| 30|     F|
| Madhena| 40|     F|
+--------+---+------+



# Method 1: withColumn & lit & union

In [41]:
# A direct df1.union(df2) raises an error: union requires both frames to have
# the same number of columns, and df1 lacks the Gender column.
from pyspark.sql.functions import lit

# Add the missing column as a real NULL typed as string. lit("null") would
# instead store the four-character text "null", which isNull()/na.fill()
# do not treat as a missing value.
df_add = df1.withColumn("Gender", lit(None).cast("string"))
df_add.show()

# unionByName pairs columns by name rather than by position, so the result
# stays correct even if the column order of the two frames ever differs.
df_add.unionByName(df2).show()

+-------+---+------+
|   Name|Age|Gender|
+-------+---+------+
| Naresh| 25|  null|
|MichalC| 26|  null|
|  Virat| 45|  null|
|   Demo| 67|  null|
|     Hi| 89|  null|
+-------+---+------+

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
|  Naresh| 25|  null|
| MichalC| 26|  null|
|   Virat| 45|  null|
|    Demo| 67|  null|
|      Hi| 89|  null|
|  Naresh| 25|     M|
|  Michal| 27|     F|
|   Virat| 29|     M|
|    Demo| 30|     F|
|      Hi| 40|     M|
|    Jump| 51|     F|
|    Mabu| 39|     M|
|Shaheena| 30|     F|
| Madhena| 40|     F|
+--------+---+------+



#  Method 2: Schema & Union

In [47]:
from pyspark.sql.types import *
# Shared schema for both input files: three nullable string columns.
schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in ("Name", "Age", "Gender")
    ]
)

In [48]:
# Read input1.csv against the explicit three-column schema.
df3 = (
    spark.read
    .options(delimiter=",", header=True)
    .schema(schema)
    .csv("input1.csv")
)

In [49]:
# Read input2.csv against the same explicit schema.
df4 = (
    spark.read
    .options(delimiter=",", header=True)
    .schema(schema)
    .csv("input2.csv")
)

In [50]:
# Both frames were read with the same schema, so they can be stacked directly.
(
    df3
    .union(df4)
    .show()
)

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
|  Naresh| 25|  null|
| MichalC| 26|  null|
|   Virat| 45|  null|
|    Demo| 67|  null|
|      Hi| 89|  null|
|  Naresh| 25|     M|
|  Michal| 27|     F|
|   Virat| 29|     M|
|    Demo| 30|     F|
|      Hi| 40|     M|
|    Jump| 51|     F|
|    Mabu| 39|     M|
|Shaheena| 30|     F|
| Madhena| 40|     F|
+--------+---+------+



# Method 3: Outer Join

In [51]:
# Fresh read of the first input (header row supplies the column names).
df5 = spark.read.options(delimiter=",", header=True).csv("input1.csv")

In [52]:
# Fresh read of the second input (header row supplies the column names).
df6 = spark.read.options(delimiter=",", header=True).csv("input2.csv")

In [53]:
# Outer join on the columns both frames share; rows are matched only when
# both Name and Age are equal, and unmatched rows from either side are kept.
df7 = df5.join(
    df6,
    on=["Name", "Age"],
    how="Outer",
)
df7.show()

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
|    Demo| 30|     F|
|    Demo| 67|  null|
|      Hi| 40|     M|
|      Hi| 89|  null|
|    Jump| 51|     F|
|    Mabu| 39|     M|
| Madhena| 40|     F|
|  Michal| 27|     F|
| MichalC| 26|  null|
|  Naresh| 25|     M|
|Shaheena| 30|     F|
|   Virat| 29|     M|
|   Virat| 45|  null|
+--------+---+------+



# Method 4: Automated Approach

In [54]:
# Fresh read of the first input for the automated approach.
df8 = spark.read.options(delimiter=",", header=True).csv("input1.csv")

In [55]:
# Fresh read of the second input for the automated approach.
df9 = spark.read.options(delimiter=",", header=True).csv("input2.csv")

In [56]:
# Columns present in df8 but missing from df9, kept in df8's column order.
# A set difference would return them in arbitrary order, which makes the
# order in which columns get appended later non-deterministic.
list1 = [name for name in df8.columns if name not in df9.columns]

# Columns present in df9 but missing from df8, kept in df9's column order.
list2 = [name for name in df9.columns if name not in df8.columns]

In [60]:
# Add every column df9 lacks as a real NULL, typed like the matching df8
# column. lit("null") would store the text "null", not a missing value.
for i in list1:
    df9 = df9.withColumn(i, lit(None).cast(df8.schema[i].dataType))

# Add every column df8 lacks as a real NULL, typed like the matching df9 column.
for j in list2:
    df8 = df8.withColumn(j, lit(None).cast(df9.schema[j].dataType))

# Both frames now carry the same column names but possibly in different
# order; unionByName aligns them by name, whereas union pairs columns by
# position and would silently mix values up.
df8.unionByName(df9).show()

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
|  Naresh| 25|  null|
| MichalC| 26|  null|
|   Virat| 45|  null|
|    Demo| 67|  null|
|      Hi| 89|  null|
|  Naresh| 25|     M|
|  Michal| 27|     F|
|   Virat| 29|     M|
|    Demo| 30|     F|
|      Hi| 40|     M|
|    Jump| 51|     F|
|    Mabu| 39|     M|
|Shaheena| 30|     F|
| Madhena| 40|     F|
+--------+---+------+

