In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, coalesce

# Obtain the Spark session explicitly so the notebook also runs on a fresh
# kernel where no `spark` variable has been injected. getOrCreate() returns the
# already-active session when one exists, so managed environments are unaffected.
spark = SparkSession.builder.getOrCreate()

# Column names shared by the base table and the change file.
base_cols = ["ID", "NAME", "KEYVAL", "AGE", "PHONE"]

# Explicit schema (same names/types that inference yields for the sample rows).
# Declaring it avoids relying on type inference, which cannot determine a
# column's type when every value in that column is None -- a realistic case for
# the change file, where most fields are left empty.
base_schema = "ID string, NAME string, KEYVAL string, AGE long, PHONE string"

# Base table (customer)
base_data = [
    ("001", "TESTNAME11", "KVAL1", 34, "9112345"),
    ("002", "TESTNAME12", "KVAL2", 35, "8112345"),
    ("003", "TESTNAME13", "KVAL3", 36, "+009112345"),
    ("004", "TESTNAME14", "KVAL4", 37, "+009112346"),
    ("005", "TESTNAME15", "KVAL5", 38, "+009112347"),
]
df_base = spark.createDataFrame(base_data, base_schema)

# Change file (input.txt)
# Rows 001 and 002 carry a value only for the field being changed (the rest are
# None); row 006 has an ID that does not exist in the base table.
change_data = [
    ("001", "TESTNAME011", None, None, None),
    ("002", None, "KVAL22", None, None),
    ("006", "TESTNAME16", "KVAL6", 36, "+009112348"),
]
df_change = spark.createDataFrame(change_data, base_schema)

# Show each input and register it as a temp view so it can be queried by name.
print("Base Table:")
df_base.show()
df_base.createOrReplaceTempView("base")

print("Change File:")
df_change.show()
df_change.createOrReplaceTempView("change")

# ✅ Perform full outer join on ID


In [0]:
# Full outer join of base and change rows on the shared id column, so ids that
# appear on only one side are kept as well.
merged_df = df_base.alias('b').join(df_change.alias('c'), "id", "full")

# For every non-key column take the change value when it is not null, otherwise
# fall back to the base value. Each result is aliased explicitly -- including
# phone, which previously had no alias and was therefore displayed under a
# generated expression name instead of "phone".
merged_df.select(
    "id",
    coalesce(col("c.name"), col("b.name")).alias("name"),
    coalesce(col("c.keyval"), col("b.keyval")).alias("keyval"),
    coalesce(col("c.age"), col("b.age")).alias("age"),
    coalesce(col("c.phone"), col("b.phone")).alias("phone"),
).show()

In [0]:

# Outer-join the base rows (alias "b") with the change rows (alias "c") on ID.
base_side = df_base.alias("b")
change_side = df_change.alias("c")
merged = base_side.join(change_side, on="ID", how="outer")

# ✅ For each non-key column, take the change-file value unless it is null,
# in which case keep the base value; the result keeps the original column name.
df_final = merged.select(
    col("ID"),
    *[
        coalesce(col(f"c.{field}"), col(f"b.{field}")).alias(field)
        for field in ("NAME", "KEYVAL", "AGE", "PHONE")
    ],
)

print("Final Updated Table:")
df_final.show()