In [11]:
""" 
    SCD 2
    Assuming there are no logical or physical deletes, only new and updates are considered with usual versioning logic

    1 - Mark Y to N if there comes any update for existing rec
    2 - Insert new rec for existing rec if match is found based on hash
    3 - Insert only new rec using left_anti join
    4 - Select rows which are only in target (we have to keep untouched records as is in the target table)
    5 - Union 1, 2, 3 and 4
"""

from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder.getOrCreate()

src_df = spark.createDataFrame([
    ("1", "Josh","2025-01-02")
], ["id", "name", "src_dt"])

trgt_df = spark.createDataFrame([
    ("", "", "", "", "")
], ["id", "name", "is_active", "eff_start_dt", "eff_end_dt"])

src_df = src_df.withColumn("id", col("id").cast("int")).withColumn("src_hash", hash("id", "name"))
trgt_df = trgt_df.withColumn("id", col("id").cast("int")).withColumn("trgt_hash", hash("id", "name"))

src_df.show()
trgt_df.show()

+---+----+----------+----------+
| id|name|    src_dt|  src_hash|
+---+----+----------+----------+
|  1|Josh|2025-01-02|-850647380|
+---+----+----------+----------+

+----+----+---------+------------+----------+---------+
|  id|name|is_active|eff_start_dt|eff_end_dt|trgt_hash|
+----+----+---------+------------+----------+---------+
|NULL|    |         |            |          |142593372|
+----+----+---------+------------+----------+---------+



In [28]:
existing_only_in_trgt = trgt_df.join(src_df, src_df["id"]==trgt_df["id"], "left_anti")


### mark only active records to inactive first
active_to_inactive = trgt_df.join(src_df, trgt_df["id"] == src_df["id"], "left")\
    .filter(col("is_active") == "Y")\
    .withColumn("is_active", lit("N")).withColumn("eff_end_dt", col("src_dt"))\
    .select(trgt_df["id"], trgt_df["name"], "is_active", "eff_start_dt", "eff_end_dt", "trgt_hash")
active_to_inactive.show()

active_for_existing = trgt_df.join(src_df, trgt_df["id"] == src_df["id"], "left")\
    .filter(col("src_hash") != col("trgt_hash"))\
    .select(src_df["id"], src_df["name"], lit("Y").alias("is_active"), col("src_dt").alias("eff_start_dt"), lit("9999-12-31").alias("eff_end_dt"), "src_hash")
active_for_existing.show()

new_rec = src_df.join(trgt_df, src_df["id"]==trgt_df["id"], "left_anti")\
    .select(src_df["id"], src_df["name"], lit("Y").alias("is_active"), col("src_dt").alias("eff_start_dt"), lit("9999-12-31").alias("eff_end_dt"), "src_hash")
new_rec.show()

final_data = active_to_inactive.union(existing_only_in_trgt).union(active_for_existing).union(new_rec).dropDuplicates()
final_data.show()

                                                                                

+---+----+---------+------------+----------+----------+
| id|name|is_active|eff_start_dt|eff_end_dt| trgt_hash|
+---+----+---------+------------+----------+----------+
|  1|Josh|        N|  2025-01-02|2025-01-03|-850647380|
+---+----+---------+------------+----------+----------+



                                                                                

+---+------+---------+------------+----------+-----------+
| id|  name|is_active|eff_start_dt|eff_end_dt|   src_hash|
+---+------+---------+------------+----------+-----------+
|  1|Joshua|        Y|  2025-01-03|9999-12-31|-1131722944|
+---+------+---------+------------+----------+-----------+



                                                                                

+---+----+---------+------------+----------+----------+
| id|name|is_active|eff_start_dt|eff_end_dt|  src_hash|
+---+----+---------+------------+----------+----------+
|  2|John|        Y|  2025-01-03|9999-12-31|-433068882|
+---+----+---------+------------+----------+----------+



                                                                                

+---+------+---------+------------+----------+-----------+
| id|  name|is_active|eff_start_dt|eff_end_dt|  trgt_hash|
+---+------+---------+------------+----------+-----------+
|  1|  Josh|        N|  2025-01-02|2025-01-03| -850647380|
|  1|Joshua|        Y|  2025-01-03|9999-12-31|-1131722944|
|  2|  John|        Y|  2025-01-03|9999-12-31| -433068882|
+---+------+---------+------------+----------+-----------+



In [27]:
src_df = spark.createDataFrame([
    ("1", "Joshua","2025-01-03"),
    ("2", "John","2025-01-03")
], ["id", "name", "src_dt"])
src_df = src_df.withColumn("id", col("id").cast("int")).withColumn("src_hash", hash("id", "name"))
trgt_df = spark.createDataFrame([
    (1, "Josh", "Y", "2025-01-02", "9999-12-31", -850647380)
], ["id", "name", "is_active", "eff_start_dt", "eff_end_dt", "trgt_hash"])

src_df.show()
trgt_df.show()

existing_only_in_trgt = trgt_df.join(src_df, src_df["id"]==trgt_df["id"], "left_anti")

### mark only active records to inactive first
active_to_inactive = trgt_df.join(src_df, trgt_df["id"] == src_df["id"], "left")\
    .filter((col("src_hash") != col("trgt_hash")) & (col("is_active") == "Y"))\
    .withColumn("is_active", lit("N")).withColumn("eff_end_dt", col("src_dt"))\
    .select(trgt_df["id"], trgt_df["name"], "is_active", "eff_start_dt", "eff_end_dt", "trgt_hash")
active_to_inactive.show()

active_for_existing = trgt_df.join(src_df, trgt_df["id"] == src_df["id"], "left")\
    .filter(col("src_hash") != col("trgt_hash"))\
    .select(src_df["id"], src_df["name"], lit("Y").alias("is_active"), col("src_dt").alias("eff_start_dt"), lit("9999-12-31").alias("eff_end_dt"), "src_hash")\
    .dropDuplicates()
active_for_existing.show()

new_rec = src_df.join(trgt_df, src_df["id"]==trgt_df["id"], "left_anti")\
    .select(src_df["id"], src_df["name"], lit("Y").alias("is_active"), col("src_dt").alias("eff_start_dt"), lit("9999-12-31").alias("eff_end_dt"), "src_hash")
new_rec.show()

final_data = active_to_inactive.union(existing_only_in_trgt).union(active_for_existing).union(new_rec).union(trgt_df.join(src_df, trgt_df["id"] == src_df["id"], "left").filter(col("is_active") == "N").select(trgt_df["id"], trgt_df["name"], "is_active", "eff_start_dt", "eff_end_dt", "trgt_hash") \
                                                                                                            ).dropDuplicates()
final_data.show()


+---+------+----------+-----------+
| id|  name|    src_dt|   src_hash|
+---+------+----------+-----------+
|  1|Joshua|2025-01-03|-1131722944|
|  2|  John|2025-01-03| -433068882|
+---+------+----------+-----------+

+---+----+---------+------------+----------+----------+
| id|name|is_active|eff_start_dt|eff_end_dt| trgt_hash|
+---+----+---------+------------+----------+----------+
|  1|Josh|        Y|  2025-01-02|9999-12-31|-850647380|
+---+----+---------+------------+----------+----------+



                                                                                

+---+----+---------+------------+----------+----------+
| id|name|is_active|eff_start_dt|eff_end_dt| trgt_hash|
+---+----+---------+------------+----------+----------+
|  1|Josh|        N|  2025-01-02|2025-01-03|-850647380|
+---+----+---------+------------+----------+----------+



                                                                                

+---+------+---------+------------+----------+-----------+
| id|  name|is_active|eff_start_dt|eff_end_dt|   src_hash|
+---+------+---------+------------+----------+-----------+
|  1|Joshua|        Y|  2025-01-03|9999-12-31|-1131722944|
+---+------+---------+------------+----------+-----------+



                                                                                

+---+----+---------+------------+----------+----------+
| id|name|is_active|eff_start_dt|eff_end_dt|  src_hash|
+---+----+---------+------------+----------+----------+
|  2|John|        Y|  2025-01-03|9999-12-31|-433068882|
+---+----+---------+------------+----------+----------+



                                                                                

+---+------+---------+------------+----------+-----------+
| id|  name|is_active|eff_start_dt|eff_end_dt|  trgt_hash|
+---+------+---------+------------+----------+-----------+
|  1|  Josh|        N|  2025-01-02|2025-01-03| -850647380|
|  1|Joshua|        Y|  2025-01-03|9999-12-31|-1131722944|
|  2|  John|        Y|  2025-01-03|9999-12-31| -433068882|
+---+------+---------+------------+----------+-----------+



In [30]:
src_df = spark.createDataFrame([
    (2, "Scott","2025-01-04"),
    (3, "Jake","2025-01-04")
], ["id", "name", "src_dt"])
src_df = src_df.withColumn("id", col("id").cast("int")).withColumn("src_hash", hash("id", "name"))
trgt_df = spark.createDataFrame([
    (1,"Josh","N","2025-01-02","2025-01-03",-850647380),
    (1,"Joshua","Y","2025-01-03","9999-12-31",-1131722944),
    (2,"John","Y","2025-01-03","9999-12-31",-433068882),
], ["id", "name", "is_active", "eff_start_dt", "eff_end_dt", "trgt_hash"])


src_df.show()
trgt_df.show()

existing_only_in_trgt = trgt_df.join(src_df, src_df["id"]==trgt_df["id"], "left_anti")

### mark only active records to inactive first
active_to_inactive = trgt_df.join(src_df, trgt_df["id"] == src_df["id"], "left")\
    .filter((col("src_hash") != col("trgt_hash")) & (col("is_active") == "Y"))\
    .withColumn("is_active", lit("N")).withColumn("eff_end_dt", col("src_dt"))\
    .select(trgt_df["id"], trgt_df["name"], "is_active", "eff_start_dt", "eff_end_dt", "trgt_hash")
active_to_inactive.show()

active_for_existing = trgt_df.join(src_df, trgt_df["id"] == src_df["id"], "left")\
    .filter(col("src_hash") != col("trgt_hash"))\
    .select(src_df["id"], src_df["name"], lit("Y").alias("is_active"), col("src_dt").alias("eff_start_dt"), lit("9999-12-31").alias("eff_end_dt"), "src_hash")\
    .dropDuplicates()
active_for_existing.show()

new_rec = src_df.join(trgt_df, src_df["id"]==trgt_df["id"], "left_anti")\
    .select(src_df["id"], src_df["name"], lit("Y").alias("is_active"), col("src_dt").alias("eff_start_dt"), lit("9999-12-31").alias("eff_end_dt"), "src_hash")
new_rec.show()

final_data = active_to_inactive.union(existing_only_in_trgt).union(active_for_existing).union(new_rec). \
select(trgt_df["id"], trgt_df["name"], "is_active", "eff_start_dt", "eff_end_dt", "trgt_hash")
final_data.show()


+---+-----+----------+-----------+
| id| name|    src_dt|   src_hash|
+---+-----+----------+-----------+
|  2|Scott|2025-01-04|-2119877053|
|  3| Jake|2025-01-04| 1749065146|
+---+-----+----------+-----------+

+---+------+---------+------------+----------+-----------+
| id|  name|is_active|eff_start_dt|eff_end_dt|  trgt_hash|
+---+------+---------+------------+----------+-----------+
|  1|  Josh|        N|  2025-01-02|2025-01-03| -850647380|
|  1|Joshua|        Y|  2025-01-03|9999-12-31|-1131722944|
|  2|  John|        Y|  2025-01-03|9999-12-31| -433068882|
+---+------+---------+------------+----------+-----------+



                                                                                

+---+----+---------+------------+----------+----------+
| id|name|is_active|eff_start_dt|eff_end_dt| trgt_hash|
+---+----+---------+------------+----------+----------+
|  2|John|        N|  2025-01-03|2025-01-04|-433068882|
+---+----+---------+------------+----------+----------+



                                                                                

+---+-----+---------+------------+----------+-----------+
| id| name|is_active|eff_start_dt|eff_end_dt|   src_hash|
+---+-----+---------+------------+----------+-----------+
|  2|Scott|        Y|  2025-01-04|9999-12-31|-2119877053|
+---+-----+---------+------------+----------+-----------+



                                                                                

+---+----+---------+------------+----------+----------+
| id|name|is_active|eff_start_dt|eff_end_dt|  src_hash|
+---+----+---------+------------+----------+----------+
|  3|Jake|        Y|  2025-01-04|9999-12-31|1749065146|
+---+----+---------+------------+----------+----------+



                                                                                

+---+------+---------+------------+----------+-----------+
| id|  name|is_active|eff_start_dt|eff_end_dt|  trgt_hash|
+---+------+---------+------------+----------+-----------+
|  2|  John|        N|  2025-01-03|2025-01-04| -433068882|
|  1|  Josh|        N|  2025-01-02|2025-01-03| -850647380|
|  1|Joshua|        Y|  2025-01-03|9999-12-31|-1131722944|
|  2| Scott|        Y|  2025-01-04|9999-12-31|-2119877053|
|  3|  Jake|        Y|  2025-01-04|9999-12-31| 1749065146|
+---+------+---------+------------+----------+-----------+



In [31]:
"""
    Consider same data as above to implement SCD Type 1
    1 - Find matching rec from src and trg, and fully overwrite existing rec with incoming src rec
    2 - Use left_anti join to determine only new rec
    3 - Union 1, 2 
""" 

from pyspark.sql.functions import *
from pyspark.sql.types import *

src_df = spark.createDataFrame([
    ("1", "Josh","2025-01-02")
], ["id", "name", "src_dt"])

trgt_df = spark.createDataFrame([
    ("", "", "")
], ["id", "name", "trgt_dt"])

src_df = src_df.withColumn("id", col("id").cast("int")).withColumn("src_hash", hash("id", "name"))
trgt_df = trgt_df.withColumn("id", col("id").cast("int")).withColumn("trgt_hash", hash("id", "name"))

src_df.show()
trgt_df.show()

+---+----+----------+----------+
| id|name|    src_dt|  src_hash|
+---+----+----------+----------+
|  1|Josh|2025-01-02|-850647380|
+---+----+----------+----------+

+----+----+-------+---------+
|  id|name|trgt_dt|trgt_hash|
+----+----+-------+---------+
|NULL|    |       |142593372|
+----+----+-------+---------+



In [32]:
overwrite_for_existing = trgt_df.join(src_df, trgt_df["id"] == src_df["id"], "left")\
    .filter(col("src_hash") != col("trgt_hash"))\
    .select(src_df["id"], src_df["name"], "src_dt" , "src_hash")\
    .dropDuplicates()
overwrite_for_existing.show()

new_rec = src_df.join(trgt_df, src_df["id"]==trgt_df["id"], "left_anti")\
    .select(src_df["id"], src_df["name"], col("src_dt").alias("trgt_dt"), "src_hash")
new_rec.show()

final_data = overwrite_for_existing.union(new_rec)
final_data.show()



+---+----+------+--------+
| id|name|src_dt|src_hash|
+---+----+------+--------+
+---+----+------+--------+



                                                                                

+---+----+----------+----------+
| id|name|   trgt_dt|  src_hash|
+---+----+----------+----------+
|  1|Josh|2025-01-02|-850647380|
+---+----+----------+----------+



[Stage 780:>                                                        (0 + 4) / 4]

+---+----+----------+----------+
| id|name|    src_dt|  src_hash|
+---+----+----------+----------+
|  1|Josh|2025-01-02|-850647380|
+---+----+----------+----------+



                                                                                

In [33]:
src_df = spark.createDataFrame([
    ("1", "Joshua","2025-01-03")
], ["id", "name", "src_dt"])
src_df = src_df.withColumn("id", col("id").cast("int")).withColumn("src_hash", hash("id", "name"))
trgt_df = spark.createDataFrame([
    (1, "Josh",  "2025-01-02", -850647380)
], ["id", "name", "trgt_dt", "trgt_hash"])

src_df.show()
trgt_df.show()

+---+------+----------+-----------+
| id|  name|    src_dt|   src_hash|
+---+------+----------+-----------+
|  1|Joshua|2025-01-03|-1131722944|
+---+------+----------+-----------+

+---+----+----------+----------+
| id|name|   trgt_dt| trgt_hash|
+---+----+----------+----------+
|  1|Josh|2025-01-02|-850647380|
+---+----+----------+----------+



In [34]:
overwrite_for_existing = trgt_df.join(src_df, trgt_df["id"] == src_df["id"], "left")\
    .filter(col("src_hash") != col("trgt_hash"))\
    .select(src_df["id"], src_df["name"], "src_dt" , "src_hash")\
    .dropDuplicates()
overwrite_for_existing.show()

new_rec = src_df.join(trgt_df, src_df["id"]==trgt_df["id"], "left_anti")\
    .select(src_df["id"], src_df["name"], col("src_dt").alias("trgt_dt"), "src_hash")
new_rec.show()

final_data = overwrite_for_existing.union(new_rec)
final_data.show()

                                                                                

+---+------+----------+-----------+
| id|  name|    src_dt|   src_hash|
+---+------+----------+-----------+
|  1|Joshua|2025-01-03|-1131722944|
+---+------+----------+-----------+



                                                                                

+---+----+-------+--------+
| id|name|trgt_dt|src_hash|
+---+----+-------+--------+
+---+----+-------+--------+



[Stage 805:>                (0 + 4) / 4][Stage 808:>                (0 + 0) / 1]

+---+------+----------+-----------+
| id|  name|    src_dt|   src_hash|
+---+------+----------+-----------+
|  1|Joshua|2025-01-03|-1131722944|
+---+------+----------+-----------+



                                                                                

In [35]:
src_df = spark.createDataFrame([
    (2, "Scott","2025-01-04"),
    (1, "Jake","2025-01-04")
], ["id", "name", "src_dt"])
src_df = src_df.withColumn("id", col("id").cast("int")).withColumn("src_hash", hash("id", "name"))
trgt_df = spark.createDataFrame([
    (1, "Joshua",  "2025-01-03", -1131722944)
], ["id", "name", "trgt_dt", "trgt_hash"])

src_df.show()
trgt_df.show()

+---+-----+----------+-----------+
| id| name|    src_dt|   src_hash|
+---+-----+----------+-----------+
|  2|Scott|2025-01-04|-2119877053|
|  1| Jake|2025-01-04| -811822073|
+---+-----+----------+-----------+

+---+------+----------+-----------+
| id|  name|   trgt_dt|  trgt_hash|
+---+------+----------+-----------+
|  1|Joshua|2025-01-03|-1131722944|
+---+------+----------+-----------+



In [36]:
overwrite_for_existing = trgt_df.join(src_df, trgt_df["id"] == src_df["id"], "left")\
    .filter(col("src_hash") != col("trgt_hash"))\
    .select(src_df["id"], src_df["name"], "src_dt" , "src_hash")\
    .dropDuplicates()
overwrite_for_existing.show()

new_rec = src_df.join(trgt_df, src_df["id"]==trgt_df["id"], "left_anti")\
    .select(src_df["id"], src_df["name"], col("src_dt").alias("trgt_dt"), "src_hash")
new_rec.show()

final_data = overwrite_for_existing.union(new_rec)
final_data.show()

                                                                                

+---+----+----------+----------+
| id|name|    src_dt|  src_hash|
+---+----+----------+----------+
|  1|Jake|2025-01-04|-811822073|
+---+----+----------+----------+



                                                                                

+---+-----+----------+-----------+
| id| name|   trgt_dt|   src_hash|
+---+-----+----------+-----------+
|  2|Scott|2025-01-04|-2119877053|
+---+-----+----------+-----------+



                                                                                

+---+-----+----------+-----------+
| id| name|    src_dt|   src_hash|
+---+-----+----------+-----------+
|  1| Jake|2025-01-04| -811822073|
|  2|Scott|2025-01-04|-2119877053|
+---+-----+----------+-----------+

