In [0]:
from pyspark.sql import functions as F
from datetime import datetime

In [0]:
def func_scd2(target, source, key_cols):
    """Merge ``source`` into the Delta table ``target`` as an SCD Type 2 load.

    Rows are matched on the concatenation of ``key_cols``; a change is
    detected by comparing a SHA-256 hash of every remaining non-SCD column.
    Keys with no current version in the target are inserted as current rows.
    Keys whose hash differs from the current version get that version closed
    (``is_current = false``, ``end_date = today``) and a new current version
    inserted.

    Args:
        target: The Delta table to merge into. It must expose ``toDF()`` and
            ``merge()`` (i.e. a ``delta.tables.DeltaTable``), because the
            function both reads its rows and executes a MERGE against it.
        source: DataFrame holding the incoming snapshot. It must contain the
            key columns and every non-SCD column of the target.
        key_cols: Names of the business-key columns.

    Raises:
        ValueError: If ``key_cols`` is empty.
        TypeError: If ``target`` cannot be merged into.
    """
    if not key_cols:
        raise ValueError("key_cols must contain at least one column name")
    if not (hasattr(target, "merge") and hasattr(target, "toDF")):
        raise TypeError(
            "target must be a Delta table (delta.tables.DeltaTable); "
            "a plain DataFrame cannot be merged into"
        )

    scd_cols = ["start_date", "end_date", "is_current"]
    # Helper columns added to the staged source; never part of the row hash.
    helper_cols = {"key", "hash"}

    target_df = target.toDF()
    target_columns = target_df.columns

    # Columns which are neither key nor SCD columns drive change detection.
    # Target column order is kept so the hash input order is deterministic.
    excluded = set(key_cols) | set(scd_cols) | helper_cols
    hash_columns = [c for c in target_columns if c not in excluded]
    print(hash_columns)

    # NOTE(review): concat_ws skips NULLs and the key uses an empty separator,
    # so distinct composite keys / value tuples can collide (e.g. ("ab", "c")
    # vs ("a", "bc")). Kept as in the original; confirm this is acceptable.
    key_expr = F.concat_ws('', *key_cols)
    hash_expr = F.sha2(F.concat_ws("||", *hash_columns), 256)

    # "key" identifies a record, "hash" identifies its current content.
    source = source.withColumn('key', key_expr).withColumn("hash", hash_expr)

    # Only the current version of each target record takes part in matching.
    current_versions = target_df.where("is_current = true").select(
        key_expr.alias("key"),
        hash_expr.alias("__trg_hash"),
    )

    # Inserts: keys that have no current version in the target.
    inserts = source.join(current_versions, on="key", how="left_anti")

    # Updates: keys that have a current version whose content differs.
    updates = (
        source.join(current_versions, on="key", how="inner")
        .where(F.col("hash") != F.col("__trg_hash"))
        .drop("__trg_hash")
    )

    # "mergekey" steers the MERGE:
    #   * NULL never matches, so the row is INSERTed (brand-new keys and the
    #     new version of changed keys);
    #   * the real key matches the existing current row, which is closed.
    # Changed rows are therefore staged twice; staging them only with their
    # real key would close the old version without adding the new one.
    null_mergekey = F.lit(None).cast("string")
    rows_to_insert = inserts.unionByName(updates).withColumn("mergekey", null_mergekey)
    rows_to_close = updates.withColumn("mergekey", F.col("key"))
    final_df_before_ingestion = rows_to_insert.unionByName(rows_to_close)

    # Delta's set/values maps accept only SQL-expression strings (or Columns),
    # so booleans and the date must be spelled as SQL literals. The date is
    # computed once so the closed and the new version carry the same value.
    # NOTE(review): the date is written as text such as '05-Jan-2024'; if
    # start_date/end_date are DATE-typed this literal may not cast - confirm.
    today_sql = "'" + datetime.now().strftime("%d-%b-%Y") + "'"

    # Insert every physical target column from the staged source row, then
    # stamp the SCD bookkeeping columns for a fresh current version.
    mergeSet = {
        col: "src.`" + col + "`"
        for col in target_columns
        if col not in scd_cols
    }
    mergeSet["is_current"] = "true"
    mergeSet["start_date"] = today_sql
    mergeSet["end_date"] = "null"

    # The target key is recomputed inside the condition because the helper
    # "key" column exists only on the DataFrames, not in the Delta table.
    trg_key_sql = "concat_ws('', " + ", ".join(
        "trg.`" + col + "`" for col in key_cols
    ) + ")"
    merge_condition = trg_key_sql + " = src.mergekey"

    # MERGE: target - final_df_before_ingestion
    (
        target.alias('trg')
        .merge(final_df_before_ingestion.alias('src'), merge_condition)
        .whenMatchedUpdate(
            condition="trg.is_current = true",
            set={
                "is_current": "false",
                "end_date": today_sql,
            },
        )
        .whenNotMatchedInsert(values=mergeSet)
        .execute()
    )

    
    