Challenge extracted from https://blog.devgenius.io/advanced-sql-interview-scd2-changes-ac3535c5ae65

<span style="color:blue;">

## Problem Statement
 What's changed?
Assume we have two tables, Source and Target. Find the new records and the updated records in Source with respect to Target.
Give two different solutions, using the methods below:
1. LEFT JOIN
2. HASHING

</span>

In [2]:
import sqlite3
# Throw-away in-memory database: nothing is written to disk.
conn = sqlite3.connect(database=":memory:")
# Single cursor reused by the cells that create and populate the tables.
sql = conn.cursor()

In [8]:
# Table Design and Sample Data
#
# NOTE: SQLite derives a column's type affinity from the declared type name.
# A declared type of "STRING" matches none of the TEXT rules and ends up with
# NUMERIC affinity, so a numeric-looking text value (e.g. '007') would be
# silently stored as a number. VARCHAR(n) contains "CHAR" and therefore gets
# the intended TEXT affinity, while still documenting the expected length.
sql.execute('''
CREATE TABLE IF NOT EXISTS 
SourceEmployee (
    EmployeeID INT,
    Name VARCHAR(100),
    Department VARCHAR(50),
    Salary DECIMAL(10,2),
    EffectiveStartDate DATE
)
''')
# The target table carries the same tracked columns plus the SCD2 bookkeeping
# columns (validity end date and "current version" flag).
sql.execute('''
CREATE TABLE IF NOT EXISTS 
TargetEmployee (
    EmployeeID INT,
    Name VARCHAR(100),
    Department VARCHAR(50),
    Salary DECIMAL(10,2),
    EffectiveStartDate DATE,
    EffectiveEndDate DATE,
    IsCurrent BOOL
)
''')

#-- Insert data into SourceEmployee
# The table is emptied first so that re-running the cell does not duplicate rows.
sql.execute("DELETE FROM SourceEmployee")
sql.execute("""
INSERT INTO SourceEmployee (EmployeeID, Name, Department, Salary, EffectiveStartDate)
VALUES 
(1, 'Mighty Mike', 'IT', 75000.00,'2023-11-01'),
(2, 'Shawn the Sheep', 'HR', 65000.00, '2023-06-01'),
(3, 'Sheriff Lambrador', 'Sales', 70000.00, '2024-07-01'),
(4, 'Ricky Zoom', 'Marketing', 70000.00,'2024-10-05'),
(5, 'Paw Patrol', 'Finance', 85000.00,'2024-10-01')
""")

#-- Insert data into TargetEmployee
# Employee 2 is identical to the source row; employees 1, 3 and 4 differ in at
# least one column; employee 5 is missing from the target.
sql.execute("DELETE FROM TargetEmployee")
sql.execute("""
INSERT INTO TargetEmployee (EmployeeID, Name, Department, Salary, EffectiveStartDate, EffectiveEndDate, IsCurrent)
VALUES 
(1, 'Might Mike', 'IT', 70000.00, '2024-11-01', NULL, TRUE),
(2, 'Shawn the Sheep', 'HR', 65000.00, '2023-06-01', NULL, TRUE),
(3, 'Sheriff Lamrador', 'Marketing', 75000.00, '2023-07-01', NULL, TRUE),
(4, 'Ricky Zoom', 'Marketing', 78000.00, '2023-10-05', NULL, TRUE)
""")


conn.commit()  #Remember to commit!!! :-)


In [16]:
# Sanity check: show the content of both tables right after loading them
import pandas as pd

# The source table is printed explicitly; the target table is the last
# expression of the cell.
print(pd.read_sql_query(sql="SELECT * FROM SourceEmployee", con=conn))
pd.read_sql_query(sql="SELECT * FROM TargetEmployee", con=conn)


   EmployeeID               Name Department  Salary EffectiveStartDate
0           1        Mighty Mike         IT   75000         2023-11-01
1           2    Shawn the Sheep         HR   65000         2023-06-01
2           3  Sheriff Lambrador      Sales   70000         2024-07-01
3           4         Ricky Zoom  Marketing   70000         2024-10-05
4           5         Paw Patrol    Finance   85000         2024-10-01


Unnamed: 0,EmployeeID,Name,Department,Salary,EffectiveStartDate,EffectiveEndDate,IsCurrent
0,1,Might Mike,IT,70000,2024-11-01,,1
1,2,Shawn the Sheep,HR,65000,2023-06-01,,1
2,3,Sheriff Lamrador,Marketing,75000,2023-07-01,,1
3,4,Ricky Zoom,Marketing,78000,2023-10-05,,1


In [24]:
# The solution using LEFT JOIN method
#
# Every source row is matched against the *current* target row that has the
# same EmployeeID. A source row is reported when
#   * no current target row exists          -> NEW record
#   * at least one tracked column differs   -> UPDATED record
#
# Columns are compared with IS NOT, SQLite's NULL-safe inequality. A plain
# "=" / "<>" comparison evaluates to NULL as soon as one side is NULL, which
# would make the WHERE clause discard rows whose change involves a NULL value.
# String literals use single quotes: double quotes denote identifiers in SQL
# and are only accepted as strings by SQLite as a legacy fallback.

challenge_3_solution_LEFT_JOIN = pd.read_sql_query("""
SELECT 
    CASE 
        WHEN TargetEmployee.EmployeeID IS NULL THEN 'The source record is a NEW one'
        ELSE 'The source record is an UPDATED one'
    END                       AS SourceRecordStatus
    ,SourceEmployee.*
FROM 
    SourceEmployee 
    LEFT OUTER JOIN TargetEmployee 
        ON (TargetEmployee.EmployeeID = SourceEmployee.EmployeeID AND TargetEmployee.IsCurrent)
WHERE 
       TargetEmployee.EmployeeID             IS NULL                               -- NEW records
    OR SourceEmployee.Name                   IS NOT TargetEmployee.Name            -- UPDATED records
    OR SourceEmployee.Department             IS NOT TargetEmployee.Department
    OR SourceEmployee.Salary                 IS NOT TargetEmployee.Salary
    OR SourceEmployee.EffectiveStartDate     IS NOT TargetEmployee.EffectiveStartDate
""", conn)

challenge_3_solution_LEFT_JOIN

Unnamed: 0,SourceRecordStatus,EmployeeID,Name,Department,Salary,EffectiveStartDate
0,The source record is an UPDATED one,1,Mighty Mike,IT,75000,2023-11-01
1,The source record is an UPDATED one,3,Sheriff Lambrador,Sales,70000,2024-07-01
2,The source record is an UPDATED one,4,Ricky Zoom,Marketing,70000,2024-10-05
3,The source record is a NEW one,5,Paw Patrol,Finance,85000,2024-10-01


In [66]:
# The solution using the HASHING method


# SQLite3 does not seem to have any built-in hashing function :-(
#
# ----> We need to create a custom hashing function.
#
# Python makes this easy: SQLite3 can be extended with a user-defined function, here called My_FINGERPRINT.
# I used SHA-256 (sha256_hash):
# +++ the chance of a collision is very, very low (the digest is 64 hexadecimal characters long)
# --- being a cryptographic hash, it can be slow and resource-hungry when the number of records is high
import hashlib
def sha256_hash(text):
    """Return the SHA-256 hex digest of a value, for use as an SQLite function.

    SQLite may call a user-defined function with any of its storage classes,
    so besides ``str`` this also accepts ``bytes`` (BLOB) and numbers
    (INTEGER / REAL) instead of failing on ``.encode``.

    Args:
        text: The value to hash. ``str`` is hashed as UTF-8, bytes-like
            values are hashed as-is, and any other value is hashed through
            its ``str()`` representation.

    Returns:
        The digest as 64 lowercase hexadecimal characters, or ``None`` when
        ``text`` is ``None`` (so that SQL NULL stays NULL).
    """
    if text is None:
        return None
    if isinstance(text, (bytes, bytearray, memoryview)):
        payload = bytes(text)
    else:
        # str() is a no-op for strings and gives numbers a textual form.
        payload = str(text).encode('utf-8')
    return hashlib.sha256(payload).hexdigest()
# Expose the Python hash to SQL as the one-argument function My_FINGERPRINT.
conn.create_function("My_FINGERPRINT", 1, sha256_hash)


# A row's fingerprint is the hash of its tracked columns, serialised so that
# two different rows can never produce the same text:
#   * quote() renders every value as an SQL literal: text is wrapped in single
#     quotes, numbers are left bare and NULL becomes the bare word NULL, so a
#     NULL cannot be confused with any real value;
#   * a ',' between the literals keeps the column boundaries, so that
#     ('AB', 'C') and ('A', 'BC') hash differently.
# NEW records are detected explicitly through the missing target row: relying
# on the fingerprint alone would miss a new source row whose tracked columns
# are all NULL, because it would hash exactly like the empty target side.
challenge_3_solution_HASHING = pd.read_sql_query("""
SELECT 
    CASE 
        WHEN TargetEmployee.EmployeeID IS NULL THEN 'The source record is a NEW one'
        ELSE 'The source record is an UPDATED one'
    END                       AS SourceRecordStatus
    ,SourceEmployee.*
FROM 
    SourceEmployee 
    LEFT OUTER JOIN TargetEmployee 
        ON (TargetEmployee.EmployeeID = SourceEmployee.EmployeeID AND TargetEmployee.IsCurrent)
WHERE 
    TargetEmployee.EmployeeID IS NULL                          -- NEW records
    OR                                                         -- UPDATED records
    My_FINGERPRINT(
              quote(SourceEmployee.Name)
    || ',' || quote(SourceEmployee.Department)
    || ',' || quote(SourceEmployee.Salary)
    || ',' || quote(SourceEmployee.EffectiveStartDate)
    )
    <>  -- the source record's fingerprint is different from the target record's fingerprint
    My_FINGERPRINT(  
              quote(TargetEmployee.Name)
    || ',' || quote(TargetEmployee.Department)
    || ',' || quote(TargetEmployee.Salary)
    || ',' || quote(TargetEmployee.EffectiveStartDate)
    )
""", conn)

challenge_3_solution_HASHING

Unnamed: 0,SourceRecordStatus,EmployeeID,Name,Department,Salary,EffectiveStartDate
0,The source record is an UPDATED one,1,Mighty Mike,IT,75000,2023-11-01
1,The source record is an UPDATED one,3,Sheriff Lambrador,Sales,70000,2024-07-01
2,The source record is an UPDATED one,4,Ricky Zoom,Marketing,70000,2024-10-05
3,The source record is a NEW one,5,Paw Patrol,Finance,85000,2024-10-01
