In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, current_timestamp
import pyspark.sql.functions as F

In [17]:
# Start (or reuse) the Spark session used throughout this notebook
spark = (
    SparkSession.builder
    .appName("CustomerHistorization")
    .getOrCreate()
)

# Column names shared by every customer DataFrame built in this cell
columns = ["CustomerID", "Name", "Email"]

# Initial load: the baseline customer table
initial_data = [
    (1, 'Alice', 'alice@example.com'),
    (2, 'Bob', 'bob@example.com'),
    (3, 'Charlie', 'charlie@example.com'),
]
df = spark.createDataFrame(initial_data, schema=columns)

# Simulated updates: CustomerIDs that already exist in the initial load,
# carrying a changed Name or Email
# TODO: Inner join between df and incoming_df
changes_data = [
    (1, 'Alice', 'alice.new@example.com'),
    (3, 'Chandler', 'charlie@example.com'),
]
changes_df = spark.createDataFrame(changes_data, schema=columns)

# Simulated incremental load: customers not present in the initial load
# TODO: remaining rows from incoming df
incremental_data = [
    (4, 'David', 'david@example.com'),
    (5, 'Eva', 'eva@example.com'),
]
incremental_df = spark.createDataFrame(incremental_data, schema=columns)

# Simulated deletions, given as a list of CustomerID values
deletions = [2]

# The update DataFrame is created in the next cell

In [24]:
# Build `update`: one row per recorded operation, tagged with a ChangeType
# column ("Deletion", "Modification" or "Addition").

# Record deletion operations: the rows of df whose CustomerID is in `deletions`
deleted_rows = (
    df
    .filter(F.col("CustomerID").isin(deletions))
    .withColumn("ChangeType", lit("Deletion"))
)

# Record modification operations. A left-semi join keeps only df's columns,
# so these rows carry the values currently in df, not the ones in changes_df.
modified_rows = (
    df
    .join(changes_df, ["CustomerID"], "leftsemi")
    .withColumn("ChangeType", lit("Modification"))
)

# Record addition operations (new rows).
# All columns are not needed, just the keys: every non-key column is nulled.
null_cols = [name for name in columns if name != "CustomerID"]

# A single select() replaces the withColumn() loop (which adds one projection
# per call), and each null is cast to the matching column type from df so the
# frame has no untyped NullType columns.
copy_incremental_df = incremental_df.select(
    *[
        F.lit(None).cast(df.schema[name].dataType).alias(name)
        if name in null_cols
        else F.col(name)
        for name in columns
    ]
)

added_rows = copy_incremental_df.withColumn("ChangeType", lit("Addition"))

# Stack the three kinds of records; unionByName matches columns by name
update = (
    deleted_rows
    .unionByName(modified_rows)
    .unionByName(added_rows)
)

update.show()

+----------+-------+-------------------+------------+
|CustomerID|   Name|              Email|  ChangeType|
+----------+-------+-------------------+------------+
|         2|    Bob|    bob@example.com|    Deletion|
|         1|  Alice|  alice@example.com|Modification|
|         3|Charlie|charlie@example.com|Modification|
|         4|   null|               null|    Addition|
|         5|   null|               null|    Addition|
+----------+-------+-------------------+------------+



# Perform the operations

# Reverse the step