# Incremental data load

Even though datasets are not updated incrementally very often, we still need to be able to handle this kind of event.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import col, coalesce, expr

In [2]:
# Build (or reuse) a Spark session that runs in local mode for this walkthrough
spark = (
    SparkSession.builder
    .master("local")
    .appName("IncrementalProcessing")
    .getOrCreate()
)

23/11/24 09:28:30 WARN Utils: Your hostname, pengfei-Virtual-Machine resolves to a loopback address: 127.0.1.1; using 10.50.2.80 instead (on interface eth0)
23/11/24 09:28:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/11/24 09:28:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Column layout of the origin dataset; every column is declared nullable
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("id", IntegerType()),
        ("name", StringType()),
        ("age", IntegerType()),
        ("salary", IntegerType()),
    )
])

# Rows of the origin dataset, i.e. its state before the incremental load
initial_data = [
    (1, "Alice", 30, 50000),
    (2, "Bob", 35, 60000),
    (3, "Carol", 28, 45000),
]

# Materialise the origin DataFrame and display it
df = spark.createDataFrame(data=initial_data, schema=schema)
df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+---+-----+---+------+
| id| name|age|salary|
+---+-----+---+------+
|  1|Alice| 30| 50000|
|  2|  Bob| 35| 60000|
|  3|Carol| 28| 45000|
+---+-----+---+------+


                                                                                

In [5]:
# Column layout of the incremental batch: same types as the origin dataset,
# but every column name carries an `inc_` prefix so both sides stay
# distinguishable after a join
schema_incremental = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("inc_id", IntegerType()),
        ("inc_name", StringType()),
        ("inc_age", IntegerType()),
        ("inc_salary", IntegerType()),
    )
])

# Incremental batch: three rows that modify existing ids plus one brand-new id
incremental_data = [
    (1, "Alice", 30, 52000),  # id 1: salary 50000 -> 52000
    (2, "Bob", 29, 65000),    # id 2: age 35 -> 29 and salary 60000 -> 65000
    (3, "Hari", 28, 47000),   # id 3: name Carol -> Hari and salary 45000 -> 47000
    (4, "Dave", 40, 70000),   # id 4: not present in the origin dataset
]

# Materialise the incremental DataFrame and display it
incremental_df = spark.createDataFrame(data=incremental_data, schema=schema_incremental)
incremental_df.show()

+------+--------+-------+----------+
|inc_id|inc_name|inc_age|inc_salary|
+------+--------+-------+----------+
|     1|   Alice|     30|     52000|
|     2|     Bob|     29|     65000|
|     3|    Hari|     28|     47000|
|     4|    Dave|     40|     70000|
+------+--------+-------+----------+


## 1. Find the newly added records and the changed records

In the incremental DataFrame, we need to distinguish two types of records:
- changed_records: records that already exist in the origin dataset and will be updated with the values from the incremental dataset
- new_records: records that do not exist in the origin dataset yet and will be added from the incremental dataset

### 1.1 Identify changed records 

To identify the changed records, we need to use the primary key of the dataframe. In this tutorial, the primary key is the column `id`.

We do an inner join (because the record must exist in both the origin and the incremental dataset), then keep only the rows in which at least one non-key column has changed.

> The origin and the incremental dataset must have the same primary key, otherwise it's impossible to do the incremental update.



In [6]:
# Pair each origin row with the incremental row that shares its primary key,
# then keep the pairs in which at least one non-key column differs.
# The comparison is null-safe: every column is nullable, and a plain `!=`
# evaluates to NULL when either side is NULL, which would make the filter
# silently drop a row whose value changed from or to NULL.
changed_records = df.join(incremental_df, col("id") == col("inc_id"), 'inner').filter(
    (~df.salary.eqNullSafe(incremental_df.inc_salary)) |
    (~df.name.eqNullSafe(incremental_df.inc_name)) |
    (~df.age.eqNullSafe(incremental_df.inc_age))
).select("inc_id", "inc_name", "inc_age", "inc_salary")

In [7]:
# Print a header, then display the incremental rows that differ from their
# origin counterpart
print("Changed Records:")
changed_records.show()

Changed Records:
+------+--------+-------+----------+
|inc_id|inc_name|inc_age|inc_salary|
+------+--------+-------+----------+
|     1|   Alice|     30|     52000|
|     2|     Bob|     29|     65000|
|     3|    Hari|     28|     47000|
+------+--------+-------+----------+


### 1.2 Identify new records 

To identify the new records, we need to use the primary key of the dataframe. In this tutorial, the primary key is the column `id`.

Because the primary key (`id`) of a new record exists only in the incremental DataFrame and not in the origin DataFrame, we left-join the incremental DataFrame to the origin DataFrame and keep the rows where the origin `id` is null.

In [8]:
# Identify new records in the incremental data
new_records = incremental_df.join(df, col("id") == col("inc_id"), 'left_outer').filter(df.id.isNull()).select("inc_id", "inc_name","inc_age","inc_salary")

# Show the new records
print("New Records:")
new_records.show()

New Records:
+------+--------+-------+----------+
|inc_id|inc_name|inc_age|inc_salary|
+------+--------+-------+----------+
|     4|    Dave|     40|     70000|
+------+--------+-------+----------+


## 2. Update the origin records

There are two steps:
- update the changed records value into the origin dataset
- add the new records into the origin dataset  

### 2.1 Update the changed records value

In [10]:
# Update the existing records in the original DataFrame with the changes.
# The origin rows are left-joined to the changed rows on the primary key, so
# origin rows without a change get nulls in the `inc_*` columns.
# Note: no `DataFrame.coalesce(1)` here - that call would only collapse the
# joined result into a single partition; it is unrelated to the column
# function `coalesce` used below.
updated_df = (
    df.join(
        changed_records.selectExpr("inc_id as id", "inc_name", "inc_age", "inc_salary"),
        on='id',
        how='left_outer',
    )
    # For each row, if the incremental column has a non-null value, it replaces
    # the origin value; otherwise the origin value is conserved.
    .withColumn("name", coalesce(col("inc_name"), col("name")))
    .withColumn("age", coalesce(col("inc_age"), col("age")))
    .withColumn("salary", coalesce(col("inc_salary"), col("salary")))
    # The helper columns are no longer needed once the values are merged
    .drop("inc_name", "inc_age", "inc_salary")
)

# Show the updated DataFrame
print("Updated DataFrame:")
updated_df.show()


Updated DataFrame:
+---+-----+---+------+
| id| name|age|salary|
+---+-----+---+------+
|  1|Alice| 30| 52000|
|  3| Hari| 28| 47000|
|  2|  Bob| 29| 65000|
+---+-----+---+------+


### 2.2 Add the new records

Now we only need to add the new records to build the final results.

In [11]:
# Combine the updated DataFrame with the new records.
# `new_records` still carries the `inc_*` column names, so they are renamed to
# the origin names and the union is resolved by column name instead of relying
# on both DataFrames having the same column order.
final_df = updated_df.unionByName(
    new_records.selectExpr(
        "inc_id as id",
        "inc_name as name",
        "inc_age as age",
        "inc_salary as salary",
    )
)

# Print the header for the final DataFrame
print("Final DataFrame:")


Final DataFrame:


In [12]:
# Display the final DataFrame (updated origin rows followed by the new rows)
final_df.show()

                                                                                

+---+-----+---+------+
| id| name|age|salary|
+---+-----+---+------+
|  1|Alice| 30| 52000|
|  3| Hari| 28| 47000|
|  2|  Bob| 29| 65000|
|  4| Dave| 40| 70000|
+---+-----+---+------+


In [13]:
# Stop the Spark session created at the top of the notebook
spark.stop()