In [10]:
from util import *
import pyspark.sql.functions as f
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import os

# Root of the data_pipeline checkout. It can be overridden with the
# DATA_PIPELINE_HOME environment variable so the notebook runs on machines
# other than the original author's; when the variable is unset, the paths
# below resolve to exactly the same strings as the previous hard-coded values.
project_root = os.environ.get(
    "DATA_PIPELINE_HOME", "C:/Users/parth/Desktop/workspace/data_pipeline"
).rstrip("/")
sample_data_dir = project_root + "/resources/sample_data"

# Input CSV extracts: the initial load and the later incremental batch.
initial_data_file = sample_data_dir + "/customers.csv"
updated_data_file = sample_data_dir + "/customers_incremental.csv"

# Location of the Hudi table written and read by the cells below.
output_path = project_root + "/output/customers_incremental_with_history"
table_name = "customers_incremental_with_history"

# Hudi write settings.
# NOTE(review): "parition" (sic) is kept as spelled because `parition_from`
# is read by the transform cells further down; renaming it here alone would
# break them.
primary_key_field = "id"
parition_field = "date"
precombine_field = "updated_date"
spark_write_mode = "append"
# Source column from which the "date" partition value is derived.
parition_from = "updated_date"

# NOTE(review): get_incremental_options is not defined in this notebook;
# presumably it is provided by `from util import *` - confirm in util.
hudi_options = get_incremental_options(table_name, primary_key_field, parition_field, precombine_field)
# Bare expression so the notebook displays the resolved options.
hudi_options

{'hoodie.table.name': 'customers_incremental_with_history',
 'hoodie.datasource.write.recordkey.field': 'id',
 'hoodie.datasource.write.partitionpath.field': 'date',
 'hoodie.datasource.write.table.name': 'customers_incremental_with_history',
 'hoodie.datasource.write.operation': 'upsert',
 'hoodie.datasource.write.precombine.field': 'updated_date'}

In [11]:
import os
import shutil

# Remove whatever a previous run left at output_path so the table is
# rebuilt from scratch by the write cells that follow.
stale_output_present = os.path.exists(output_path)
if stale_output_present:
    shutil.rmtree(output_path)

In [12]:
# Create the Spark session that the CSV and Hudi read cells below use.
# NOTE(review): get_spark_with_hudi is not defined in this notebook;
# presumably it comes from `from util import *` - confirm what it configures.
spark = get_spark_with_hudi()

In [13]:
# Load the initial customer extract: the first row is the header and Spark
# infers the column types.
source_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(initial_data_file)
)

timestamp_columns = ["created_date", "updated_date"]
# Load timestamp stamped onto every row of this batch.
w_create_ts = "2023-01-01 00:00:00"

# Cast the date columns to proper timestamps, one column at a time.
for ts_column in timestamp_columns:
    source_df = source_df.withColumn(ts_column, f.col(ts_column).cast("timestamp"))

# "date" is the yyyyMMdd rendering of the column named by parition_from;
# "w_create_ts" carries the batch load timestamp.
partition_value = f.date_format(f.col(parition_from), "yyyyMMdd")
batch_timestamp = f.lit(w_create_ts).cast("timestamp")
processed_df = (
    source_df
    .withColumn("date", partition_value)
    .withColumn("w_create_ts", batch_timestamp)
)

In [14]:
# Write the prepared initial batch to the Hudi table at output_path, using
# the write mode and Hudi options set up in the configuration cell.
hudi_writer = (
    processed_df.write
    .format("hudi")
    .options(**hudi_options)
    .mode(spark_write_mode)
)
hudi_writer.save(output_path)

In [15]:
# Read the Hudi table back to inspect what the initial load produced.
target_df = spark.read.format("hudi").load(output_path)

# Preview at most 10 rows. The limit is applied in Spark *before* converting
# to pandas so only those rows are collected to the driver, instead of
# pulling the whole table with toPandas() and truncating afterwards.
(
    target_df
    .select("id", "name", "email", "created_date", "updated_date", "w_create_ts")
    .limit(10)
    .toPandas()
)

Unnamed: 0,id,name,email,created_date,updated_date,w_create_ts
0,2,Brocky Spurret,bspurret1@npr.org,2023-01-01 05:35:00,2023-01-01 05:35:00,2023-01-01
1,1,Jammie McCamish,jmccamish0@devhub.com,2023-01-01 03:17:00,2023-01-01 03:17:00,2023-01-01


In [16]:
# Load the incremental customer extract with the same CSV settings as the
# initial load (header row, inferred column types).
csv_reader = spark.read.options(header="true", inferSchema="true").format("csv")
source_df = csv_reader.load(updated_data_file)

# Load timestamp stamped onto every row of this second batch.
w_create_ts = "2023-01-02 00:00:00"
timestamp_columns = ["created_date", "updated_date"]

# Cast the date columns to proper timestamps, one column at a time.
for ts_column in timestamp_columns:
    as_timestamp = f.col(ts_column).cast("timestamp")
    source_df = source_df.withColumn(ts_column, as_timestamp)

# Add the yyyyMMdd "date" column derived from parition_from, then the
# batch load timestamp.
with_partition_df = source_df.withColumn(
    "date", f.date_format(f.col(parition_from), "yyyyMMdd")
)
processed_df = with_partition_df.withColumn(
    "w_create_ts", f.lit(w_create_ts).cast("timestamp")
)

In [17]:
# Write the incremental batch to the same Hudi table as the initial load,
# with the same write mode and Hudi options.
(
    processed_df.write.format("hudi")
    .mode(spark_write_mode)
    .options(**hudi_options)
    .save(output_path)
)

In [20]:
# Read the Hudi table back after the incremental write.
target_df = spark.read.format("hudi").load(output_path)

# Preview at most 10 rows. The limit is applied in Spark *before* converting
# to pandas so only those rows are collected to the driver, instead of
# pulling the whole table with toPandas() and truncating afterwards.
# NOTE(review): the output saved under this cell includes an `is_active`
# column that this select does not request, and the execution counter skips
# from 17 to 20 - the stored output looks stale; re-run to refresh it.
(
    target_df
    .select("id", "name", "email", "created_date", "updated_date", "w_create_ts")
    .limit(10)
    .toPandas()
)

+---+---------------+--------------------+-------------------+-------------------+-------------------+---------+
| id|           name|               email|       created_date|       updated_date|        w_create_ts|is_active|
+---+---------------+--------------------+-------------------+-------------------+-------------------+---------+
|  2| Brocky Spurret|   bspurret1@npr.org|2023-01-01 05:35:00|2023-01-01 05:35:00|2023-01-01 00:00:00|     true|
|  1|Jammie McCamish|jmccamish0@devhub...|2023-01-01 03:17:00|2023-01-01 03:17:00|2023-01-01 00:00:00|     true|
|  1|Jammie McCamish|jmccamish0@develo...|2023-01-01 03:17:00|2023-01-02 03:17:00|2023-01-02 00:00:00|     true|
+---+---------------+--------------------+-------------------+-------------------+-------------------+---------+

