
## Auto Loader

https://learn.microsoft.com/en-us/azure/databricks/ingestion/auto-loader/

## Change data feed
https://docs.delta.io/latest/delta-change-data-feed.html


## Steps

1. Generate some csv
2. Read Stream and create bronze table
3. Enable change data feed on the bronze table
4. Inspect history of bronze table
5. Display content of bronze table
6. Display the content of change data feed and understand how it works
7. Make some changes on the table
8. Display and inspect the change data feed again
9. Create pipeline functions to read the CDF, apply the transformation, and merge into a Silver table
10. Create Silver Table Schema
11. Create a readStream on the CDF and use foreachBatch to apply the ETL pipeline
12. Create more records and run the Autoloader and the CDF stream code
13. Inspect the logs
14. Clear checkpoint of cdf stream
15. Clean Everything

In [0]:
# setup logs: DEBUG and above go to a dated file under /tmp/ and to the notebook output
import logging
import time
import datetime

logfile_prefix = 'incremental_autoload_cdf'
# one log file per calendar day (local time)
file_date = datetime.datetime.now().strftime('%Y-%m-%d')
p_dir = '/tmp/'
p_file_name = f'{logfile_prefix}_{file_date}.log'
log_file = f'{p_dir}{p_file_name}'
logger = logging.getLogger('custom_log')
logger.setLevel(logging.DEBUG)

# getLogger returns the same object when this cell is re-run, so detach AND close
# whatever a previous run attached; clearing the list alone would leave the old
# log file handle open. Only this logger's own handlers are inspected
# (hasHandlers() would also report handlers of ancestor loggers).
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

file_handler = logging.FileHandler(log_file, mode='a')
stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(levelname)s:%(asctime)s:%(message)s')
file_handler.setFormatter(formatter)
stream_handler.setFormatter(formatter)

# add handlers

logger.addHandler(file_handler)
logger.addHandler(stream_handler)
logger.debug("starting logger")

DEBUG:2023-12-05 08:26:44,679:starting logger


In [0]:
# import packages

from pyspark.sql.functions import (lit,
                                   col, 
                                   row_number,
                                   concat,
                                   desc
)

from delta.tables import *
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, LongType, TimestampType

In [0]:
# load files using autoloader from a mounted folder

# the same folder holds both the inferred schema and the stream checkpoint
checkpoint_path = "dbfs:/mnt/databricks/_checkpoint/customers_bronze"

bronze_query = (spark.readStream
  .format("cloudFiles")
  .option("cloudFiles.format", "csv")
  .option("cloudFiles.schemaLocation", checkpoint_path)
  .load("dbfs:/mnt/databricks/customers")
  .writeStream
  .option("checkpointLocation", checkpoint_path)
  .trigger(availableNow=True)
  .toTable("customers_bronze"))

# toTable() only starts the query; wait for the availableNow run to finish so the
# cells below (ALTER TABLE, history, reads) see the ingested table under "Run All"
bronze_query.awaitTermination()


<pyspark.sql.streaming.query.StreamingQuery at 0x7f0f2cf62740>

In [0]:
%sql
-- Enable the Delta change data feed on the bronze table by setting its table property
ALTER TABLE customers_bronze SET TBLPROPERTIES (delta.enableChangeDataFeed = true)


In [0]:

# DeltaTable handle on the bronze table; used here to inspect its commit history
customers_deltaTable = DeltaTable.forName(spark, "customers_bronze")
fullHistoryDF = customers_deltaTable.history()    # get the full history of the table
lastOperationDF = customers_deltaTable.history(1) # get the last operation
# only the full history is displayed; lastOperationDF is kept as an example of limiting the history
fullHistoryDF.display()

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
4,2023-12-05T08:44:30Z,2819532067442980,pedro.junqueira@agile-analytics.com.au,STREAMING UPDATE,"Map(outputMode -> Append, queryId -> 11954145-67ae-458c-9ebc-997169cce15c, epochId -> 1)",,List(762933502808922),1107-222207-i96cpubv,3.0,WriteSerializable,True,"Map(numRemovedFiles -> 0, numOutputRows -> 60, numOutputBytes -> 7951, numAddedFiles -> 3)",,Databricks-Runtime/13.3.x-photon-scala2.12
3,2023-12-05T08:32:59Z,2819532067442980,pedro.junqueira@agile-analytics.com.au,UPDATE,"Map(predicate -> [""(cast(id#1769 as int) = 1)""])",,List(762933502808922),1107-222207-i96cpubv,2.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 2619, numCopiedRows -> 19, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 1, executionTimeMs -> 3386, scanTimeMs -> 2261, numAddedFiles -> 1, numUpdatedRows -> 1, numAddedBytes -> 2779, rewriteTimeMs -> 1104)",,Databricks-Runtime/13.3.x-photon-scala2.12
2,2023-12-05T08:30:20Z,2819532067442980,pedro.junqueira@agile-analytics.com.au,SET TBLPROPERTIES,"Map(properties -> {""delta.enableChangeDataFeed"":""true""})",,List(762933502808922),1107-222207-i96cpubv,1.0,WriteSerializable,True,Map(),,Databricks-Runtime/13.3.x-photon-scala2.12
1,2023-12-05T08:29:31Z,2819532067442980,pedro.junqueira@agile-analytics.com.au,STREAMING UPDATE,"Map(outputMode -> Append, queryId -> 11954145-67ae-458c-9ebc-997169cce15c, epochId -> 0)",,List(762933502808922),1107-222207-i96cpubv,0.0,WriteSerializable,True,"Map(numRemovedFiles -> 0, numOutputRows -> 20, numOutputBytes -> 2619, numAddedFiles -> 1)",,Databricks-Runtime/13.3.x-photon-scala2.12
0,2023-12-05T08:29:02Z,2819532067442980,pedro.junqueira@agile-analytics.com.au,CREATE TABLE,"Map(partitionBy -> [], description -> null, isManaged -> true, properties -> {}, statsOnLoad -> false)",,List(762933502808922),1107-222207-i96cpubv,,WriteSerializable,True,Map(),,Databricks-Runtime/13.3.x-photon-scala2.12


In [0]:
# current content of the bronze table as a DataFrame
customers_df = customers_deltaTable.toDF()
# NOTE(review): the displayed order is 1, 10, 11, ... which suggests id is a string
# column and the sort is lexicographic rather than numeric
(customers_df
    .orderBy('id')
    .display())

id,first_name,last_name,email,city,_rescued_data
1,Sarah,Santiago,susanstark@example.org,Port Lauren,
10,Ashley,Young,jason26@example.net,Port Robert,
11,Michael,Smith,andrew67@example.net,Guerrachester,
12,Kenneth,Taylor,anthony97@example.net,Jamesmouth,
13,Derrick,Gordon,amanda84@example.org,Timothymouth,
14,Tina,Hopkins,kevin08@example.net,Acostaside,
15,William,Jones,hsmith@example.org,Dawnmouth,
16,Tony,Martinez,joshua70@example.net,New Dwaynemouth,
17,Nichole,Clark,sbarnett@example.org,Port Shawn,
18,Matthew,Shaw,qjohnson@example.org,West William,


In [0]:
# reading the delta change feed (batch reads) of the bronze table

# by version: return every change committed from table version 2 onwards
# NOTE(review): 2 is hard-coded; in the history shown above it is the SET TBLPROPERTIES
# commit that enabled the change data feed - adjust for other runs

cdf_version = (
    spark.read.format("delta")
    .option("readChangeFeed", "true")
    .option("startingVersion", 2)
    .table("customers_bronze")
)

# by timestamp: same read, but the starting point is given as a commit timestamp

# the timestamp is passed as a formatted string
# NOTE(review): hard-coded; it matches the timestamp of version 2 in the history shown above
cdf_stamp = (spark.read.format("delta")
  .option("readChangeFeed", "true")
  .option("startingTimestamp", '2023-12-05T08:30:20Z')
  .table("customers_bronze")
)

# show both results to compare the two ways of choosing the starting point
cdf_version.display()
cdf_stamp.display()

id,first_name,last_name,email,city,_rescued_data,_change_type,_commit_version,_commit_timestamp
1,Sarah,Spider,susanstark@example.org,Port Lauren,,update_postimage,3,2023-12-05T08:32:59Z
1,Sarah,Santiago,susanstark@example.org,Port Lauren,,update_preimage,3,2023-12-05T08:32:59Z
61,Jeffrey,Ward,csmith@example.net,Nataliefurt,,insert,4,2023-12-05T08:44:30Z
62,Nicole,Shepard,melanie60@example.net,Port Lorifort,,insert,4,2023-12-05T08:44:30Z
63,Francisco,Thomas,staceywest@example.org,Port Jessica,,insert,4,2023-12-05T08:44:30Z
64,Kaitlyn,Lang,susansanchez@example.com,Lake Kathyfurt,,insert,4,2023-12-05T08:44:30Z
65,Christopher,Bridges,stewartcharles@example.org,North Sarabury,,insert,4,2023-12-05T08:44:30Z
66,Samuel,Adams,collin19@example.com,Millerton,,insert,4,2023-12-05T08:44:30Z
67,Robert,Mason,joshuagraves@example.org,Andersonshire,,insert,4,2023-12-05T08:44:30Z
68,Kimberly,Sparks,melaniebrown@example.org,Edwardville,,insert,4,2023-12-05T08:44:30Z


id,first_name,last_name,email,city,_rescued_data,_change_type,_commit_version,_commit_timestamp
1,Sarah,Spider,susanstark@example.org,Port Lauren,,update_postimage,3,2023-12-05T08:32:59Z
1,Sarah,Santiago,susanstark@example.org,Port Lauren,,update_preimage,3,2023-12-05T08:32:59Z
61,Jeffrey,Ward,csmith@example.net,Nataliefurt,,insert,4,2023-12-05T08:44:30Z
62,Nicole,Shepard,melanie60@example.net,Port Lorifort,,insert,4,2023-12-05T08:44:30Z
63,Francisco,Thomas,staceywest@example.org,Port Jessica,,insert,4,2023-12-05T08:44:30Z
64,Kaitlyn,Lang,susansanchez@example.com,Lake Kathyfurt,,insert,4,2023-12-05T08:44:30Z
65,Christopher,Bridges,stewartcharles@example.org,North Sarabury,,insert,4,2023-12-05T08:44:30Z
66,Samuel,Adams,collin19@example.com,Millerton,,insert,4,2023-12-05T08:44:30Z
67,Robert,Mason,joshuagraves@example.org,Andersonshire,,insert,4,2023-12-05T08:44:30Z
68,Kimberly,Sparks,melaniebrown@example.org,Edwardville,,insert,4,2023-12-05T08:44:30Z


In [0]:
# make a change on the bronze table so that it shows up in the change data feed:
# set last_name to 'Spider' for the row(s) whose id equals 1
customers_bronze_delta = DeltaTable.forName(spark, 'customers_bronze')
(customers_bronze_delta
 .update(
     condition = col('id') == 1 ,
     set = {'last_name': lit('Spider')}
    )
 )

In [0]:
# re-read the change feed from the same starting timestamp to inspect the changes made so far
# NOTE(review): the timestamp is hard-coded to a value from a specific run - adjust for other runs
cdf_stamp = (spark.read.format("delta")
  .option("readChangeFeed", "true")
  .option("startingTimestamp", '2023-12-05T08:30:20Z')
  .table("customers_bronze")
)
cdf_stamp.display()

id,first_name,last_name,email,city,_rescued_data,_change_type,_commit_version,_commit_timestamp
1,Sarah,Spider,susanstark@example.org,Port Lauren,,update_postimage,3,2023-12-05T08:32:59Z
1,Sarah,Santiago,susanstark@example.org,Port Lauren,,update_preimage,3,2023-12-05T08:32:59Z
61,Jeffrey,Ward,csmith@example.net,Nataliefurt,,insert,4,2023-12-05T08:44:30Z
62,Nicole,Shepard,melanie60@example.net,Port Lorifort,,insert,4,2023-12-05T08:44:30Z
63,Francisco,Thomas,staceywest@example.org,Port Jessica,,insert,4,2023-12-05T08:44:30Z
64,Kaitlyn,Lang,susansanchez@example.com,Lake Kathyfurt,,insert,4,2023-12-05T08:44:30Z
65,Christopher,Bridges,stewartcharles@example.org,North Sarabury,,insert,4,2023-12-05T08:44:30Z
66,Samuel,Adams,collin19@example.com,Millerton,,insert,4,2023-12-05T08:44:30Z
67,Robert,Mason,joshuagraves@example.org,Andersonshire,,insert,4,2023-12-05T08:44:30Z
68,Kimberly,Sparks,melaniebrown@example.org,Edwardville,,insert,4,2023-12-05T08:44:30Z


In [0]:
# get only the last version of the records that are either inserted or updated
def cdf_id_last_version(df):
    """Keep, for every id, the most recent insert/update row of a change-feed DataFrame.

    Args:
        df: DataFrame read with readChangeFeed; must contain the columns `id`,
            `_change_type`, `_commit_version` and `_commit_timestamp`.

    Returns:
        DataFrame with at most one row per id, plus a helper column `rnk`
        (always 1 in the result).

    Rows whose `_change_type` is "delete" or "update_preimage" are discarded
    before ranking, so deletions are not propagated: for an id that was deleted,
    its latest insert/update row (if any) is still returned.
    """
    upserts_df = df.filter(~col("_change_type").isin("delete", "update_preimage"))

    # Rank newest first. The commit version is the primary key of the ordering so
    # that two commits sharing a timestamp still rank deterministically; the
    # timestamp is kept as a secondary key.
    newest_first = (Window
        .partitionBy("id")
        .orderBy(desc("_commit_version"), desc("_commit_timestamp"))
    )

    ranked_df = upserts_df.withColumn("rnk", row_number().over(newest_first))

    return ranked_df.filter(col("rnk") == 1)

In [0]:
last_version = cdf_id_last_version(cdf_stamp)
last_version.display()

id,first_name,last_name,email,city,_rescued_data,_change_type,_commit_version,_commit_timestamp,rnk
1,Sarah,Spider,susanstark@example.org,Port Lauren,,update_postimage,3,2023-12-05T08:32:59Z,1
100,Patrick,Green,swansonmelissa@example.org,East Stephen,,insert,4,2023-12-05T08:44:30Z,1
21,Dylan,Hayes,jtate@example.net,Hooperborough,,insert,4,2023-12-05T08:44:30Z,1
22,William,Avila,jimmyblackburn@example.org,Jayland,,insert,4,2023-12-05T08:44:30Z,1
23,Lisa,Adams,shenderson@example.org,Lake Joseph,,insert,4,2023-12-05T08:44:30Z,1
24,Laura,Hughes,jameswilliams@example.net,Pamstad,,insert,4,2023-12-05T08:44:30Z,1
25,Harold,Jackson,barry05@example.com,West Andrea,,insert,4,2023-12-05T08:44:30Z,1
26,Stephanie,Sims,wolfeanthony@example.net,West David,,insert,4,2023-12-05T08:44:30Z,1
27,Christopher,Wright,coxlouis@example.net,Leslietown,,insert,4,2023-12-05T08:44:30Z,1
28,Donald,Delgado,ylewis@example.org,Lake Patrickhaven,,insert,4,2023-12-05T08:44:30Z,1


In [0]:
# silver transformation
def silver_etl(df):
    """Return `df` with an extra `full_name` column.

    `full_name` is `first_name`, a single space and `last_name` concatenated.
    """
    full_name_expr = concat(col('first_name'), lit(' '), col('last_name'))
    enriched_df = df.withColumn('full_name', full_name_expr)
    return enriched_df
    

In [0]:
# schema of the silver table: the bronze columns plus the derived full_name, all nullable strings
schema = StructType([
    StructField("id", StringType(), True),
    StructField("first_name", StringType(), True),
    StructField("last_name", StringType(), True),
    StructField("full_name", StringType(), True),
    StructField("email", StringType(), True),
    StructField("city", StringType(), True),
    StructField("_rescued_data", StringType(), True)
])
# An empty list with an explicit schema yields the same empty DataFrame without
# going through sparkContext / the RDD API
empty_df = spark.createDataFrame([], schema)
# Write the empty DataFrame as a Delta table
# (mode "overwrite": re-running this cell replaces the existing table content)
empty_df.write.format("delta").mode("overwrite").saveAsTable("customers_silver")

In [0]:
SINK_TABLE = "customers_silver"
def etl_and_merge_to_silver(df, batch_id):
    """foreachBatch handler: upsert one micro-batch of change-feed rows into SINK_TABLE.

    Args:
        df: micro-batch DataFrame read from the change data feed of the source table.
        batch_id: identifier of the micro-batch; only used in log messages.

    For a non-empty batch the latest insert/update per id is selected
    (cdf_id_last_version), transformed (silver_etl) and merged into the sink on
    `id`: matching rows are updated, new ids are inserted. Record counts of the
    sink are logged before and after the merge. An empty batch is only logged.
    """
    if df.isEmpty():
        logger.info(f'batch_id: {batch_id} No records to load ')
        return

    # use the session the micro-batch belongs to rather than the notebook global
    batch_spark = df.sparkSession

    sink_table_delta = DeltaTable.forName(batch_spark, SINK_TABLE)
    df_transactions_sink = sink_table_delta.toDF()
    logger.info(f'sink delta table has now {df_transactions_sink.count()} records')

    last_version_records = cdf_id_last_version(df)
    source_transformed = silver_etl(last_version_records)
    logger.info(f'batch_id: {batch_id} with source table of {source_transformed.count()} records')

    # The change-feed bookkeeping columns and the ranking helper are not part of
    # the silver schema; drop them so UpdateAll/InsertAll only ever sees business
    # columns (otherwise they could be added to the sink if schema evolution is on).
    merge_source = source_transformed.drop(
        "_change_type", "_commit_version", "_commit_timestamp", "rnk"
    )

    (sink_table_delta.alias('target')
        .merge(merge_source.alias('source'),
                "source.id = target.id")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )

    # take a fresh handle so the count reflects the table after the merge
    sink_table_delta = DeltaTable.forName(batch_spark, SINK_TABLE)
    df_transactions_sink = sink_table_delta.toDF()
    logger.info(f'sink delta table has now {df_transactions_sink.count()} records')

In [0]:
# read the change data feed of the bronze table as a stream and merge each batch into silver
SOURCE_TABLE = "customers_bronze"
checkpoint_path = "dbfs:/mnt/databricks/_checkpoint/silver_customers_cdf"

silver_query = (spark.readStream
  .format("delta")
  .option("readChangeFeed", "true")
  .table(SOURCE_TABLE)
  .writeStream
  .foreachBatch(etl_and_merge_to_silver)
  .option("checkpointLocation", checkpoint_path)
  .trigger(availableNow=True)
  .start()
  )

# start() returns immediately; wait for the availableNow run to finish so the
# following cells query the silver table only after the merge has completed
silver_query.awaitTermination()

<pyspark.sql.streaming.query.StreamingQuery at 0x7f0f2cf61f30>

In [0]:
%sql
-- Inspect the silver table after the merge.
-- id is a string column, so the ordering is lexicographic (1, 10, 11, ...), not numeric
select * from customers_silver
order by id

id,first_name,last_name,full_name,email,city,_rescued_data
1,Sarah,Spider,Sarah Spider,susanstark@example.org,Port Lauren,
10,Ashley,Young,Ashley Young,jason26@example.net,Port Robert,
11,Michael,Smith,Michael Smith,andrew67@example.net,Guerrachester,
12,Kenneth,Taylor,Kenneth Taylor,anthony97@example.net,Jamesmouth,
13,Derrick,Gordon,Derrick Gordon,amanda84@example.org,Timothymouth,
14,Tina,Hopkins,Tina Hopkins,kevin08@example.net,Acostaside,
15,William,Jones,William Jones,hsmith@example.org,Dawnmouth,
16,Tony,Martinez,Tony Martinez,joshua70@example.net,New Dwaynemouth,
17,Nichole,Clark,Nichole Clark,sbarnett@example.org,Port Shawn,
18,Matthew,Shaw,Matthew Shaw,qjohnson@example.org,West William,


In [0]:
# copy the log file from the driver's local disk to a permanent location
source_path = f'file:{log_file}'
destination_path = f"dbfs:/mnt/databricks/logs/{log_file.split('/')[-1]}"

# A single copy is enough: the previous "existence check" was itself a copy to the
# destination, followed by a delete and a second identical copy, so the end state
# (destination holds the current log file) is unchanged.
dbutils.fs.cp(source_path, destination_path)

In [0]:
%sql
-- Clean up: drop the bronze and silver tables created by this notebook.
-- NOTE(review): only the tables are dropped here; the checkpoint folders under
-- dbfs:/mnt/databricks/_checkpoint/ are left in place - confirm they are cleared
-- elsewhere before re-running, otherwise the streams may skip already-seen input
drop table if exists customers_bronze;
drop table if exists customers_silver;