In [0]:
from pyspark.sql import functions as f
from delta.tables import *
import re
import json
import datetime
import time

In [0]:
# Notebook parameters. Every value is read from a notebook widget and arrives as a string.

# --- Source: location and format of the incoming file ---
storageAccountName = dbutils.widgets.get("dataLakeStorageAccountName")
raw_folderpath = dbutils.widgets.get("raw_folderpath")
raw_filename = dbutils.widgets.get("raw_filename")
file_type = dbutils.widgets.get("file_type")

# --- Keys and partitioning ---
# primary_key_cols and partition_cols hold a list written out as text; they are
# parsed into real lists in later cells.
primary_key_cols = dbutils.widgets.get("primary_key_cols")
partition_cols = dbutils.widgets.get("partition_cols")
date_partition_column = dbutils.widgets.get("date_partition_column")

# --- Sink: where the Delta table is stored and registered ---
sinkFolderPath = dbutils.widgets.get("sinkFolderPath")
unityCatalogName = dbutils.widgets.get("unityCatalogName")
unitySchemaName = dbutils.widgets.get("unitySchemaName")
# The table name is normalised to lowercase before any use.
tableName = dbutils.widgets.get("tableName").lower()

In [0]:
# partition_cols arrives as text and may use single quotes around the column names.
# JSON only accepts double quotes, so swap them before decoding the text into a list.
partition_cols_json = partition_cols.replace("'", '"')
partition_cols_list = json.loads(partition_cols_json)

### Set Active Catalog and Schema

In [0]:
%sql
-- Make the target catalog and schema the session defaults, so that the unqualified
-- table name used by saveAsTable later in this notebook resolves inside them.
-- NOTE(review): the two names are presumably substituted from the notebook widgets
-- unityCatalogName and unitySchemaName -- confirm.
USE CATALOG ${unityCatalogName};

USE SCHEMA ${unitySchemaName};

### Read Input Data

In [0]:
# Drop the first four characters of the folder path.
# NOTE(review): presumably this strips a leading "raw/" container prefix -- confirm with the caller.
raw_folderpathWithoutContainer = raw_folderpath[4:]

# Keep everything up to and including the ".../v<N>/" segment. A path without such a
# segment is returned unchanged by re.sub.
stagingFolderPath = re.sub(r'(.*/v[0-9]+/).*', r'\1', raw_folderpathWithoutContainer)

# Extract the version token itself (the "v<N>" part). Fail with a descriptive message
# instead of an AttributeError on None when the path carries no version segment.
version_match = re.search(r'\/(v\d+)\/', stagingFolderPath)
if version_match is None:
    raise ValueError(
        f"Could not find a '/v<number>/' segment in raw_folderpath '{raw_folderpath}'"
    )
version = version_match.group(1)

# Folder of the incoming data inside the `raw` container of the storage account.
raw_zone_folder = f'abfss://raw@{storageAccountName}.dfs.core.windows.net/{raw_folderpathWithoutContainer}'

if file_type == 'json':
    raw_data = spark.read.json(f'{raw_zone_folder}{raw_filename}')
elif file_type == 'parquet':
    raw_data = spark.read.parquet(f'{raw_zone_folder}{raw_filename}')
else:
    # Any other file_type: load the folder itself with Spark's default data source;
    # raw_filename is not part of the path in this branch.
    raw_data = spark.read.load(raw_zone_folder)

### Add Date Partition Columns If Necessary

In [0]:
# Derived date-part columns paired with the Spark function that computes each one,
# listed in the order in which they are appended to the DataFrame.
date_part_extractors = (
    ('CalcYear', f.year),
    ('CalcMonth', f.month),
    ('CalcDayOfMonth', f.dayofmonth),
)

for part_name, extract_part in date_part_extractors:
    # Only add a date part that was requested as a partition column.
    if part_name in partition_cols_list:
        raw_data = raw_data.withColumn(part_name, extract_part(f.col(date_partition_column)))

### Set Output Path

In [0]:
# Root of the `staging` container in the storage account; the table's files live under
# sinkFolderPath inside it.
staging_container_root = f'abfss://staging@{storageAccountName}.dfs.core.windows.net'
stagingAbfssPath = f'{staging_container_root}/{sinkFolderPath}'

### Determine if Delta Table Exists

In [0]:
# Look the table up in the catalog's information schema; `.first()` returns None when
# the query yields no row.
# The schema name is lowercased for the comparison, matching how tableName is already
# lowercased when it is read: the string comparison is case-sensitive, and Unity Catalog
# is documented to store object names in lowercase, so a mixed-case parameter would
# otherwise never match an existing table.
informationSchemaQuery = spark.sql(f"""
            SELECT  table_name
            FROM    {unityCatalogName}.information_schema.tables
            WHERE   table_name = '{tableName}'
            AND     table_schema='{unitySchemaName.lower()}' LIMIT 1""",
        ).first()

# True when the lookup found the table, False otherwise.
tableExists = informationSchemaQuery is not None

### Add Filename, Timestamp, Version Columns

In [0]:
# Full raw-zone location of the ingested file (folder path followed by file name).
location_in_raw = raw_folderpath + raw_filename

# Load time, taken from the driver clock and rendered as text so that it can be
# embedded as a literal and parsed back into a timestamp column.
now = str(datetime.datetime.now())

# Stamp every row with the control-file version, its raw-zone origin and the load time.
raw_data = (
    raw_data
    .withColumn('control_file_version', f.lit(version))
    .withColumn('location_in_raw', f.lit(location_in_raw))
    .withColumn('timestamp', f.to_timestamp(f.lit(now)))
)

### If table does **not** exist, create new delta table
- Either with or without partitions

In [0]:
# First load: write the data as a new Delta table stored at stagingAbfssPath and
# registered under tableName.
if not tableExists:
    delta_writer = raw_data.write.format("delta")

    # Decide on partitioning from the parsed list rather than by comparing the raw
    # parameter text with "[]", which breaks on harmless variations such as "[ ]".
    if partition_cols_list:
        delta_writer = delta_writer.partitionBy(partition_cols_list)

    delta_writer.option("path", stagingAbfssPath).saveAsTable(tableName)

### Format primary key columns for merge statement

In [0]:
# Convert parameter primary_key_cols from its text form to a list. Single quotes are
# swapped for double quotes first so that the text is valid JSON.
primary_key_cols_list = json.loads(primary_key_cols.replace("'",'"'))

# A merge needs at least one key column; fail with a clear message instead of an
# IndexError on the empty list.
if not primary_key_cols_list:
    raise ValueError("primary_key_cols must name at least one column to build the merge condition")

# Build the merge condition: one equality per key column between the existing table
# (alias `current`) and the incoming data (alias `new`), combined with AND.
mergeOn = ' AND '.join(
    f'current.{primary_key_col} = new.{primary_key_col}'
    for primary_key_col in primary_key_cols_list
)

### If Table Exists, Merge New Data Into Existing Delta Table

In [0]:
# Allow the merge to evolve the target table's schema.
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled","true")

if tableExists:
    deltaTablePointer = DeltaTable.forPath(spark, stagingAbfssPath)

    # Drop struct/array columns that contain no values at all in this batch.
    # The previous check summed isNull() and dropped columns whose sum was 0, i.e. the
    # columns with NO nulls -- the opposite of what is intended here. A column is
    # entirely null when its non-null count is 0.
    array_struct_cols = [
        col_name
        for col_name, col_type in raw_data.dtypes
        if 'struct' in col_type or 'array' in col_type
    ]
    null_cols = []
    if array_struct_cols:
        # One aggregation for all candidate columns instead of one Spark job per column.
        # count(col) counts non-null values only; count(lit(1)) counts all rows.
        counts = raw_data.agg(
            f.count(f.lit(1)),
            *[f.count(f.col(col_name)) for col_name in array_struct_cols]
        ).first()
        total_rows, non_null_counts = counts[0], counts[1:]
        # As before, an empty batch drops nothing.
        if total_rows > 0:
            null_cols = [
                col_name
                for col_name, non_null in zip(array_struct_cols, non_null_counts)
                if non_null == 0
            ]
    raw_data = raw_data.drop(*null_cols)

    # Upsert: update rows whose keys match, insert the rest. Try up to three times with
    # a 30 second pause between attempts; the error from the last attempt propagates.
    # NOTE(review): the retry presumably targets concurrent-write conflicts -- consider
    # narrowing the caught exception type once that is confirmed.
    max_attempts = 3
    retry_wait_seconds = 30
    for attempt in range(1, max_attempts + 1):
        try:
            deltaTablePointer.alias("current").merge(
                raw_data.alias("new"), mergeOn) \
                .whenMatchedUpdateAll() \
                .whenNotMatchedInsertAll() \
                .execute()
            break
        except Exception:
            # `except Exception` (not a bare except) so that a notebook cancel or
            # interpreter exit is not swallowed and retried.
            if attempt == max_attempts:
                raise
            time.sleep(retry_wait_seconds)