In [4]:
# ---------------------------------------------------------------------------------
# Notebook parameters. The values below are sample/default values for a smoke test;
# presumably they are overridden by the calling notebook/pipeline -- TODO confirm.
# ---------------------------------------------------------------------------------

# run identifiers
uuid = "dec2a899-fbdc-4e2b-8c86-ff3486f014a2"
az_id = "smoke_test_smoke_test_oracle_oracle_append"

# source: parquet file to load from
parquet_file_path = "abfss://temp@fedsaentanalytics.dfs.core.windows.net/fe_ingest_framework/276d04d1-05b6-42b7-ba78-aef87b4f0c9f/data/smoke_test_smoke_test_oracle_oracle_append/20231204082421_smoke_test_smoke_test_oracle_oracle_append_051266eeb0d44e0482a0dd68061037a4.parquet"

# destination: container, database / schema / table names and the table path inside the container
table_container = "bronze"
dest_db_name = "smoke_test"
dest_schema_name = "oracle"
dest_table_name = "oracle_append"
table_path = "smoke_test/smoke_test/oracle/oracle_append/"

# load behaviour: mode, whether to initialize the table, and mode-specific settings
mode = "append"
init_flag = True
primary_key = None      # comma delimited column list (split on "," by the load cell)
partition_by = None     # must be set for mode "partition+overwrite"
del_filter = None       # must be set for mode "del+insert"
view_col_config = None  # handed to get_schema_info as view_col_def

In [6]:
%run nb_framework_common

In [7]:
###################################################################################################
# description: check settings passed in and load source data into spark dataframe
###################################################################################################

from pyspark.sql import functions as f
import pandas as pd
import configparser
import os
from delta.tables import *
import json
from datetime import datetime

# Create the shared error list BEFORE the try block: the except handler below, the later cells and
# the final notebook exit all reference it, so it must exist even if the first statement fails.
error_msg = []

try:
    # want change data feed enabled on all delta tables and dates < that can be stored in spark to be passed through
    spark.conf.set("spark.databricks.delta.properties.defaults.enableChangeDataFeed", "true")
    spark.conf.set("spark.sql.legacy.parquet.datetimeRebaseModeInWrite", "CORRECTED")
    spark.conf.set("spark.sql.legacy.parquet.datetimeRebaseModeInRead", "CORRECTED")
    spark.conf.set("spark.sql.parquet.int96RebaseModeInWrite", "CORRECTED")
    spark.conf.set("spark.sql.parquet.int96RebaseModeInRead", "CORRECTED")

    # split comma delimited primary key config into a list of each col in primary key;
    # whitespace around a column name and empty entries (e.g. from a trailing comma) are dropped
    keys = []
    if primary_key:
        keys = [key_col.strip() for key_col in primary_key.split(",") if key_col.strip()]

    # dest_table_name is required; normalise it so it has no periods (converted to _) and no slashes (removed).
    # The emptiness check is done on dest_table_name itself because table_name below is never empty
    # once a schema prefix has been added.
    dest_table_name = (dest_table_name or "").replace(".", "_").replace("/", "")
    if not dest_table_name:
        raise Exception("dest_table_name must be passed in.")

    # set table_name with schema and table_name ... this is used in cataloging with the view creation for serverless
    if dest_schema_name:
        table_name = f"{dest_schema_name}_{dest_table_name}"
    else:
        table_name = dest_table_name

    print(dest_table_name)

    # table_path is required and has to be validated before it is used to build the delta path
    if not table_path:
        raise Exception("table_path must be passed in.")

    # create path of destination delta table in adls.
    # Plain string concatenation is used instead of os.path.join, because os.path.join discards the
    # abfss:// root whenever table_path starts with "/".
    storage_root = f"abfss://{table_container}@{storage_account}.dfs.core.windows.net"
    delta_table_path = f"{storage_root}/{table_path.lstrip('/')}"

    # do some checks to make sure mode and any needed subsequent settings are being passed in
    valid_modes = ["append", "upsert", "truncate+fill", "del+insert", "partition+overwrite", "merge"]
    if mode not in valid_modes:
        raise Exception(f"Mode value '{mode}' passed in is invalid.  Expected values are {', '.join(valid_modes)}.")

    if mode == "partition+overwrite" and not partition_by:
        raise Exception("Must pass in partition_by if using partition+overwrite.")

    if mode == "del+insert" and not del_filter:
        raise Exception("Must pass in del_filter if using del+insert.")

    if mode in ["upsert", "merge"] and not keys:
        raise Exception("Table must have a primary_key passed in for upsert or merge. If multiple columns delimit by comma.")

    print(f"init_flag is {init_flag}")

    # make sure passed in parquet file exists (in landing zone)
    if not mssparkutils.fs.exists(parquet_file_path):
        raise Exception(f"Source file path {parquet_file_path} does not exist. Cannot refresh table as there is no source file to load from.")

    # load parquet file into spark dataframe
    source_df = spark.read.load(parquet_file_path, format='parquet')

    # create temp view overtop of parquet file (queried by name in the initialization cell)
    source_df.createOrReplaceTempView(f"{table_name}_source")
    print(f"Source data rows: {source_df.count()}")

    # TO DO: if primary key check here the source_df doesn't have duplicate rows for that primary key.. throw error if it does

    # call common get_schema_info which produces the 3 variables needed in further logic... see common function for further details
    tracked_cols, partitionBy, typed_cols = get_schema_info(source_df=source_df, view_col_def=view_col_config, partition_by=partition_by)
    display(tracked_cols)
except Exception as e:
    # record the failure instead of aborting: later cells skip their work when error_msg is not empty
    # and the final cell hands the list back to the calling notebook
    print(e)
    error_msg.append( { "error": f"{dest_table_name}: {e}" } )

In [8]:
##############################################################
# description: initialize table if init_flag is set
##############################################################
try:
    # nothing is done once an earlier cell has recorded an error
    if not error_msg:
        print(f"{delta_table_path},{init_flag},{source_df.rdd.isEmpty()},{len(source_df.columns)},{dest_db_name},{table_name}")

        # initialization needs a destination path, the init flag, a table name and at least one source column
        if not (len(delta_table_path) > 0 and init_flag and len(table_name) > 0 and len(source_df.columns) > 0):
            print(f"Skipping initialization... as one of the following rules are not set delta_table_path={delta_table_path},init_flag={init_flag},source_df.rdd.isEmpty()={source_df.rdd.isEmpty()},len(empDF.columns)={len(source_df.columns)},dest_db_name={dest_db_name},table_name={table_name}")
            # an initialization was requested but one of the other rules failed -> report it as an error
            if init_flag:
                raise Exception("Was set to initialize and something was not correct... see logs for more details.")
        else:
            # call common function initialize_delta_table... per the original note it creates the adls delta table
            # and the serverless view overtop of it.
            # NOTE(review): the container is passed as dest_db_name and the db name as dest_schema_name -- confirm this mapping is intended
            initialize_delta_table(
                source_df=source_df,
                delta_table_path=delta_table_path,
                parquet_file_path=parquet_file_path,
                dest_db_name=table_container,
                dest_schema_name=dest_db_name,
                dest_view_name=table_name,
                tracked_cols=tracked_cols,
                partitionBy=partitionBy,
                typed_cols=typed_cols,
            )

            # log the max ingest_fw_load_dt found in the source temp view as 'dt_last_source_init' for this az_id,
            # but only if no error has been recorded in the meantime
            if not error_msg:
                log_init_sql = f"""
                    INSERT INTO fe_config_ingest_log(uuid, az_id, op, ts, msg)
                        select uuid() uuid, '{az_id}' az_id, 'dt_last_source_init' op, (select max(ingest_fw_load_dt) ts from {table_name}_source) ts, null msg
                """
                spark.sql(log_init_sql)
except Exception as e:
    # collect the failure so the final cell can return it to the calling notebook
    print(e)
    error_msg.append( { "error": f"{dest_table_name}: {e}" } )

In [9]:
##############################################################
# description: incrementally load table if init_flag is NOT set
##############################################################
try:
    # if no errors up until this point
    if len(error_msg)<=0:
        # Only load incrementally when this is not an initialization run and the source has data.
        # init_flag is tested first: it is a plain variable, while rdd.isEmpty() launches a Spark job,
        # so initialization runs no longer pay for a job whose result is never used.
        if not init_flag and not source_df.rdd.isEmpty():
            print(f"mode={mode},del_filter={del_filter}")

            # call common incremental_delta_table function ... which will load the data from source with which mode is in settings
            # (the parquet file path is handed over as commit_meta_data)
            (incremental_delta_table(mode=mode
                , source_df=source_df
                , delta_table_path=delta_table_path
                , commit_meta_data=parquet_file_path
                , tracked_cols=tracked_cols
                , keys=keys
                , partitionBy=partitionBy
                , del_filter=del_filter))
except Exception as e:
    # collect the failure so the final cell can return it to the calling notebook
    print(e)
    error_msg.append( { "error": f"{dest_table_name}: {e}" } )

In [10]:
# TO DO: add a separate code cell that runs optimization and vacuum, to keep the delta tables lean

# debugging aid -- uncomment to inspect the delta history of the destination table:
#display(spark.sql(f'DESCRIBE HISTORY delta.`{delta_table_path}`'))

# Serialize the collected error messages to JSON and use them as the notebook exit value,
# so they can be handled in the calling notebook (nb_ingestion_by_config).
exit_value = json.dumps(error_msg)
mssparkutils.notebook.exit(exit_value)