In [44]:
# ---- Notebook parameters: default values used when the notebook is run as-is ----

# SQL run with spark.sql() against the temp views registered from `refs`.
sql = "select * from devsqlfarm_ESRIVEG01D_sde_CORRIDOR_EVW"
# Storage account used when building every abfss:// path in this notebook.
storage_account = "fedsaentanalytics"
# Destination table/view name; also prefixes every collected error message.
dest_table_name = "sde_CORRIDOR_EVW"
dest_schema_name = "esriveg01d"
# Container part of the destination abfss path.
table_container = "silver"
# One of: append, upsert, truncate+fill, del+insert, partition+overwrite, merge.
mode = "truncate+fill"
# True -> the initialization cell runs and the incremental cell is skipped.
init_flag = True
# Required when mode is partition+overwrite.
partition_by = None
# Path of the destination delta table below `table_container`.
table_path = "devsqlfarm/ESRIVEG01D/sde/CORRIDOR_EVW"
# Comma-delimited list of key columns.
primary_key = "GlobalId"
# Required when mode is del+insert; empty means "no filter supplied".
del_filter = ""
# JSON array of column type overrides handed to get_schema_info().
view_col_config = '[{"col_name":"Shape", "sql_type":"varchar(max)"},{"col_name":"GlobalID", "sql_type":"varchar(60)"}]'
# JSON array with one object per object referenced by `sql`.
# NOTE: this literal used to carry "config_name" twice ("devsqlfarm", then "vegman").
# json.loads keeps only the last duplicate, so "vegman" was always the effective
# value; the shadowed entry was removed to make that explicit.
refs = '''[{"parent_id": "devsqlfarm_ESRIVEG01D_sde_CORRIDOR_EVW"
,"az_id": "devsqlfarm_silver_esriveg01d_sde_CORRIDOR_EVW"
,"config_name": "vegman"
,"last_delta_version": null
,"dt_last_load": null
,"dt_last_parent_load": null
,"parent_table_container": "bronze"
,"parent_table_path": "devsqlfarm/ESRIVEG01D/sde/CORRIDOR_EVW/"}]'''

In [46]:
%run nb_framework_common

In [None]:
####################################################################################################################
# description: takes a transform config row and generates/returns the needed abfss path for querying the delta table
#   parms:  row is json refs variable passed in containing each reference in query
####################################################################################################################
def generate_delta_abfss_path(row):
    """Build the Spark SQL ``delta.`<path>``` reference for one refs entry.

    Parameters
    ----------
    row : dict or pandas.Series
        One entry of the ``refs`` JSON. If it carries a non-blank string under
        ``parent_full_delta_path`` that path is used verbatim; otherwise the
        path is assembled from ``parent_table_container``, the notebook-level
        ``storage_account`` and ``parent_table_path``.

    Returns
    -------
    str
        The path wrapped as ``delta.`...``` so it can be embedded in a query.

    Raises
    ------
    KeyError
        If the fallback is needed and ``parent_table_container`` or
        ``parent_table_path`` is missing from ``row``.
    """
    # .get() instead of [] because the key is optional: indexing raised KeyError
    # for refs entries that do not carry it at all.
    full_path = row.get('parent_full_delta_path')

    # json_normalize fills the column with NaN (a truthy float) for entries that
    # lack the key while others have it, so only accept a real, non-blank string.
    if isinstance(full_path, str) and full_path.strip():
        return f"delta.`{full_path}`"

    return (
        f"delta.`abfss://{row['parent_table_container']}@{storage_account}"
        f".dfs.core.windows.net/{row['parent_table_path']}`"
    )

In [47]:
####################################################################################################################
# description: create temp views for all referenced az_ids in config settings table
##################################################################################################################

from pyspark.sql.functions import *
import pandas as pd
import configparser
import os
from delta.tables import *
import json
from datetime import datetime
import pyodbc
import struct

try:
    # Shared error list: the following cells only do their work while this list
    # is empty, and the last cell returns it as the notebook exit value.
    error_msg=[]
    # parse the refs json passed in and flatten it to one DataFrame row per referenced object
    js_refs = json.loads(refs)
    df_refs = pd.json_normalize(js_refs)
    # create delta azure path for each ref'd table/object
    df_refs['parent_delta_abfss'] = df_refs.apply(lambda row: generate_delta_abfss_path(row), axis=1)

    #display(df_refs)

    # create temp views for sql on every object referenced
    for index, row in df_refs.iterrows():
        # LOGIC FOR USING CDC.. NOT TESTED OR FULLY IMPLEMENTED
        # This branch runs when last_delta_version is NOT null, i.e. a previous
        # delta version was recorded for this reference; otherwise the else
        # branch below registers the table as-is.
        if not bool(row.isnull().loc['last_delta_version']):
            # if using cdf call fn_table in common (defined outside this notebook; presumably
            # registers the delta path as a temp view -- confirm in nb_framework_common).
            # Here it is asked for a {parent_id}_changes view, passing last_delta_version;
            # the {parent_id} view itself is then built below from the change columns.
            fn_table(row['parent_delta_abfss'], f"{row['parent_id']}_changes", [row['last_delta_version']])

            # aggregate changes in the change data feed output (ex. if multiple updates):
            # keep only the row with the highest _commit_version per primary_key
            agg_changes = spark.sql(f"""select * 
                            from    (
                                select *, row_number() over (partition by {primary_key} order by _commit_version desc) as _chng_rnk
                                from {row['parent_id']}_changes
                                where _change_type in ('update_postimage','insert','delete')
                                ) x
                            where _chng_rnk=1
                        """)

            # remove change cols and create new temp view
            # NOTE(review): 'delete' changes pass the filter above and _change_type is dropped
            # here, so deleted rows cannot be told apart from live rows in the resulting view.
            agg_changes=agg_changes.drop("_change_type", "_commit_version", "_commit_timestamp", "_chng_rnk")
            spark.catalog.dropTempView(row['parent_id'])
            agg_changes.printSchema()
            agg_changes.createOrReplaceTempView(row['parent_id'])
            
        else:
            # no last delta version: call fn_table in common with the full delta path and the
            # parent_id (presumably registers the temp view under that name -- confirm)
            fn_table(row['parent_delta_abfss'], row['parent_id'])
except Exception as e:
    # Errors are printed and recorded rather than re-raised, so the notebook keeps
    # running; later cells check error_msg before doing any work.
    print(e)
    error_msg.append( { "error": f"{dest_table_name}: {e}" } )

In [48]:
####################################################################################################################
# description: do checks of parameters passed in and run the sql on temp views to produce the transformed dataframe
####################################################################################################################

try:
    # Only run when no earlier cell recorded an error.
    if len(error_msg)<=0:
        # want change data feed enabled on all delta tables
        spark.conf.set("spark.databricks.delta.properties.defaults.enableChangeDataFeed", "true")
        spark.conf.set("spark.sql.legacy.parquet.datetimeRebaseModeInWrite", "CORRECTED")
        spark.conf.set("spark.sql.legacy.parquet.datetimeRebaseModeInRead", "CORRECTED")
        spark.conf.set("spark.sql.parquet.int96RebaseModeInWrite", "CORRECTED")
        spark.conf.set("spark.sql.parquet.int96RebaseModeInRead", "CORRECTED")
        dest_db_name = dest_schema_name

        # create list of primary key columns; strip blanks so "a, b" yields ["a", "b"]
        # and stray commas do not produce empty key names
        keys = []
        if primary_key:
            keys = [key.strip() for key in primary_key.split(",") if key.strip()]

        table_name = dest_table_name

        # make sure there's a table name and table path set -- checked BEFORE the path is
        # built, because os.path.join raises TypeError on a None table_path and would hide
        # the clearer message below
        if not table_name:
            raise Exception("dest_table_name must be passed in.")

        if not table_path:
            raise Exception("table_path must be passed in.")

        # set destination path of transformed delta table
        delta_table_path = os.path.join(f"abfss://{table_container}@{storage_account}.dfs.core.windows.net", table_path)

        # check mode settings; the error message is generated from the same list that is
        # enforced so the two cannot drift apart
        valid_modes = ["append", "upsert", "truncate+fill", "del+insert", "partition+overwrite", "merge"]
        if mode not in valid_modes:
            raise Exception(f"Mode value '{mode}' passed in is invalid.  Expected values are {', '.join(valid_modes)}.")

        if mode == "partition+overwrite" and not partition_by:
            raise Exception("Must pass in partition_by if using partition+overwrite.")

        # del_filter is optional and may never have been defined in this session; look it
        # up defensively so a missing value gives this message instead of a NameError
        if mode == "del+insert" and not globals().get("del_filter"):
            raise Exception("Must pass in del_filter if using del+insert.")

        # key-based modes need at least one key column ("partition+merge" was checked here
        # before, but it is not an accepted mode and could never be reached)
        if not keys and mode in ["upsert", "merge"]:
            raise Exception("Table must have a primary_key passed in for upsert or merge. If multiple columns delimit by comma.")

        print(f"init_flag is {init_flag}")
        #print(sql)
        # run the transform query against the temp views registered by the previous cell
        source_df = spark.sql(sql)
        print(f"Source data rows: {source_df.count()}")

        # based on view col config set data types for the columns used in creation of serverless sql pool view
        tracked_cols, partitionBy, typed_cols = get_schema_info(source_df=source_df, view_col_def=view_col_config, partition_by=partition_by)
        #display(tracked_cols)

        # TO DO: if primary key check here the source_df doesn't have duplicate rows for that primary key.. throw error if it does
except Exception as e:
    # record the failure instead of re-raising; later cells skip their work when error_msg is non-empty
    print(e)
    error_msg.append( { "error": f"{dest_table_name}: {e}" } )

In [49]:
####################################################################################################################
# description: do initialization logic for delta table from transformed dataframe if init_flag was set
####################################################################################################################

try:
    # Only run when no earlier cell recorded an error.
    if len(error_msg)<=0:
        # isEmpty() triggers a Spark job and is only needed for the log lines below,
        # so evaluate it once and reuse the result
        source_is_empty = source_df.rdd.isEmpty()
        print(f"{delta_table_path},{init_flag},{source_is_empty},{len(source_df.columns)},{dest_db_name},{table_name}")

        if len(delta_table_path)>0 and init_flag and len(table_name)>0 and len(source_df.columns)>0:
            # initialize_delta_table comes from outside this notebook; the argument mapping
            # (sql -> parquet_file_path, table_container -> dest_db_name,
            # dest_db_name -> dest_schema_name) is kept exactly as it was
            (initialize_delta_table(source_df=source_df
                , delta_table_path=delta_table_path
                , parquet_file_path=sql
                , dest_db_name=table_container
                , dest_schema_name=dest_db_name
                , dest_view_name=table_name
                , tracked_cols=tracked_cols
                , partitionBy=partitionBy
                , typed_cols=typed_cols))
        else:
            # this branch is also the normal path when init_flag is False
            print(f"Skipping initialization... as one of the following rules are not set delta_table_path={delta_table_path},init_flag={init_flag},source_df.rdd.isEmpty()={source_is_empty},len(source_df.columns)={len(source_df.columns)},dest_db_name={dest_db_name},table_name={table_name}")
            if init_flag:
                # initialization was requested but a precondition failed
                raise Exception("Was set to initialize and something was not correct... see logs for more details.")
except Exception as e:
    # record the failure instead of re-raising; later cells skip their work when error_msg is non-empty
    print(e)
    error_msg.append( { "error": f"{dest_table_name}: {e}" } )

In [50]:
####################################################################################################################
# description: do incremental logic for delta table from transformed dataframe if init_flag was NOT set
####################################################################################################################

try:
    # Only run when no earlier cell recorded an error.
    if len(error_msg)<=0:
        # init_flag is tested first so initialization runs never pay for the Spark job
        # behind isEmpty()
        if not init_flag and not source_df.rdd.isEmpty():
            # del_filter is optional: it may never have been defined in this session, and
            # an unset value is passed on as "" exactly as before. A supplied value is now
            # forwarded; previously "" was always sent, discarding the filter that
            # del+insert mode is required to provide.
            incr_del_filter = globals().get("del_filter") or ""
            print(f"mode={mode},del_filter={incr_del_filter}")

            # incremental_delta_table comes from outside this notebook; the sql text is
            # passed as commit_meta_data
            (incremental_delta_table(mode=mode
                , source_df=source_df
                , delta_table_path=delta_table_path
                , commit_meta_data=sql
                , tracked_cols=tracked_cols
                , keys=keys
                , partitionBy=partitionBy
                , del_filter=incr_del_filter))
except Exception as e:
    # record the failure instead of re-raising; the final cell returns error_msg to the caller
    print(e)
    error_msg.append( { "error": f"{dest_table_name}: {e}" } )

In [51]:
# TO DO: if primary key ... add code cell above that will do primary key check doing sql group by pk...
# TO DO: add logic to do optimization and vacuum in another code cell... to keep delta tables lean


#spark.sql(f"select count(*) from {dest_db_name}.{table_name}").show()
#display(spark.sql(f"DESCRIBE HISTORY delta.`abfss://silver@fedsaentanalytics.dfs.core.windows.net/memsql/data_lake/sapshr/DFKKOP/`"))
#vers = spark.conf.get("spark.databricks.delta.lastCommitVersionInSession")
#print(str(vers))
# Exit the notebook with every error collected by the cells above, serialized as a
# JSON array of {"error": "..."} objects ("[]" when no cell recorded an error).
mssparkutils.notebook.exit(json.dumps(error_msg))