In [25]:
# Notebook parameters. NOTE(review): these look like defaults that the calling pipeline overrides - confirm.
# config_name selects the rows of vw_fe_config_ingest to load; tag, when non-empty, narrows that selection further.
config_name="smoke_test"
tag=""
# container and root folder used to build the abfss:// temp paths this notebook reads from
temp_container = "temp"
temp_root = "fe_ingest_framework"
# folder name under temp_root holding this run's *_parts_SUCCESS.json files and data/<az_id>/ files
run_id = "a95f660f-bf19-4037-9186-194838cb7633"
# when True the load functions print the child-notebook parameters instead of running them and skip the log-table inserts
debug = False

In [26]:
%run nb_framework_common

In [27]:
##############################################
# description: get config settings
##############################################

import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import com.microsoft.spark.sqlanalytics
from com.microsoft.spark.sqlanalytics.Constants import Constants
from pyspark.sql.functions import col,lit
import struct
import pyodbc
import json
import uuid
import textwrap

# notebook-wide list of {"error": ...} dicts; later cells append to it and the
# last cell fails the run when it is not empty
error_msg = []

try:
    # select every enabled row of the requested config from the delta table view
    config_query = f"""SELECT *
         FROM vw_fe_config_ingest
         where config_name='{config_name}' and is_disabled=0
    """

    # narrow the selection by tag only when a tag was passed in
    tag_filter = f" and tag='{tag}'" if tag else ""
    config_query = config_query + tag_filter

    print(config_query)

    # run the query, show the result and expose it to SQL as tmp_config
    df = spark.sql(config_query)
    display(df)
    df.createOrReplaceTempView("tmp_config")
except Exception as config_error:
    # record the failure instead of stopping here; the last cell decides whether the run fails
    print(config_error)
    error_msg.append({"error": f"{config_name}: {config_error}"})


In [28]:
##############################################
# description: update delta config parts table
##############################################

# glob over the *_parts_SUCCESS.json files found in this run's temp folder
run_id_folder_path = f"abfss://{temp_container}@{storage_account}.dfs.core.windows.net/{temp_root}/{run_id}/*_parts_SUCCESS.json"

# read them as multi-line json and expose the rows to SQL as tmp_parts_df
parts_reader = spark.read.option("multiline","true")
parts_df = parts_reader.json(run_id_folder_path)
parts_df.createOrReplaceTempView("tmp_parts_df")

# outside debug mode, register parts that are not in fe_config_ingest_part yet and
# write their 'dt_part_created' / 'dt_part_last_seen' rows to the part log
# NOTE(review): the not-exists checks compare part_filter with '='; if part_filter
# can be NULL those rows never match and would be inserted again - confirm
if not debug:
    new_parts_sql = """
    INSERT INTO fe_config_ingest_part (uuid, az_id, part_filter, is_disabled)
        select part_uuid, az_id, part_filter, 0 is_disabled from tmp_parts_df s where not exists (select * from fe_config_ingest_part t where t.az_id = s.az_id and t.part_filter = s.part_filter)
    """

    parts_log_sql = """
        INSERT INTO fe_config_ingest_part_log(uuid, az_id, part_filter, op, ts, msg)
            select uuid() uuid, az_id, part_filter, 'dt_part_created' op, current_timestamp() ts, null msg from tmp_parts_df s where not exists (select * from fe_config_ingest_part_log t where t.az_id = s.az_id and t.part_filter = s.part_filter)
                union all 
            select uuid() uuid, az_id, part_filter, 'dt_part_last_seen' op, load_dt ts, null msg  from tmp_parts_df s where not exists (select * from fe_config_ingest_part_log t where t.az_id = s.az_id and t.part_filter = s.part_filter and t.op='dt_part_last_seen' and t.ts=s.load_dt)
    """

    # same order as before: parts first, then their log rows
    for statement in (new_parts_sql, parts_log_sql):
        spark.sql(statement)

In [29]:
################################################
# description: create misc functions to do load of the config
################################################

# main function that is called in threading loop
def run_load(df_row, storage_account, dw_server_name, dw_db_name):
    """Process the unprocessed files of one config row.

    Looks in <temp_root>/<run_id>/data/<az_id>/ of the temp container, lists the
    files there with ls_files_to_data_frame (per the original note, defined in
    nb_framework_common), sorts them by name and hands them to process_files.

    Reads the notebook-level names temp_container, temp_root, run_id,
    mssparkutils and error_msg.

    Args:
        df_row: config row, indexed by column name (reads 'az_id').
        storage_account: storage account used to build the folder path.
        dw_server_name: passed through to process_files.
        dw_db_name: passed through to process_files.

    Returns:
        (load_rv, az_id). load_rv is whatever process_files returned, or an
        empty list when the folder is missing, holds no files, or an exception
        was caught (the exception text is then appended to error_msg).
    """
    print(f"{df_row['az_id']}: Running load for unprocessed files of {df_row['az_id']}")

    # stays empty unless process_files hands something back
    load_rv = []

    try:
        unprocessed_folder_path = f"abfss://{temp_container}@{storage_account}.dfs.core.windows.net/{temp_root}/{run_id}/data/{df_row['az_id']}/"

        # nothing landed for this az_id: done
        if not mssparkutils.fs.exists(unprocessed_folder_path):
            print(f"{df_row['az_id']}: No folder found {unprocessed_folder_path}")
            return [], df_row['az_id']

        pending_files = ls_files_to_data_frame(unprocessed_folder_path)

        # folder is there but empty: done
        if len(pending_files) <= 0:
            print(f"{df_row['az_id']}: No files found {unprocessed_folder_path}")
            return [], df_row['az_id']

        # file names start with a timestamp, so ordering by name gives the processing order
        pending_files.sort_values(by='name', inplace=True)
        load_rv = process_files(df_row, pending_files, storage_account, dw_server_name, dw_db_name)
    except Exception as load_error:
        print(load_error)
        error_msg.append({"error": f"{df_row['az_id']}: {load_error}"})

    return load_rv, df_row['az_id']

# function that processes each file found for config row
def process_files(df_row, df_files_to_process, storage_account, dw_server_name, dw_db_name):
    """Run the Delta and DW loads for one config row and archive finished files.

    Calls run_parquet_to_delta and run_parquet_to_dw with the same file list,
    then moves every file that is completely done from its ".../data/..." path
    to the matching ".../data_processed/..." path.

    A file counts as completely done only when every destination that took
    part in this run reported it with good=True. A destination takes part when
    it reported at least one file outcome; a destination that is not configured
    reports none. This keeps a file in place when one destination stopped
    early on an error and therefore never got to it, even if the other
    destination loaded it.

    Reads the notebook-level names debug and mssparkutils. In debug mode
    nothing was loaded, so the moves are only printed, not executed.

    Args:
        df_row: config row, passed through to both load functions.
        df_files_to_process: files to load, passed through to both load functions.
        storage_account: passed to run_parquet_to_delta.
        dw_server_name: passed to run_parquet_to_dw.
        dw_db_name: passed to run_parquet_to_dw.

    Returns:
        The Delta error list followed by the DW error list (empty when both succeeded).
    """
    # call functions that do work in either spark or the dedicated sql pool
    delta_rv, delta_files = run_parquet_to_delta(df_row, df_files_to_process, storage_account)
    sql_rv, sql_files = run_parquet_to_dw(df_row, df_files_to_process, dw_server_name, dw_db_name)

    # number of destinations that reported anything; 0 when neither is configured
    required_outcomes = sum(1 for reported in (delta_files, sql_files) if reported)

    # collect the good/bad flags per file path, keeping first-seen order
    outcomes_by_path = {}
    for outcome in delta_files + sql_files:
        outcomes_by_path.setdefault(outcome['path'], []).append(bool(outcome['good']))

    # done = reported by every participating destination, and good in all of them
    fully_processed = [
        path for path, flags in outcomes_by_path.items()
        if len(flags) == required_outcomes and all(flags)
    ]

    # move each finished parquet file out of data into data_processed within the same parent folder
    for path in fully_processed:
        processed_path = path.replace("/data/","/data_processed/")

        if debug:
            # dry run: the file has not been loaded anywhere, so leave it where it is
            print(f"DEBUG: would move {path} to {processed_path}")
            continue

        print(f"Fully processed {path}")
        mssparkutils.fs.mv(path, processed_path, True)

    return delta_rv+sql_rv

# calls nb_parquet_to_delta
def run_parquet_to_delta(df_row, df_files_to_process, storage_account):
    """Load the given parquet files of one config row through nb_parquet_to_delta.

    Runs /fe_ingestion_framework/nb_parquet_to_delta once per file, in the
    order the files are given, and stops at the first file whose run reports
    an error. After each successful file (outside debug mode) it writes a
    'dt_last_delta_init' or 'dt_last_delta_inc' row to fe_config_ingest_log and
    a 'dt_part_last_load' row to fe_config_ingest_part_log.

    The first successful file is an init load when df_row has no
    dt_last_delta_init value; every later file in the same call is incremental.
    In debug mode the notebook parameters are printed instead of run, and the
    printed init_flag follows the same init-then-incremental sequence.

    Reads the notebook-level names debug, mssparkutils and spark.

    Args:
        df_row: config row; read with df_row['col'] and df_row.isnull().loc['col'].
        df_files_to_process: object whose iterrows() yields (index, row) pairs;
            each row provides 'name' and 'path'. The last 40 characters of
            'name' must be a uuid followed by '.parquet'.
        storage_account: default storage account; replaced by
            df_row['dest_delta_storage_account'] when that is set.

    Returns:
        (rv, delta_files_processed). rv is the error list of the last file
        attempted (empty on success, or when Delta is not configured).
        delta_files_processed holds one {"path": ..., "good": bool} dict per
        file attempted.

    Raises:
        ValueError: when a file name does not end in a parsable uuid.
    """
    # initialize return list of files that had been processed
    delta_files_processed = []

    # init logic applies while there is no dt_last_delta_init value for this config row
    init_flag = (bool(df_row.isnull().loc['dt_last_delta_init']))

    # initialize return value of any errors
    rv = []

    # only run delta logic if dest_delta_table_container has been populated in the config settings
    if not df_row['dest_delta_table_container']:
        print(f"DELTA {df_row['az_id']}: skipping Delta import as no dest_delta_table_container for config.") 
        return(rv,delta_files_processed)

    # settings that are the same for every file of this config row
    db = df_row['src_db_name']
    schema = df_row['src_schema_name']
    table = df_row['src_table_name']

    # overwrite workspace storage account with what's in config settings if its in config
    if df_row['dest_delta_storage_account']:
        storage_account = df_row['dest_delta_storage_account']

    # for each file to process for config in this run
    for i, file_row in df_files_to_process.iterrows():
        # TO DO: check file size or get row count of file... if 0 then skip processing as it's empty

        print(f"DELTA {df_row['az_id']}: Will init table?? {init_flag}")

        rv = []
        print(f"DELTA {df_row['az_id']}: Running Delta notebook for {file_row['path']}")

        # get the partition uuid off of the filename of the pull that occured in pipeline
        # needed so later we can update the config_log timestamps
        part_uuid = uuid.UUID(file_row['name'][-40:].replace('.parquet',''))

        # if in debug mode just print out what you'd run (one line per notebook argument)
        if debug:
            print(textwrap.dedent(f'''
            Params for /fe_ingestion_framework/nb_parquet_to_delta

            mode = "{str(df_row['mode'] or '')}"
            del_filter = "{str(df_row['inc_del_filter'] or '')}"
            parquet_file_path = "{str(file_row['path'] or '')}"
            dest_db_name = "{str(db or '')}"
            dest_schema_name = "{str(schema or '')}"
            dest_table_name = "{str(table or '')}"
            table_container = "{str(df_row['dest_delta_table_container'] or '')}"
            table_path = "{str(df_row['dest_delta_table_path'] or '')}"
            primary_key = "{str(df_row['primary_key'] or '')}"
            partition_by = "{str(df_row['partition_by'] or '')}"
            init_flag = {str(init_flag)}
            view_col_config = "{str(df_row['src_col_def'] or '')}"
            storage_account = "{str(storage_account or '')}"
            az_id = '{str(df_row['az_id'] or '')}'
            '''))
        else:
            # run notebook with following arguments defined for parameters
            args = {"mode" : df_row['mode'],
                    "del_filter" : df_row['inc_del_filter'],
                    "parquet_file_path" : file_row['path'],
                    "dest_db_name" : db,
                    "dest_schema_name" : schema,
                    "dest_table_name" : table,
                    "table_container" : df_row['dest_delta_table_container'],
                    "table_path" : df_row['dest_delta_table_path'],
                    "primary_key" : df_row['primary_key'],
                    "partition_by" : df_row['partition_by'],
                    "init_flag" : init_flag,
                    "view_col_config" : df_row['src_col_def'],
                    "storage_account" : storage_account,
                    "az_id" : df_row['az_id']
            }

            # the notebook's exit value is parsed as json; a failed run or unparsable
            # exit value becomes a single error entry
            try:
                rv = json.loads(mssparkutils.notebook.run(path="/fe_ingestion_framework/nb_parquet_to_delta", timeout_seconds=1200, arguments=args))
            except Exception as e:
                rv = [{ "error": f"Delta {df_row['az_id']}: {e}" }]

        # if return value is bad (> 0 length) then set the file as good=False so it stays in the data folder, and abort
        if len(rv)>0:
            delta_files_processed.append({"path" : file_row['path'], "good" : False})
            return(rv,delta_files_processed)

        # return value good: set file as good=True so it can be moved out of the folder
        delta_files_processed.append({"path" : file_row['path'], "good" : True})

        # if not in debug then set all timestamp columns in log tables to set watermarks
        if not debug:
            print(f"DELTA {df_row['az_id']}: Completed Delta notebook for {file_row['path']}, exit value is {rv}")

            log_op = 'dt_last_delta_init' if init_flag else 'dt_last_delta_inc'
            spark.sql(f"""
                INSERT INTO fe_config_ingest_log(uuid, az_id, op, ts, msg)
                    select uuid() uuid, '{df_row['az_id']}', '{log_op}' op, current_timestamp() ts, null msg
            """)

            spark.sql(f"""
                INSERT INTO fe_config_ingest_part_log(uuid, az_id, part_filter, op, ts, msg)
                    select uuid() uuid, az_id, part_filter, 'dt_part_last_load' op, current_timestamp() ts, null msg from tmp_parts_df where part_uuid = '{str(part_uuid)}'
            """)

        # only the first successful file of a call is an init load; this also holds
        # for the dry run so its printout shows the sequence a real run would use
        init_flag = False

    return(rv,delta_files_processed)

# function to move file data to dedicated sql pool (dw) by calling notebook nb_parquet_to_dw
def run_parquet_to_dw(df_row, df_files_to_process, dw_server_name, dw_db_name):
    """Load the given parquet files of one config row through nb_parquet_to_dw.

    Runs /fe_ingestion_framework/nb_parquet_to_dw once per file, in the order
    the files are given, and stops at the first file whose run reports an
    error. After each successful file (outside debug mode) it writes a
    'dt_last_dw_load' row to fe_config_ingest_log and a 'dt_part_dw_last_load'
    row to fe_config_ingest_part_log. In debug mode the notebook parameters are
    printed instead of run.

    Reads the notebook-level names debug, run_id, storage_account (not a
    parameter of this function), mssparkutils and spark.

    Args:
        df_row: config row, indexed by column name.
        df_files_to_process: object whose iterrows() yields (index, row) pairs;
            each row provides 'name' and 'path'. The last 40 characters of
            'name' must be a uuid followed by '.parquet'.
        dw_server_name: default server; replaced by df_row['dw_server_name'] when set.
        dw_db_name: default database; replaced by df_row['dw_db_name'] when set.

    Returns:
        (rv, sql_files_processed). rv is the error list of the last file
        attempted (empty on success, or when no dw_table_name is configured).
        sql_files_processed holds one {"path": ..., "good": bool} dict per
        file attempted.

    Raises:
        ValueError: when a file name does not end in a parsable uuid.
    """
    # initialize return list of files that had been processed
    sql_files_processed = []
    # initialize return value of error msgs
    rv = []

    # only run the dw load if dw_table_name is populated in the settings
    if not df_row['dw_table_name']:
        print(f"DW {df_row['az_id']}: skipping DW import as no dw_table_name for config.")   
        return(rv,sql_files_processed)

    # settings that are the same for every file of this config row
    schema = df_row['dw_schema_name']
    table = df_row['dw_table_name']

    # if server name in settings overwrite the default that was passed in
    if df_row['dw_server_name']:
        dw_server_name = df_row['dw_server_name']

    # if db name in settings overwrite the default that was passed in
    if df_row['dw_db_name']:
        dw_db_name = df_row['dw_db_name']

    # for each file to process
    for i, file_row in df_files_to_process.iterrows():
        # TO DO: check file size or get row count of file... if 0 then skip processing as it's empty
        rv = []
        print(f"DW {df_row['az_id']}: Running SQL notebook for {file_row['path']}")

        # get the partition uuid off of the filename of the pull that occured in pipeline
        # needed so later we can update the config_log timestamps
        part_uuid = uuid.UUID(file_row['name'][-40:].replace('.parquet',''))

        # if in debug mode only print out what you would have ran (one line per notebook argument)
        if debug:
            print(textwrap.dedent(f'''
            Params for /fe_ingestion_framework/nb_parquet_to_dw

            parquet_file_path = "{str(file_row['path'] or '')}"
            dw_server_name = "{str(dw_server_name or '')}"
            dw_db_name = "{str(dw_db_name or '')}"
            dest_schema_name = "{str(schema or '')}"
            dest_table_name = "{str(table or '')}"
            primary_key = "{str(df_row['primary_key'] or '')}"
            storage_account = "{str(storage_account or '')}"
            run_id = '{str(run_id)}'
            '''))
        else:
            # run notebook nb_parquet_to_dw with following arguments set
            args = {"parquet_file_path" : file_row['path'],
                    "dest_schema_name" : schema,
                    "dest_table_name" : table,
                    "primary_key" : df_row['primary_key'],
                    "storage_account" : storage_account,
                    "run_id" : run_id,
                    "dw_server_name": dw_server_name,
                    "dw_db_name": dw_db_name
            }

            # NOTE(review): no timeout_seconds is passed here, while the Delta load passes 1200;
            # whatever default mssparkutils applies is in effect - confirm it is long enough
            try:
                rv = json.loads(mssparkutils.notebook.run(path="/fe_ingestion_framework/nb_parquet_to_dw", arguments=args))
            except Exception as e:
                rv = [{ "error": f"DW {df_row['az_id']}: {e}" }]

        # if return value has any error messages then set file to good=False so it stays in the data folder, and abort
        if len(rv)>0:
            sql_files_processed.append({"path" : file_row['path'], "good" : False})
            return(rv,sql_files_processed)

        # no errors: mark file as good=True so the parquet file can be moved
        sql_files_processed.append({"path" : file_row['path'], "good" : True})

        # if not in debug mode then insert needed watermark timestamps in the log tables
        if not debug:
            print(f"DW {df_row['az_id']}: Completed SQL notebook for {file_row['path']}, exit value is {rv}")

            spark.sql(f"""
                    INSERT INTO fe_config_ingest_log(uuid, az_id, op, ts, msg)
                        select uuid() uuid, '{df_row['az_id']}', 'dt_last_dw_load' op, current_timestamp() ts, null msg
                """)

            spark.sql(f"""
                INSERT INTO fe_config_ingest_part_log(uuid, az_id, part_filter, op, ts, msg)
                    select uuid() uuid, az_id, part_filter, 'dt_part_dw_last_load' op, current_timestamp() ts, null msg from tmp_parts_df where part_uuid = '{str(part_uuid)}'
            """)

    return(rv,sql_files_processed)

In [30]:
################################################
# description: set up threads and then run loads
################################################

try:
    # only start loading when no earlier cell recorded an error
    if len(error_msg)<=0:

        # use ThreadPoolExecutor to parallelize the workload... max_workers is the number of
        # threads running at any point on the driver node of the spark session
        with ThreadPoolExecutor(max_workers=1) as executor:
            threads = []
            print(f"Running {config_name} load...")

            # queue one run_load call per config row
            for i, row in df.toPandas().iterrows():
                threads.append(executor.submit(run_load, row, storage_account, dw_server_name, dw_db_name))

            # collect the results in submission order
            for thread in threads:
                # blocks until this config row's run_load call has finished
                return_val, az_id = thread.result()

                print(f"{az_id}: delta_return_val={return_val},az_id={az_id}")

                # a non-empty return value is a list of error entries
                if len(return_val)>0:
                    # append any error message to notebook global error_msg list
                    error_msg.append( { "error": f"{az_id}: {return_val}" } )

                    # drop every queued load that has not started yet; without this the
                    # executor would still run all of them while the with-block exits
                    for pending in threads:
                        pending.cancel()

                    # if not in debug, write the error text and current timestamp to the log table
                    if not debug:
                        # hand the text over as data through a temp view, so quotes inside the
                        # message cannot break the SQL statement
                        error_rows = [(str(az_id), json.dumps(return_val, default=str))]
                        spark.createDataFrame(error_rows, ["az_id", "msg"]).createOrReplaceTempView("tmp_ingest_error")
                        spark.sql("""
                            INSERT INTO fe_config_ingest_log(uuid, az_id, op, ts, msg)
                                select uuid() uuid, az_id, 'error' op, current_timestamp() ts, msg from tmp_ingest_error
                        """)

                    # bubble up error so as to not continue any processing if error has occured up to this point
                    raise Exception(f"{az_id}: Return value ({return_val}) is greater than 0... error detect. Will fail. Check logs.")

        print(f"Load {config_name} complete...")
except Exception as e:
    # recorded here; the last cell fails the notebook when error_msg is not empty
    print(e)
    error_msg.append( { "error": f"{e}" } )

In [31]:
################################################
# description: check for any errors and fail if need be
################################################
# always show what was collected, even when it is empty
print(error_msg)

# an empty list means no cell recorded an error; anything else fails the notebook
if not error_msg:
    print("SUCCESS")
else:
    raise Exception(error_msg)