In [2]:
# Notebook parameters.
# config_name: matched against the config_name column of the transform config views.
config_name="smoke_test"
# storage_account: forwarded as the "storage_account" argument of the delta-to-delta transform notebook.
storage_account="fedsaentanalytics"
# tag: optional; when non-empty it is appended as an extra "tag = ..." filter to both config queries.
tag=""
# debug: when True the transform notebooks are not run; their parameters are printed instead
#        and the per-transform success rows are not written to the log table.
debug=False

In [3]:
%run nb_framework_common

In [4]:
##################################################################################################################
# description: get transform config data from config tables to determine
#               order of operations based on config_name and tag
##################################################################################################################

import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import com.microsoft.spark.sqlanalytics
from com.microsoft.spark.sqlanalytics.Constants import Constants
from pyspark.sql.functions import col
import struct
import pyodbc
import networkx as nx 
import json
import textwrap

try:
    # error_msg collects every error hit in this notebook; the last cell returns it to the
    # caller as the notebook exit value, so it is created before anything that can fail
    error_msg = []

    # query for parent child relationship between config objects
    config_query_refs = f"""SELECT *
         FROM vw_fe_config_transform_ref
         where config_name='{config_name}' and is_disabled=0
    """

    # query for config of config_name being passed in; the two derived flags tell run_load
    # which kind(s) of transform to run for each row
    config_query = f"""select *
                        , case when dest_delta_table_path is not null then 1 else 0 end delta_transform_flag
                        , case when dest_dw_db is not null then 1 else 0 end dw_transform_flag
                    from vw_fe_config_transform t
                    where config_name='{config_name}' and is_disabled=0

                """

    # if tag passed in then add it to both as an additional filter
    if tag:
        config_query+=f" and tag='{tag}'"
        config_query_refs+=f" and tag='{tag}'"

    # run parent child query and store as pandas df
    # NOTE(review): if this query returns no rows the createDataFrame calls below presumably
    # cannot infer a schema -- confirm how a config without any refs is expected to behave
    df_dag = spark.sql(config_query_refs).toPandas()

    # generate directed graph with one edge per (parent_id -> az_id) row
    G = nx.from_pandas_edgelist(df_dag,
                                source='parent_id',
                                target='az_id',
                                create_using=nx.DiGraph())

    # generations of the dependency graph: generation 0 holds the nodes without any parent,
    # every later generation holds nodes whose parents all sit in earlier generations
    # (raises, and is reported through error_msg, when the refs contain a cycle)
    groupings = list(nx.topological_generations(G))

    # loop through the groupings and create column called step giving ordinal value
    # only rows whose az_id is in the generation are stamped, so a node that never appears
    # as an az_id in the refs (a pure parent) gets no step here
    i=0
    for group in groupings:
        for step in group:
            df_dag.loc[df_dag['az_id'] == step, 'step'] = i
        i+=1

    # create needed refs json column that will be passed later to next notebook and create temp view
    df_refs = spark.createDataFrame((df_dag.groupby('az_id').apply(lambda x: x.to_json(orient='records'))).reset_index(name='refs'))
    df_refs.createOrReplaceTempView("refs")

    # convert dag steps pandas df back to spark df and create temp view
    df_spark_dag = spark.createDataFrame(df_dag)
    df_spark_dag.createOrReplaceTempView("dag")

    # get config data and create temp view
    df_transformations = spark.sql(config_query)
    df_transformations.createOrReplaceTempView("transformations")

    # combine the 3 dataframes above into final view
    # a transformation without any refs row has no step in "dag"; it has no known parent,
    # so it defaults to generation 0 and is ranked ahead of every transformation that has
    # parents (defaulting to 1 would tie it with its own generation-1 children)
    df = spark.sql("""
        select dense_rank() over (order by coalesce(d.step,0)) step, t.*, r.refs
        from transformations t
            left join refs r on r.az_id=t.az_id
            left join (select az_id, max(step) step from dag group by az_id) d on d.az_id=t.az_id
    """)
    df.createOrReplaceTempView("final")

    # Show contents of the dataframe
    display(df)
except Exception as e:
    # record the failure instead of raising so the following cells can skip the load and
    # the final cell can return the message to the caller
    print(e)
    error_msg.append( { "error": f"{config_name}: {e}" } )


In [82]:
##################################################################################################################
# description: runs the transform(s) configured for a single config row
#           calls run_delta_to_delta when delta_transform_flag is 1 and run_dw_to_dw_proc when
#           dw_transform_flag is 1 (both flags are derived in the config query above)
#           any exception is printed and appended to error_msg instead of being re-raised
# parms:
#           df_row = pandas dataframe row containing "final" config
# returns:  delta return value, dw return value and az_id of the row
#           a return value stays [] when its transform was not run or raised
##################################################################################################################
def run_load(df_row):
    delta_result = []
    dw_result = []

    # TO DO (both transforms): add a watermark column to the log that is equivalent to
    # max(ingest_fw_load_dt) of the table, so the last source pull can be compared against it
    # and the load skipped when the source tables have not changed
    try:
        if df_row['delta_transform_flag'] == 1:
            delta_result = run_delta_to_delta(df_row)

        if df_row['dw_transform_flag'] == 1:
            dw_result = run_dw_to_dw_proc(df_row)
    except Exception as exc:
        print(exc)
        error_msg.append({"error": f"{df_row['az_id']}: {exc}"})

    return delta_result, dw_result, df_row['az_id']

##################################################################################################################
# description: calls notebook to run dedicated sql / dw tsql steps for one config row
#           (when debug is set the notebook is not run and its parameters are printed instead)
#           on success a row is inserted into fe_config_transform_log unless debug is set
# parms:
#           df_row = pandas dataframe row containing "final" config
# returns:  the return value of the notebook (empty, because a non-empty value raises)
# raises:   Exception when the notebook returns a non-empty value
##################################################################################################################
def run_dw_to_dw_proc(df_row):
    # a null dt_dw_last_transform_init selects the init sql
    init_flag = bool(df_row.isnull().loc['dt_dw_last_transform_init'])
    az_id = df_row['az_id']
    print(f"DW {az_id}: Running dw to dw for {az_id}")
    print(f"DW {az_id}: Will run init sql?? {init_flag}")

    # the incremental sql is preferred for non-init runs, with the init sql as fallback
    if init_flag:
        transform_sql = df_row['sql_transform_init']
    else:
        transform_sql = df_row['sql_transform_inc'] or df_row['sql_transform_init']

    rv = []
    if debug:
        print(textwrap.dedent(f'''
        Params for /fe_transform_framework/nb_dw_proc_transform

        sql = "{str(transform_sql or '')}"
        dest_dw_db = "{str(df_row['dest_dw_db'] or '')}"
        dest_dw_endpoint = "{str(df_row['dest_dw_endpoint'] or '')}"
        '''))
    else:
        notebook_args = {
            "sql": transform_sql,
            "dest_dw_db": df_row['dest_dw_db'],
            "dest_dw_endpoint": df_row['dest_dw_endpoint'],
        }
        exit_value = mssparkutils.notebook.run(
            path="/fe_transform_framework/nb_dw_proc_transform",
            timeout_seconds=1200,
            arguments=notebook_args,
        )
        rv = json.loads(exit_value)

    # check return value
    print(rv)
    if len(rv) > 0:
        raise Exception(f"DW {az_id}: Return value of {rv}... error detected. Will fail. Check logs.")

    if not debug:
        # the op written to the log names which kind of run (init / incremental) completed
        log_op = 'dt_dw_last_transform_init' if init_flag else 'dt_dw_last_transform_inc'
        spark.sql(f"""
                        INSERT INTO fe_config_transform_log(uuid, az_id, op, ts, msg)
                            select uuid() uuid, '{az_id}', '{log_op}' op, current_timestamp() ts, null msg
                    """)

    print(f"DW {az_id}: Completed notebook for {az_id}... exit value is SUCCESS")
    return rv

##################################################################################################################
# description: calls notebook to run delta spark sql steps for one config row
#           (when debug is set the notebook is not run and its parameters are printed instead)
#           on success a row is inserted into fe_config_transform_log unless debug is set
# parms:
#           df_row = pandas dataframe row containing "final" config
# returns:  the return value of the notebook (empty, because a non-empty value raises)
# raises:   Exception when the notebook returns a non-empty value
##################################################################################################################
def run_delta_to_delta(df_row):
    # a null dt_last_transform_init selects the init sql and is passed on as init_flag
    init_flag = bool(df_row.isnull().loc['dt_last_transform_init'])
    az_id = df_row['az_id']
    print(f"Delta {az_id}: Running delta to delta for {az_id}")
    print(f"Delta {az_id}: Will init table?? {init_flag}")

    # the incremental sql is preferred for non-init runs, with the init sql as fallback
    if init_flag:
        transform_sql = df_row['sql_transform_init']
    else:
        transform_sql = df_row['sql_transform_inc'] or df_row['sql_transform_init']

    rv = []
    if debug:
        print(textwrap.dedent(f'''
        Params for /fe_transform_framework/nb_delta_to_delta_transform

        mode = "{str(df_row['mode'] or '')}"
        del_filter = ""
        sql = "{str(transform_sql or '')}"
        dest_schema_name = "{str(df_row['dest_delta_schema_name'] or '')}"
        dest_table_name = "{str(df_row['dest_delta_table_name'] or '')}"
        table_container = "{str(df_row['dest_delta_container'] or '')}"
        table_path = "{str(df_row['dest_delta_table_path'] or '')}"
        primary_key = "{str(df_row['dest_delta_table_primary_key'] or '')}"
        partition_by = "{str(df_row['dest_delta_table_partition_by'] or '')}"
        init_flag = {str(init_flag)}
        view_col_config = "{str(df_row['src_col_def'] or '')}"
        storage_account = "{str(storage_account or '')}"
        refs = '{str(df_row['refs'] or '')}'
        '''))
    else:
        notebook_args = {
            "mode": df_row['mode'],
            "del_filter": "",
            "sql": transform_sql,
            "dest_schema_name": df_row['dest_delta_schema_name'],
            "dest_table_name": df_row['dest_delta_table_name'],
            "table_container": df_row['dest_delta_container'],
            "table_path": df_row['dest_delta_table_path'],
            "primary_key": df_row['dest_delta_table_primary_key'],
            "partition_by": df_row['dest_delta_table_partition_by'],
            "init_flag": init_flag,
            "view_col_config": df_row['src_col_def'],
            "storage_account": storage_account,
            "refs": df_row['refs'],
        }
        exit_value = mssparkutils.notebook.run(
            path="/fe_transform_framework/nb_delta_to_delta_transform",
            timeout_seconds=1200,
            arguments=notebook_args,
        )
        rv = json.loads(exit_value)

    # check return value
    print(rv)
    if len(rv) > 0:
        raise Exception(f"Delta {az_id}: Return value of {rv}... error detected. Will fail. Check logs.")

    if not debug:
        # the op written to the log names which kind of run (init / incremental) completed
        log_op = 'dt_last_transform_init' if init_flag else 'dt_last_transform_inc'
        spark.sql(f"""
                        INSERT INTO fe_config_transform_log(uuid, az_id, op, ts, msg)
                            select uuid() uuid, '{az_id}', '{log_op}' op, current_timestamp() ts, null msg
                    """)

    print(f"Delta {az_id}: Completed notebook for {az_id}... exit value is SUCCESS")
    return rv

In [83]:
try:
    # only run the load when building the config above recorded no error
    if len(error_msg)<=0:
        # return values of the most recently collected config item, one per type of transform
        delta_return_val = []
        dw_return_val = []

        # convert final config with steps to pandas df
        dfp = df.toPandas()

        # highest step number; 0 (nothing to run) when the config has no rows
        max_steps = int(dfp['step'].max()) if len(dfp)>0 else 0

        # loop through each possible step
        for step in range(1,max_steps+1):

            # an error recorded by an earlier step stops all remaining steps
            if len(error_msg)>0:
                raise Exception(f"Error... Exiting remaining steps.")

            print(f"Running {config_name} load... step {step} of {max_steps}")

            # for current step get the batch of config items
            df_batch = dfp.loc[dfp['step'] == step]

            # the items of a step are submitted to a thread pool; max_workers=1 means they
            # currently still run one at a time
            with ThreadPoolExecutor(max_workers=1) as executor:
                futures = [executor.submit(run_load, row) for _, row in df_batch.iterrows()]

                # collect the result of every submitted item
                for future in futures:
                    delta_return_val, dw_return_val, az_id = future.result()

                    # check return value of the delta and then the dw transform and append
                    # error_msg if one was encountered so next loop/step will not continue
                    for return_val in (delta_return_val, dw_return_val):
                        if len(return_val)>0:
                            print(f"{az_id}: Error returned for az_id {az_id}... ({return_val})")
                            # the message is embedded in a single-quoted SQL literal, so
                            # backslashes and quotes are escaped to keep the statement valid
                            msg_literal = str(return_val).replace("\\", "\\\\").replace("'", "\\'")
                            spark.sql(f"""
                                INSERT INTO fe_config_transform_log(uuid, az_id, op, ts, msg)
                                    select uuid() uuid, '{az_id}', 'error' op, current_timestamp() ts, '{msg_literal}' msg
                                """)
                            error_msg.append( { "error": f"{az_id}: {return_val}" } )

        # errors recorded by the final step do not raise above; they are returned by the
        # next cell, so do not report the load as complete in that case
        if len(error_msg)>0:
            print(f"Load {config_name} finished with errors...")
        else:
            print(f"Load {config_name} complete...")
except Exception as e:
    print(e)
    raise Exception(f"Error detected. Will fail. Check logs.") from e

In [84]:
# hand any collected errors back to the caller as the notebook exit value
if error_msg:
    mssparkutils.notebook.exit(error_msg)
else:
    print("SUCCESS")