In [1]:
# Default parameter values for this notebook.
# NOTE(review): presumably overridden by the calling notebook/pipeline at run time — confirm.

# Ingestion-framework run identifier; stamped onto every staged row as fe_fw_run_id.
run_id = ""
# Full abfss:// path of the source parquet file that is loaded and staged.
parquet_file_path = "abfss://temp@fedsaentanalytics.dfs.core.windows.net/fe_ingest_framework/vgms/SSPLCycle01D/dbo/processed/DEV_VGMS_DBO_AUDITTRACKING/20231003084136_DEV_VGMS_DBO_AUDITTRACKING_init.parquet"
# Destination schema for the staging table; the load cell falls back to "dbo" when this is empty.
dest_schema_name = "vgms"
# Destination staging table name; required (the load cell raises when it is empty).
dest_table_name = "dbo_AUDITTRACKING"

In [None]:
%run nb_framework_common

In [65]:
from pyspark.sql.functions import *
import pandas as pd
from datetime import datetime
import com.microsoft.spark.sqlanalytics
from com.microsoft.spark.sqlanalytics.Constants import Constants
import json

try:
    # Collects one {"error": ...} dict per failure; returned to the caller when the notebook exits.
    error_msg = []

    # Use CORRECTED rebase mode for parquet date/timestamp (incl. INT96) reads and writes,
    # so very old dates/timestamps are passed through without calendar rebasing.
    for rebase_key in (
        "spark.sql.legacy.parquet.datetimeRebaseModeInWrite",
        "spark.sql.legacy.parquet.datetimeRebaseModeInRead",
        "spark.sql.parquet.int96RebaseModeInWrite",
        "spark.sql.parquet.int96RebaseModeInRead",
    ):
        spark.conf.set(rebase_key, "CORRECTED")

    # A destination table name is mandatory.
    if not dest_table_name:
        raise Exception("dest_table_name must be passed in.")

    # Staging table is <schema>.<table>; default to the dbo schema when none was passed in.
    stg_schema = dest_schema_name if dest_schema_name else "dbo"
    stg_table_name = f"{stg_schema}.{dest_table_name}"

    # Without a source parquet file there is nothing to load.
    if not mssparkutils.fs.exists(parquet_file_path):
        raise Exception(f"Source file path {parquet_file_path} does not exist. Cannot refresh table as there is no source file to load from.")

    # Load the source data and add the three framework bookkeeping columns:
    #   fe_fw_run_id       - literal run_id of the ingestion framework
    #   fe_fw_dt_inserted  - current timestamp
    #   fe_fw_dt_processed - NULL timestamp; used to determine which staged rows still need processing
    source_df = (
        spark.read.format("parquet").load(parquet_file_path)
        .withColumn("fe_fw_run_id", lit(run_id))
        .withColumn("fe_fw_dt_inserted", current_timestamp())
        .withColumn("fe_fw_dt_processed", lit(None).cast("timestamp"))
    )
    print(f"Source data rows: {source_df.count()}")

except Exception as exc:
    # Record the failure instead of aborting so it can be reported back to the caller.
    print(exc)
    error_msg.append({"error": f"{dest_table_name}: {exc}"})

In [66]:

try:
    if error_msg:
        # An earlier cell already recorded a failure, so source_df / stg_table_name may be missing.
        # Skip staging rather than creating the schema in the SQL pool and then logging a
        # misleading follow-on NameError.
        print(f"Skipping staging load for {dest_table_name}: an earlier step reported an error.")
    else:
        # dw_db_name / dw_server_name are not defined in this notebook;
        # NOTE(review): presumably provided by nb_framework_common — confirm.
        # need to rename function to get_synapse_pool_conn
        cnnstr, token = get_serverless_synapse_conn( dw_db_name, dw_server_name )

        # TO DO: add delete here for runid for staging table to ensure no duplicates
        # meaning if run_id is reprocessed... delete any rows with that run_id in it before reinserting

        # create schema in dedicated sql pool if it doesnt exist
        if dest_schema_name:
            # Escape the schema name for the two places it is spliced into T-SQL: as a string
            # literal ('' for ') and as a bracketed identifier inside the EXEC string (]] for ],
            # then '' for '). For ordinary names the generated statement is unchanged.
            schema_literal = dest_schema_name.replace("'", "''")
            schema_ident = dest_schema_name.replace("]", "]]").replace("'", "''")

            # need to rename function to run_sql_pool_cmd
            run_serverless_sql(f"""
                IF (NOT EXISTS (SELECT * FROM sys.schemas WHERE name = '{schema_literal}'))
                BEGIN
                    EXEC ('CREATE SCHEMA [{schema_ident}]')
                END
            """, cnnstr, token)

        # stage parquet data into passed in staging table
        # take(1) returns an empty list for an empty DataFrame and avoids converting it to an RDD.
        if source_df.take(1):
            print(f"Writing to {dw_db_name}.{stg_table_name}")
            # NOTE(review): server and temp folder are hard-coded to one environment here while
            # dw_server_name is used for the connection above — confirm whether they should match.
            (source_df.write
                .option(Constants.SERVER, "fe-d-syn-enterpriseanalytics.sql.azuresynapse.net")
                .option(Constants.TEMP_FOLDER, "abfss://temp@fedsaentanalytics.dfs.core.windows.net/notebooks_data/")
                .mode("Append")
                .synapsesql(f"{dw_db_name}.{stg_table_name}")
            )
        else:
            print(f"No source rows to write to {dw_db_name}.{stg_table_name}")

except Exception as e:
    # Record the failure instead of aborting so it can be reported back to the caller.
    print(e)
    error_msg.append( { "error": f"{dest_table_name}: {e}" } )

In [None]:
# Hand the collected errors back to the calling notebook (nb_ingestion_by_config)
# as a JSON-encoded list; an empty list means no error was recorded.
exit_payload = json.dumps(error_msg)
mssparkutils.notebook.exit(exit_payload)