In [0]:
# Prerequisite: raw data for all SQL tables is extracted by ADF (or another orchestration tool) into the raw zone of the data lake.
# This notebook takes the raw snapshot data and merges it into Delta; changes are meant to be upserted using the code below.

In [0]:
def extract_process_view_table(subject:str):
  """
  Load today's raw JSON files for one view table and write them to a Delta table.

  Lists the entries under ``/dbfs/mnt/datalake_raw/batch/view_us/<subject>``,
  keeps those whose modification date (driver local time) is today, reads them
  as JSON, strips spaces from the column names, casts ``LastUpdated`` to a
  timestamp when that column is present, drops the ``ELTLoadDateTime`` and
  ``LoadDateTime`` columns, removes duplicate rows and saves the result to
  ``/mnt/datalake_premium/view_us/<subject>``.

  Currently the function overwrites existing data, including the schema
  (full load). When no file was modified today nothing is written.

  Args:
    subject: name of the sub-folder under the raw ``view_us`` directory; the
      same name is used for the target Delta folder.

  Returns:
    None.

  Raises:
    OSError: propagated from ``os.listdir`` / ``os.stat`` when the raw folder
      or one of its entries cannot be accessed.
  """
  import os
  from datetime import datetime
  from pyspark.sql.functions import col , to_timestamp

  path = '/dbfs/mnt/datalake_raw/batch/view_us/{object}'.format(object=subject)
  save_to = "/mnt/datalake_premium/view_us/{object}".format(object=subject)

  # get most recent files - (today's files)
  # "today" is captured once so a run that spans midnight stays consistent
  today = datetime.today().date()
  files_to_process = []
  for fd in os.listdir(path):
    fdpath = path + "/" + fd
    modified_date = datetime.fromtimestamp(os.stat(fdpath).st_mtime).date()
    if modified_date == today:
      # the files are listed through the /dbfs local mount but handed to Spark
      # without it; strip only the leading prefix so that an entry whose name
      # happens to start with "dbfs" is not mangled
      files_to_process.append(fdpath.replace('/dbfs', '', 1))

  if not files_to_process:
    return

  # process the data
  # HH = hour of day (0-23) and ss = seconds; the previous pattern used
  # hh (12-hour clock) and SS (fraction of second)
  df = (spark.read.format('json')
  .option('inferSchema','true')
  .option("timestampFormat","yyyy-MM-dd'T'HH:mm:ss")
  .load(files_to_process))

  # remove spaces from column names
  df = df.select([col(x).alias(x.replace(' ','')) for x in df.columns])
  # check if LastUpdated exists
  if 'LastUpdated' in df.columns:
    df = df.withColumn("LastUpdated",to_timestamp(col("LastUpdated")))

  df = df.drop('ELTLoadDateTime','LoadDateTime').dropDuplicates()
  # write to delta table
  (df
  .coalesce(1)
  .write
  .format('delta')
  .mode("overwrite")
  .option("overwriteSchema", "true")
  #.option("mergeSchema", "true")
  .save(save_to)
  )

In [0]:
import os
# What are futures?
import concurrent.futures
# Every entry directly under the raw view_us folder is treated as one subject.
path = '/dbfs/mnt/datalake_raw/batch/view_us/'
subjects_to_process = list(os.listdir(path))

# Sequential alternative, kept for reference:
# for subject in subjects_to_process:
#   extract_process_view_table(subject)

# Run the loader for the subjects on a pool of four worker threads. An
# exception raised for one subject is printed and the remaining subjects
# are still processed.
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
  pending = {}
  for subject_name in subjects_to_process:
    pending[pool.submit(extract_process_view_table, subject_name)] = subject_name

  for finished in concurrent.futures.as_completed(pending):
    failed_subject = pending[finished]
    try:
      # result() re-raises whatever the worker raised
      finished.result()
    except Exception as exc:
      print('%r generated an exception: %s' % (failed_subject, exc))

In [0]:
# Make sure every subject has a table in the ods schema that points at the
# Delta folder for that subject. A failing statement is printed together
# with its error and the loop moves on to the next subject.
ddl_template = "CREATE TABLE if not exists ods.view_us_{tbl} USING DELTA LOCATION '/mnt/datalake_premium/view_us/{tbl}'"

for subject_name in subjects_to_process:
  statement = ddl_template.format(tbl=subject_name)
  try:
    spark.sql(statement)
  except Exception as exc:
    print('%r generated an exception: %s' % (statement, exc))

In [0]:
# End the notebook run, handing the message below to dbutils.notebook.exit.
# NOTE(review): "Successfuly" is misspelled; it is left unchanged because it is
# a runtime string that a calling job may compare against - confirm before fixing.
# NOTE(review): this message is emitted even when earlier cells printed
# exceptions, since those are caught and not re-raised - confirm this is intended.
dbutils.notebook.exit("Job Completed Successfuly!")

Job Completed Successfuly!