In [None]:
# Standard library
import json
import os
import re

# Third-party
import pyspark
from pyspark.sql import SparkSession
# Fixed module name: the original `pyspark.sql.fumctions` does not exist and
# made this cell fail on a fresh kernel.
from pyspark.sql.functions import current_timestamp

# Project-local Delta write helpers
from delta_writer import add_key_hash, delta_overwrite, delta_upsert

### Set up Spark session

In [None]:
# Azure storage connector jars (Maven coordinates) requested via spark.jars.packages.
azure_packages = "org.apache.hadoop:hadoop-azure:3.3.0,com.microsoft.azure:azure-storage:8.6.6"

# Build (or reuse) the session step by step instead of one backslash-continued chain.
session_builder = SparkSession.builder.appName("MyDockerSparkApp")
session_builder = session_builder.config("spark.jars.packages", azure_packages)
spark = session_builder.getOrCreate()

# Debug via docker: register the Azure filesystem class and report runtime versions.
spark.conf.set("fs.azure.impl", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
print("PySpark version:", pyspark.__version__)

# The Hadoop version is read from the JVM through the SparkContext gateway.
version_info = spark.sparkContext._jvm.org.apache.hadoop.util.VersionInfo
hadoop_version = version_info.getVersion()
print("Hadoop version:", hadoop_version)

### Create df as per SQL logic

In [None]:
# Catalog and schema used to qualify the table name further down.
# Can optionally be used for schema and catalog cross checks.
catalog_name, schema_name = 'data_foundation_dev', 'raw'

In [None]:
# Get sql files with transformation logic to create aggregate tables

# Build the folder path with os.path so it resolves on Windows and on POSIX
# (e.g. inside the Docker container). The trailing '' keeps a trailing
# separator, matching the shape of the original hard-coded value.
sql_folder_path = os.path.join(os.pardir, 'sql_files_normalised', '')

# os.listdir returns entries in arbitrary order, and the last file processed
# decides which `df` / `three_part_name` the cells below see, so sort the
# listing to make the run reproducible.
sql_files = sorted(name for name in os.listdir(sql_folder_path) if name.endswith('.sql'))
if not sql_files:
    # Fail here with a clear message instead of a NameError on `df` below.
    raise FileNotFoundError(f"No .sql files found in {sql_folder_path!r}")

# The catalog is identical for every statement, so select it once up front.
env = spark.sql(f"use catalog {catalog_name}")

for file in sql_files:
    with open(os.path.join(sql_folder_path, file), "r") as f:
        sql_stmt = f.read()

    # splitext removes only the extension. str.strip('.sql') would strip any of
    # the characters '.', 's', 'q', 'l' from BOTH ends ('sales.sql' -> 'ale').
    tbl_name = os.path.splitext(file)[0]
    df = spark.sql(sql_stmt)

    # Fully qualified <catalog>.<schema>.<table>; the parts must be dot-separated.
    three_part_name = f"{catalog_name}.{schema_name}.{tbl_name}"

# NOTE(review): `df` and `three_part_name` are overwritten on every iteration,
# so only the last SQL file (in sorted order) reaches the steps below and the
# write cell. Confirm whether every file is meant to be written.

# Modify logic to accommodate composite keys and more complex SCD logic
# The first column of the result is passed to add_key_hash as the key column.
df = add_key_hash(df, df.columns[0])
df = df.withColumn('_modified_date', current_timestamp())

#### Write to UC

In [None]:
# Set spark.sql.ansi.enabled to "false" for this session before the write step.
# NOTE(review): presumably so the write tolerates casts that ANSI mode would reject — confirm.
spark.conf.set(key="spark.sql.ansi.enabled", value="false")

In [None]:
# Write configs
# archive: requested write strategy, compared case-insensitively by the write
#          cell ('append' / 'force_append' / 'upsert'; anything else overwrites).
archive = 'upsert'
# Append and upsert are only selected when the table exists and is not dropped.
drop_table = False
exists_table = True

In [None]:

# Execute write operation | Modify for additional SCD functionality
# Append and upsert share one precondition: the table exists and is being kept.
# The requested mode is only read when that holds; otherwise it stays None and
# falls through to the overwrite branch.
target_is_kept = exists_table and not drop_table
write_mode = archive.lower() if target_is_kept else None

if write_mode in ('append', 'force_append'):
    print("Write mode: Append")
    df.write.insertInto(three_part_name, overwrite=False)
elif write_mode == 'upsert':
    print("Write mode: Upsert")
    delta_upsert(df, three_part_name)
else:
    # Table missing, table being dropped, or an unrecognised mode.
    print("Write mode: Overwrite")
    delta_overwrite(df, three_part_name)