### Imports

In [None]:
import pandas as pd
import pyspark
import json
import os
import re
from pyspark.sql import SparkSession

In [None]:
# Load the metadata for the tables to check.
# NOTE: the location is hard-coded relative to the working directory and can be
# parameterised later.

# Build the path with os.path.join so the separator matches the host OS; the
# previous literal used Windows-only backslashes. On Windows the resulting
# string is identical to the old one.
metadata_path = os.path.join('..', 'output_obj', 'tbl_metadata.json')

# Read the JSON as UTF-8 explicitly instead of relying on the platform's
# default text encoding.
with open(metadata_path, "r", encoding="utf-8") as f:
    metadata = json.load(f)

print("Loaded JSON data:", metadata)

### Set up Spark session

In [None]:
# Build (or reuse) the Spark session, requesting the hadoop-azure and
# azure-storage packages through `spark.jars.packages`.
azure_packages = "org.apache.hadoop:hadoop-azure:3.3.0,com.microsoft.azure:azure-storage:8.6.6"
spark = (
    SparkSession.builder
    .appName("MyDockerSparkApp")
    .config("spark.jars.packages", azure_packages)
    .getOrCreate()
)

# Debug via docker
spark.conf.set("fs.azure.impl", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
print("PySpark version:", pyspark.__version__)

# The Hadoop version is read from the JVM side through the SparkContext's
# private `_jvm` handle.
version_info = spark.sparkContext._jvm.org.apache.hadoop.util.VersionInfo
hadoop_version = version_info.getVersion()
print("Hadoop version:", hadoop_version)

### Read from UC

In [None]:
# Catalog and schema information.
# These values can optionally be used for schema and catalog cross checks.
catalog_name = "data_foundation_dev"
schema_name = "raw"

In [None]:
# Iterate through the SQL folder and run every `.sql` file whose content starts
# with a CREATE [OR REPLACE] [TEMP|TEMPORARY] TABLE|VIEW <name> [AS] SELECT
# statement; files that do not match are reported and skipped.

# os.path.join keeps the separator portable; the trailing '' preserves the
# trailing separator, so on Windows the string is identical to the old literal.
sql_folder_path = os.path.join('..', 'sql_files_normalised', '')

ctas_pattern = r"(?i)create\s+(?:or replace\s+)?(?P<is_temp>temp|temporary)?\s*(?P<obj_type>table|view)\s+(?P<obj_name>[\w\.\[\]]+)\s+(?P<ctas>(?:as\s+)?select\s*)"
# The pattern does not depend on the file, so compile it once outside the loop.
ctas_regex = re.compile(ctas_pattern)

# Sorted so the execution order is deterministic (os.listdir order is arbitrary).
for file in sorted(os.listdir(sql_folder_path)):
    if not file.endswith('.sql'):
        continue

    # Read the whole statement, then release the file handle before running it.
    with open(os.path.join(sql_folder_path, file), "r", encoding="utf-8") as f:
        sql_stmt = f.read()

    # match() anchors at the start of the text: the CREATE keyword must be the
    # very first thing in the file for the statement to be accepted.
    if ctas_regex.match(sql_stmt):
        # Can optionally include schema and catalog cross checks
        # Execute the statement text itself. The previous code called the
        # non-existent `spark.sql_stmt` with the file name, which raised
        # AttributeError for every matching file.
        spark.sql(sql_stmt)
        print(f"{file} table created")
    else:
        print(f"{file} create table statement format is incorrect")

