In [0]:
import sys
import os

In [0]:
%run ../../utils/utils.py


In [0]:
def ingest_table(file_key, table_name, raw_schema, primary_key=None, file_path=None):
    """Load a CSV file and overwrite a Delta table in the raw schema with it.

    Relies on a global ``spark`` session and, when ``primary_key`` is given,
    on a ``check_duplicates`` helper; neither is defined in this cell.

    Args:
        file_key: Label for the source file; used only in the progress message.
        table_name: Name of the target table inside ``raw_schema``.
        raw_schema: Schema that receives the table; created if it does not exist.
        primary_key: Optional key handed to ``check_duplicates`` together with
            the loaded DataFrame and ``table_name``. Skipped when falsy.
        file_path: Location of the CSV file. Required despite the default.

    Raises:
        ValueError: If ``file_path`` is ``None`` or empty.
    """
    # Validate first so an invalid call fails fast without creating a schema.
    if not file_path:
        raise ValueError("file_path must be provided")

    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {raw_schema}")

    print(f"Loading {file_key} from {file_path}")

    # First row is treated as the header; column types are inferred by Spark.
    df = spark.read.csv(file_path, header=True, inferSchema=True)

    if primary_key:
        check_duplicates(df, primary_key, table_name)

    target_table = f"{raw_schema}.{table_name}"
    print(f"Writing to {target_table}")

    # "overwrite" replaces any existing contents of the target table.
    df.write.format("delta").mode("overwrite").saveAsTable(target_table)
    print(f"{table_name} ingested successfully.\n")


In [0]:
# Locate config/config.yaml three directories above the working directory.
current_dir = os.getcwd()
config_path = os.path.normpath(
    os.path.join(current_dir, '..', '..', '..', 'config', 'config.yaml')
)

# load_config returns the parsed config plus the master data locations.
config, root_path, master_data_folder, master_data_files = load_config(config_path)
raw_schema = config['raw_layer']['schema']
print("Master data file keys:", master_data_files.keys())

# Ingest every master data file into the raw schema. A failure for one table
# is printed and the loop carries on with the next one.
for key, spec in master_data_files.items():
    try:
        print(f"Processing {key}...")
        # The value is not used below, but the lookup is kept so that an entry
        # without 'filename' is reported through the except branch.
        _ = spec['filename']
        ingest_table(
            key,  # file label used in the progress message
            key,  # the same key doubles as the target table name
            raw_schema=raw_schema,
            primary_key=spec.get('primary_key'),
            file_path=get_full_path(spec, root_path, master_data_folder),
        )
    except Exception as e:
        print(f"Error ingesting {key}: {e}")

