In [0]:
import yaml
import os
from pyspark.sql.functions import col, count
from pyspark.sql.types import (
    StructType, StructField,
    IntegerType, StringType, ArrayType, BooleanType, DoubleType, DateType, TimestampType
)

In [0]:
import yaml

def load_config(config_path):
    """
    Reads the YAML file at config_path and unpacks the entries used downstream.

    Returns a 4-tuple:
    - the full parsed config
    - config['paths']['root_path']
    - config['paths']['master_data_folder']
    - config['master_data_files']

    Keys are looked up by direct subscription; no defaults are applied.
    """
    with open(config_path, 'r') as config_file:
        parsed = yaml.safe_load(config_file)
    print(f"Loaded config from {config_path}")

    # Both path entries live under the same 'paths' section.
    path_settings = parsed['paths']
    return (
        parsed,
        path_settings['root_path'],
        path_settings['master_data_folder'],
        parsed['master_data_files'],
    )


In [0]:
def get_full_path(table_info, root_path, master_data_folder):
    """
    Builds the path to a table's file by joining root_path, master_data_folder
    and table_info['filename'] with os.path.join.
    """
    path_parts = (root_path, master_data_folder, table_info['filename'])
    return os.path.join(*path_parts)

In [0]:
def check_duplicates(df, pk_col, table_name):
    """
    Checks for duplicate primary keys in a DataFrame and drops them if found.

    - df: Spark DataFrame to check
    - pk_col: primary-key column name, or a list/tuple of names for a composite key
    - table_name: label used in the printed messages

    Returns the DataFrame with one row kept per primary key when duplicates
    were found, otherwise the input DataFrame unchanged. No error is raised.
    """
    # Normalise to a list so the same key definition feeds groupBy and dropDuplicates.
    key_cols = list(pk_col) if isinstance(pk_col, (list, tuple)) else [pk_col]

    duplicates = df.groupBy(*key_cols).agg(count("*").alias("cnt")).filter(col("cnt") > 1)

    # Only existence matters here, so fetching a single row is enough.
    if not duplicates.head(1):
        print(f"[OK] No duplicates found in {table_name} on {pk_col}")
        return df

    print(f"Found duplicate(s) in {table_name} on {pk_col}:")
    duplicates.show(truncate=False)
    deduped = df.dropDuplicates(key_cols)
    print("Dropped duplicates")  # we don't raise an error, for the sake of the exercise
    return deduped

In [0]:
def check_empty(df, table_name):
    """
    Raises ValueError if the DataFrame has no rows.
    """
    # Fetch at most one row; serverless compute in Databricks does not support .rdd
    first_rows = df.head(1)
    if not first_rows:
        raise ValueError(f"[ERROR] Dataset '{table_name}' is empty!")


In [0]:
def parse_schema(schema_config):
    """
    Converts a list of column specs from config into a Spark StructType.

    Each column spec is a dict with:
    - 'name': column name
    - 'type': a key of the type map below, or "ArrayType"
    - 'nullable' (optional): defaults to True
    - 'elementType' (ArrayType only): either a list of column specs, parsed
      recursively into a struct (array of structs), or a type-name string such
      as "StringType" (array of primitives)
    - 'containsNull' (ArrayType only, optional): whether array elements may be
      null; defaults to the column's 'nullable' value

    Raises ValueError if a type name is not in the type map.
    """
    type_map = {
        "IntegerType": IntegerType(),
        "StringType": StringType(),
        "DoubleType": DoubleType(),
        "DateType": DateType(),
        "TimestampType": TimestampType(),
        "BooleanType": BooleanType(),
    }

    def lookup_simple_type(type_name):
        # Shared by plain columns and primitive array elements.
        resolved = type_map.get(type_name)
        if resolved is None:
            raise ValueError(f"Unknown type {type_name} in schema config")
        return resolved

    fields = []
    for col_spec in schema_config:
        col_name = col_spec['name']
        col_type_str = col_spec['type']
        nullable = col_spec.get('nullable', True)

        if col_type_str == "ArrayType":
            # NOTE(review): a missing 'elementType' still yields an array of
            # empty structs, as before - confirm whether that should be an error.
            element_spec = col_spec.get('elementType', [])
            if isinstance(element_spec, str):
                # Primitive element, e.g. elementType: "StringType"
                element_type = lookup_simple_type(element_spec)
            else:
                # List of column specs: recursively build the element struct
                element_type = parse_schema(element_spec)
            contains_null = col_spec.get('containsNull', nullable)
            col_type = ArrayType(element_type, containsNull=contains_null)
        else:
            col_type = lookup_simple_type(col_type_str)

        fields.append(StructField(col_name, col_type, nullable))

    return StructType(fields)