In [9]:
from delta.tables import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import hashlib
import datetime
import urllib.request
import json
from datetime import timedelta, date

StatementMeta(, d78fa155-55b8-4982-a134-db4d77e75159, 11, Finished, Available, Finished)

In [10]:
# Sub-folder appended to every bronze table path to select the landing folder
# processed by this run (value looks like a YYYY/MM/DD date partition).
ABFSS_FOLDER = "2025/02/14"

StatementMeta(, d78fa155-55b8-4982-a134-db4d77e75159, 12, Finished, Available, Finished)

In [None]:
# Global configuration: run timestamp and storage roots.

# Timestamp of this run, truncated to the minute; written to the updated_at
# column of every processed row.
# NOTE(review): datetime.today() is naive local time -- confirm it matches the
# time zone the Spark session uses when storing timestamps.
UPDATED = datetime.datetime.today().replace(second=0, microsecond=0)
# Root folder of the silver Delta tables (one sub-folder per table name).
SILVER_PATH = "Tables"
# Root folder of the raw bronze CSV files (one sub-folder per source table).
BRONZE_PATH = "Files/bronze"

In [11]:
def get_silver_table_path(table_name):
    """
    Returns the location of a silver Delta table: SILVER_PATH and the table
    name joined by a single "/".
    """
    silver_table_path = "{}/{}".format(SILVER_PATH, table_name)
    return silver_table_path

def create_blank_df(spark, schema):
    """
    Builds a zero-row DataFrame that carries the given schema.
    """
    # An empty RDD supplies no rows, so the schema alone defines the frame.
    empty_rdd = spark.sparkContext.emptyRDD()
    return spark.createDataFrame(empty_rdd, schema)
def upsert_to_delta_generic(spark, df_final, table_name, schema, join_key):
    """
    Upserts a DataFrame into a silver Delta table, creating the table on first use.

    Rows are matched on ``join_key``. Matched target rows have every non-key
    column overwritten from ``df_final``; unmatched source rows are inserted.

    Parameters:
      - spark: SparkSession.
      - df_final: DataFrame containing data to merge. Its column names drive
        both the update and the insert mappings.
      - table_name: Target table name (also its folder name under SILVER_PATH).
      - schema: Target table schema (StructType). Only used to create an empty
        table when no Delta table exists at the target path.
      - join_key: A column name, or a list/tuple of column names, used as the
        merge key.

    Raises:
      - ValueError: if join_key is empty (None, "", [] or ()).
    """
    if not join_key:
        raise ValueError("join_key must name at least one column")

    # Normalise to a list so single and composite keys share one code path.
    if isinstance(join_key, (list, tuple)):
        key_columns = list(join_key)
    else:
        key_columns = [join_key]

    table_path = get_silver_table_path(table_name)

    # Use an explicit existence check instead of catching errors from forPath:
    # an unrelated failure (permissions, transient storage error) must never
    # reach the drop-and-recreate branch, which would wipe an existing table.
    if not DeltaTable.isDeltaTable(spark, table_path):
        print(f"Delta table at path {table_path} not found. Creating new table...")
        # Remove any stale catalog entry before registering the empty table.
        spark.sql(f"DROP TABLE IF EXISTS {table_name}")
        create_blank_df(spark, schema).write.format("delta").mode("overwrite").saveAsTable(f"{table_name}")

    delta_table = DeltaTable.forPath(spark, table_path)

    # Equality on every key column, AND-ed together for composite keys.
    merge_condition = " AND ".join(f"tgt.{key} = src.{key}" for key in key_columns)

    # Key columns are left out of the update: on a match they are already equal.
    update_set = {
        name: f"src.{name}" for name in df_final.columns if name not in key_columns
    }
    insert_values = {name: f"src.{name}" for name in df_final.columns}

    (delta_table.alias("tgt")
        .merge(df_final.alias("src"), merge_condition)
        .whenMatchedUpdate(set=update_set)
        .whenNotMatchedInsert(values=insert_values)
        .execute())

StatementMeta(, d78fa155-55b8-4982-a134-db4d77e75159, 13, Finished, Available, Finished)

In [16]:
# Silver-layer schemas, one per table. Each is used both to read the bronze CSV
# and to create the empty Delta table on first run. Every column is nullable,
# and every table ends with the same three audit timestamps.

customers_schema = (
    StructType()
    .add("customer_id", StringType(), True)
    .add("customer_name", StringType(), True)
    .add("created_date", TimestampType(), True)
    .add("updated_date", TimestampType(), True)
    .add("updated_at", TimestampType(), True)
)

locations_schema = (
    StructType()
    .add("location_id", StringType(), True)
    .add("name", StringType(), True)
    .add("county", StringType(), True)
    .add("state_code", StringType(), True)
    .add("state", StringType(), True)
    .add("type", StringType(), True)
    .add("latitude", DoubleType(), True)
    .add("longitude", DoubleType(), True)
    .add("created_date", TimestampType(), True)
    .add("updated_date", TimestampType(), True)
    .add("updated_at", TimestampType(), True)
)

products_schema = (
    StructType()
    .add("product_id", StringType(), True)
    .add("product_name", StringType(), True)
    .add("cost", LongType(), True)
    .add("original_sale_price", LongType(), True)
    .add("discount", LongType(), True)
    .add("current_price", LongType(), True)
    .add("taxes", DoubleType(), True)
    .add("created_date", TimestampType(), True)
    .add("updated_date", TimestampType(), True)
    .add("updated_at", TimestampType(), True)
)

sales_schema = (
    StructType()
    .add("order_id", StringType(), True)
    .add("product_id", StringType(), True)
    .add("location_id", StringType(), True)
    .add("customer_id", StringType(), True)
    .add("order_date", DateType(), True)
    .add("quantity", LongType(), True)
    .add("price", LongType(), True)
    .add("created_date", TimestampType(), True)
    .add("updated_date", TimestampType(), True)
    .add("updated_at", TimestampType(), True)
)

StatementMeta(, d78fa155-55b8-4982-a134-db4d77e75159, 18, Finished, Available, Finished)

In [13]:
def _dedupe_latest(spark, source_path, schema, key_column):
    """
    Reads raw CSV data and keeps one row per key, preferring the latest update.

    Parameters:
      - spark: SparkSession.
      - source_path: Path of the CSV file(s) to read (header row expected).
      - schema: StructType applied when reading the CSV.
      - key_column: Column whose values identify a record.

    Returns:
      DataFrame with a single row per key_column value (the one with the
      greatest updated_date) and updated_at set to the run timestamp UPDATED.
      Rows sharing the same updated_date are not ordered any further, so which
      of them is kept is unspecified.
    """
    df_raw = spark.read.csv(source_path, schema=schema, header=True)

    # Rank rows within each key so the most recently updated one gets rank 1.
    window_spec = Window.partitionBy(key_column).orderBy(F.desc("updated_date"))

    df_dedup = (df_raw
                .withColumn("row_number", F.row_number().over(window_spec))
                .filter(F.col("row_number") == 1)
                .drop("row_number"))

    # Stamp every surviving row with the timestamp of this run.
    return df_dedup.withColumn("updated_at", F.lit(UPDATED))


def process_customers_table(spark, customer_path, customer_schema):
    """
    Loads the customers CSV and returns the latest row per customer_id,
    stamped with updated_at.
    """
    return _dedupe_latest(spark, customer_path, customer_schema, "customer_id")


def process_locations_table(spark, locations_path, locations_schema):
    """
    Loads the locations CSV and returns the latest row per location_id,
    stamped with updated_at.
    """
    return _dedupe_latest(spark, locations_path, locations_schema, "location_id")


def process_products_table(spark, products_path, products_schema):
    """
    Loads the products CSV and returns the latest row per product_id,
    stamped with updated_at.
    """
    return _dedupe_latest(spark, products_path, products_schema, "product_id")


def process_sales_table(spark, sales_path, sales_schema):
    """
    Loads the sales CSV and returns the latest row per order_id,
    stamped with updated_at.
    """
    return _dedupe_latest(spark, sales_path, sales_schema, "order_id")

StatementMeta(, d78fa155-55b8-4982-a134-db4d77e75159, 15, Finished, Available, Finished)

In [17]:
# Every table goes through the same three steps, in this order: build the
# bronze folder path for this run, deduplicate the raw CSV rows, then merge
# the result into the silver Delta table on its key column.

# --- Customers (merge key: customer_id) ---
customers_file_path = "{}/customers/{}".format(BRONZE_PATH, ABFSS_FOLDER)
df_customers_final = process_customers_table(
    spark, customers_file_path, customers_schema
)
upsert_to_delta_generic(
    spark, df_customers_final, "customers", customers_schema, "customer_id"
)

# --- Locations (merge key: location_id) ---
locations_file_path = "{}/locations/{}".format(BRONZE_PATH, ABFSS_FOLDER)
df_locations_final = process_locations_table(
    spark, locations_file_path, locations_schema
)
upsert_to_delta_generic(
    spark, df_locations_final, "locations", locations_schema, "location_id"
)

# --- Products (merge key: product_id) ---
products_file_path = "{}/products/{}".format(BRONZE_PATH, ABFSS_FOLDER)
df_products_final = process_products_table(
    spark, products_file_path, products_schema
)
upsert_to_delta_generic(
    spark, df_products_final, "products", products_schema, "product_id"
)

# --- Sales (merge key: order_id) ---
sales_file_path = "{}/sales/{}".format(BRONZE_PATH, ABFSS_FOLDER)
df_sales_final = process_sales_table(
    spark, sales_file_path, sales_schema
)
upsert_to_delta_generic(
    spark, df_sales_final, "sales", sales_schema, "order_id"
)


StatementMeta(, d78fa155-55b8-4982-a134-db4d77e75159, 19, Finished, Available, Finished)

Delta table at path Tables/sales not found. Creating new table...
