In [0]:
# Silver-layer tables to load: temp-view name -> folder relative to base_path.
tables = {
    "promotion": "promotion/",
    "item": "items/",
    "sales": "sales/",
    "supermarkets": "supermarkets/",
}

base_path = "/mnt/silver/"

# Read every table once, keep the DataFrame in `dataframes`, and register it
# as a temp view under the same name so it can be queried with spark.sql().
dataframes = {}
for table_name, folder in tables.items():
    # Parquet files carry their own schema, so the CSV/JSON reader options
    # (header / inferSchema / multiLine) have no effect here and are omitted.
    df = spark.read.parquet(f"{base_path}{folder}")

    dataframes[table_name] = df
    df.createOrReplaceTempView(table_name)

    print(f"Created view: {table_name}")


# Convenience aliases for working with the DataFrame API directly.
promotion_df = dataframes["promotion"]
item_df = dataframes["item"]
sales_df = dataframes["sales"]
supermarkets_df = dataframes["supermarkets"]

In [0]:
# Build the gold-layer feature table by joining the `sales` view with the
# `item`, `supermarkets` and `promotion` views.
#
# Only sales rows that have a matching promotion row with a non-null
# feature_indexed are kept. The original query expressed this as a LEFT JOIN
# followed by `WHERE P.feature_indexed is not null`, which discards every
# unmatched row anyway; an explicit INNER JOIN returns the same rows and makes
# the intent visible.
#
# NOTE(review): `I.descrption` looks like a misspelling of "description" but may
# be the actual silver column name - confirm against the item schema.
# NOTE(review): the promotion join compares S.supermarket with P.supermarkets
# (plural) - confirm that is the real column name in the promotion table.
# NOTE(review): if `item`, `supermarkets` or `promotion` contain duplicate join
# keys, sales rows will be multiplied - presumably keys are unique; verify.
gold_df = spark.sql("""
SELECT
    -- SALES / TEMPORAL FEATURES
      S.code
    , S.supermarket
    , S.week
    , S.province
    , S.amount AS sales_amount
    , S.units AS sales_units
    , S.hour_sin
    , S.hour_cos
    , S.voucher
    , S.province_1
    , S.province_2
    , S.cycle_day_1
    , S.cycle_day_2
    , S.cycle_day_3
    , S.cycle_day_4
    , S.cycle_day_5
    , S.cycle_day_6
    , S.cycle_day_7

    -- ITEM FEATURES
    , I.descrption
    , I.size_value
    , I.size_unit_encoded
    , I.brand_indexed
    , I.type_indexed

    -- SUPERMARKET FEATURES
    , SM.postal_code_indexed

    -- PROMOTION FEATURES
    , P.feature_indexed AS promo_feature_indexed
    , P.display_indexed AS promo_display_indexed

FROM
    sales S

LEFT JOIN
    item I ON S.code = I.code

LEFT JOIN
    supermarkets SM ON S.supermarket = SM.supermarket

INNER JOIN
    promotion P ON
        S.code = P.code AND
        S.supermarket = P.supermarkets

WHERE P.feature_indexed IS NOT NULL
""")

# The heading announces a schema, so print it before showing the rows.
print("--- Final Gold Layer DataFrame Schema (From Spark SQL) ---")
gold_df.printSchema()
display(gold_df)

In [0]:
# Destination of the gold-layer dataset.
output_path = "/mnt/gold/gold/"

# Write gold_df as Parquet; "overwrite" replaces whatever already exists at
# output_path.
gold_df.write.mode("overwrite").parquet(output_path)