In [0]:
# Load the raw (bronze) supermarkets CSV: the first row is the header, column
# types are inferred, and quoted fields are allowed to span multiple lines.
# NOTE(review): with inferSchema on, an all-digit "postal-code" column would
# presumably be typed as a number, which drops leading zeros before the later
# string cast can preserve them — confirm against the data, or supply an
# explicit schema if that matters.
supermarkets_df = spark.read.csv(
    "/mnt/raw-bronze/supermarkets.csv",
    header=True,
    inferSchema=True,
    multiLine=True,
)

# Expose the raw frame to SQL cells under the name "supermarkets".
supermarkets_df.createOrReplaceTempView("supermarkets")

In [0]:
from pyspark.sql.functions import col, lit, when
from pyspark.ml.feature import StringIndexer
from pyspark.sql.types import StringType



# --- 1. Rename and initial cleaning ---
# One chained expression, applied in order:
#   * rename the key column "supermarket_No" -> "supermarket" (the name the
#     sales table uses, per the original author's note);
#   * add "postal_code" as a string copy of "postal-code" so it is treated as
#     a categorical label rather than a number;
#   * replace missing postal codes with the sentinel 'UNKNOWN' so every row
#     has a label to index.
supermarkets_df = (
    supermarkets_df
    .withColumnRenamed("supermarket_No", "supermarket")
    .withColumn("postal_code", col("postal-code").cast("string"))
    .na.fill("UNKNOWN", subset=["postal_code"])
)

# --- 2. String indexing for postal code ---
# Postal code is handled as a nominal feature and mapped to a numeric label
# index (not one-hot encoded). handleInvalid='keep' makes the fitted model put
# labels it did not see during fit into an extra index instead of raising.
indexer_postal = StringIndexer(
    inputCol="postal_code",
    outputCol="postal_code_indexed",
    handleInvalid='keep',
)
indexer_model_postal = indexer_postal.fit(supermarkets_df)

# --- 3. Final supermarkets DataFrame for the silver layer ---
# Apply the fitted indexer, then discard the original hyphenated column; the
# cleaned "postal_code" and its "postal_code_indexed" companion remain.
supermarkets_df_silver = (
    indexer_model_postal
    .transform(supermarkets_df)
    .drop("postal-code")
)





In [0]:
# Persist the silver supermarkets DataFrame as Parquet under the mounted
# silver path; "overwrite" replaces whatever a previous run left there.
output_path = "/mnt/silver/supermarkets/"

supermarkets_df_silver.write.mode("overwrite").parquet(output_path)