In [0]:
# Load the raw promotion CSV from the bronze mount. The first row is used as
# the header, column types are inferred from the data, and quoted fields are
# allowed to span multiple lines.
promotion_df = spark.read.csv(
    "/mnt/raw-bronze/promotion.csv",
    header=True,
    inferSchema=True,
    multiLine=True,
)

# Register the frame as the temp view "promotion" so it can be queried by name.
promotion_df.createOrReplaceTempView("promotion")

In [0]:
from pyspark.sql.functions import col, lit
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

# --- Clean and prepare categorical columns ---

# Replace nulls in the two categorical columns with the literal 'NONE' so
# they are indexed as an ordinary category.
promotion_df = promotion_df.na.fill("NONE", subset=["feature", "display"])

# --- String indexing for categorical features ---

# handleInvalid="keep": labels not seen while fitting are assigned an extra
# index at transform time instead of raising an error.

# 'feature' (e.g., 'Interior Page Feature', 'Not on Feature') -> 'feature_indexed'
indexer_feature = StringIndexer(
    inputCol="feature", outputCol="feature_indexed", handleInvalid="keep"
)

# 'display' (e.g., 'Mid-Aisle End Cap', 'Not on Display') -> 'display_indexed'
indexer_display = StringIndexer(
    inputCol="display", outputCol="display_indexed", handleInvalid="keep"
)

# Wrap the indexed display value into the vector column 'type_vec'.
# NOTE(review): 'feature_indexed' is produced by the pipeline but is not an
# assembler input, so 'type_vec' only carries 'display_indexed' -- confirm
# this is intended.
assembler = VectorAssembler(inputCols=["display_indexed"], outputCol="type_vec")

# --- Execute pipeline ---

# Stages run in order: both indexers first, then the assembler that consumes
# the 'display_indexed' column they produce.
promo_stages = [indexer_feature, indexer_display, assembler]
pipeline_promo = Pipeline(stages=promo_stages)

# Fit on the cleaned frame, then apply the fitted model to that same frame.
pipeline_model_promo = pipeline_promo.fit(promotion_df)
promotion_df_silver = pipeline_model_promo.transform(promotion_df)

promotion_df_silver.printSchema()

In [0]:
# Persist the silver DataFrame as Parquet under the mounted 'silver' path,
# overwriting any data that already exists at that location.
output_path = "/mnt/silver/promotion/"

promotion_df_silver.write.mode("overwrite").parquet(output_path)