In [0]:
# Load the raw sales CSV from the bronze mount.
#   header      - take column names from the first line of the file
#   inferSchema - let Spark infer column types instead of reading every column as string
#   multiLine   - allow a quoted field value to span several physical lines
sales_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("multiLine", True)
    .csv("/mnt/raw-bronze/sales.csv")
)
# display(sales_df)  # uncomment for an interactive preview in Databricks

# Expose the freshly loaded frame to Spark SQL under the temp view name "sales".
sales_df.createOrReplaceTempView("sales")

In [0]:
from pyspark.sql.functions import when, col, lit

# One-hot encode 'province': one 0/1 indicator column per known code.
# A row whose province matches neither code (or is null) gets 0 in both columns.
for province_code in (1, 2):
    is_this_province = col("province") == province_code
    sales_df = sales_df.withColumn(
        f"province_{province_code}",
        when(is_this_province, lit(1)).otherwise(lit(0)),
    )

# Optional preview of the encoded columns (Databricks):
# display(sales_df.select("province", "province_1", "province_2").limit(5))

In [0]:
from pyspark.sql.functions import col, sin, cos, pi, floor, lpad, concat, substring, lit, when
from pyspark.sql.types import IntegerType


# --- 'time' column: hour extraction and cyclical encoding ---

HOURS_IN_DAY = 24

# Angle of the hour on a 24-hour circle; feeding it to sin and cos gives a
# representation in which hour 23 sits next to hour 0.
hour_angle = 2 * pi() * col("hour") / lit(HOURS_IN_DAY)

sales_df = (
    sales_df
    # Integer 'time' (e.g. 1100) -> zero-padded 4-char string (900 -> '0900').
    .withColumn("time_str", lpad(col("time").cast("string"), 4, "0"))
    # The first two characters of the padded string are the hour (e.g. 11).
    .withColumn("hour", substring(col("time_str"), 1, 2).cast(IntegerType()))
    .withColumn("hour_sin", sin(hour_angle))
    .withColumn("hour_cos", cos(hour_angle))
)

# --- 'day' column: weekly cycle index and its one-hot encoding ---

# Fold the running day number onto a position within a 7-day cycle
# (1 = start of week, 7 = end of week, for day >= 1):
#   index = ((day - 1) % 7) + 1
week_position = (col("day") - lit(1)) % lit(7) + lit(1)
sales_df = sales_df.withColumn("day_cycle_index", week_position)

# One 0/1 indicator per weekly position (cycle_day_1 .. cycle_day_7),
# built up front and attached in a single withColumns call.
cycle_day_flags = {
    f"cycle_day_{position}": when(col("day_cycle_index") == position, lit(1)).otherwise(lit(0))
    for position in range(1, 8)
}
sales_df = sales_df.withColumns(cycle_day_flags)

# --- Preview the engineered features ---
print("--- Sales DataFrame with Optimized Temporal Features (PySpark) ---")

# Source columns first, then the derived ones. Only the first and last
# cycle_day_* indicators are listed to keep the printed table narrow.
preview_columns = [
    "code", "amount", "units", "customerId",
    "province", "province_1", "province_2",
    "supermarket", "basket",
    "day", "time", "hour", "hour_sin", "hour_cos",
    "day_cycle_index", "cycle_day_1", "cycle_day_7",
]
sales_df.select(*preview_columns).limit(10).show()

# NOTE: the selection above only shapes the preview; sales_df itself keeps
# every engineered column.

In [0]:
# Persist the feature-enriched frame as Parquet under the mounted 'silver'
# path, replacing whatever a previous run left at that location.
output_path = "/mnt/silver/sales/"

sales_df.write.mode("overwrite").parquet(output_path)