In [0]:
# Load the Parquet dataset stored under /mnt/gold/gold/.
# Parquet files embed their own schema, so no reader options are needed here:
# "header", "inferSchema" and "multiLine" are CSV/JSON options that the
# Parquet reader does not use.
gold_df = spark.read.parquet("/mnt/gold/gold/")
#display(gold_df)
# Register the DataFrame as the temp view "gold" so it can also be queried with Spark SQL.
gold_df.createOrReplaceTempView("gold")

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col

# --- Define ALL Feature Columns ---
# Every column listed here is passed to the VectorAssembler, in this order.
feature_cols = [
    # Sales/Temporal Features
    "hour_sin", "hour_cos",
    "voucher",
    "province_1", "province_2",
    "cycle_day_1", "cycle_day_2", "cycle_day_3", "cycle_day_4",
    "cycle_day_5", "cycle_day_6", "cycle_day_7",

    # Item Features (Where the NaN is likely coming from due to failed extraction or join)
    "size_value",
    "size_unit_encoded",
    "brand_indexed",
    "type_indexed",

    # Supermarket/Promotion Features
    "postal_code_indexed",
    "promo_feature_indexed",
    "promo_display_indexed"
]

# Columns whose null/NaN values are replaced with 0. "size_unit_encoded" is left
# out because it is a vector column and cannot be filled with a scalar.
# NOTE(review): a null "size_unit_encoded" would therefore still reach the
# assembler, whose default handleInvalid="error" raises -- confirm that column
# is never null.
# NOTE(review): 0 is presumably also a valid index in the *_indexed columns, so
# missing values are merged into that category -- confirm this is intended.
fill_cols = [c for c in feature_cols if c != "size_unit_encoded"]

# Fill numerical columns with 0
gold_df_clean = gold_df.na.fill(0, subset=fill_cols)

# Assemble all feature columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
vector_df = assembler.transform(gold_df_clean)

# Keep only what the model needs: the target (renamed to "label") and the feature vector.
final_ml_df = vector_df.select(
    col("sales_amount").alias("label"),  # The target variable
    col("features")                      # The predictor vector
)

# Split the data (80% Train, 20% Test); the fixed seed makes the split repeatable.
seed_value = 42
train_df, test_df = final_ml_df.randomSplit([0.8, 0.2], seed=seed_value)

# Persist both splits. Without this, each count() below and every later use of
# train_df / test_df would re-read the Parquet data and redo the fill and
# vector assembly from scratch.
train_df = train_df.cache()
test_df = test_df.cache()

# The two counts are the first actions on the splits, so they also materialise the cache.
print(f" Data Vectorized and Split. Training Count: {train_df.count()}, Testing Count: {test_df.count()}")

In [0]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# --- 1. Define the Model ---
# Random forest regressor that reads the "features" vector and predicts "label".
rf = RandomForestRegressor(
    labelCol="label",
    featuresCol="features",
    numTrees=100, # Number of trees in the forest
    maxDepth=5,   # Maximum depth of the trees (for speed/overfitting control)
    seed=42       # Fixed seed so repeated runs build the same forest
)

print("\n--- Training Random Forest Regressor ---")
# --- 2. Train the Model ---
rf_model = rf.fit(train_df)

# --- 3. Make Predictions on the Test Set ---
# transform() appends a "prediction" column to the held-out rows.
predictions = rf_model.transform(test_df)

# --- 4. Evaluate the Model ---
evaluator_rmse = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="rmse" # Root Mean Squared Error
)

evaluator_r2 = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="r2" # R-squared
)

rmse = evaluator_rmse.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)

print("\n--- Model Evaluation Results (Test Set) ---")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R2): {r2:.4f}")

# Optional: Show a sample of predictions vs. actual sales amount.
# The values are shown as-is (no na.fill) so that a null/NaN label or
# prediction stays visible instead of being displayed as 0.
predictions.select("label", "prediction").limit(5).show()


In [0]:
#print(feature_cols)

In [0]:
# Persist the fitted model, replacing any model already stored at this path.
model_writer = rf_model.write().overwrite()
model_writer.save("/mnt/model/sales_rf_model_v1")