In [0]:
# Load the source events table as a Spark DataFrame.
events = spark.read.table("ecommerce.silver.silver_events")

In [0]:
from pyspark.sql.functions import hour, dayofweek

# Drop events without a price (the regression label), then derive the two
# time-based feature columns from event_time.
priced_events = events.filter(events["price"].isNotNull())
events_ml = (
    priced_events
    .withColumn("hour", hour("event_time"))
    .withColumn("day_of_week", dayofweek("event_time"))
)

In [0]:
# Seeded random split of the ML frame into two equally weighted parts.
SPLIT_WEIGHTS = [0.5, 0.5]
train_df, test_df = events_ml.randomSplit(SPLIT_WEIGHTS, seed=42)

In [0]:
# Report row counts for the source frame and for each split, one line per frame.
for label, frame in (("Rows:", events), ("Train:", train_df), ("Test:", test_df)):
    print(label, frame.count())

Rows: 109819992
Train: 54914763
Test: 54905229


In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

In [0]:
# Columns that are packed into the single "features" vector column.
FEATURE_COLUMNS = ["event_type_index", "hour", "day_of_week"]

# Map the string event_type column to a numeric index column.
indexer = StringIndexer(inputCol="event_type", outputCol="event_type_index")

# Combine the indexed event type and the time columns into one vector column.
assembler = VectorAssembler(inputCols=FEATURE_COLUMNS, outputCol="features")

In [0]:
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor

In [0]:
# All three estimators predict "price" from the assembled "features" vector.
# "features" is also LinearRegression's default featuresCol; it is spelled out
# here so the three definitions read the same way.
COMMON_COLS = dict(labelCol="price", featuresCol="features")

lr = LinearRegression(**COMMON_COLS)

# Single shallow tree.
dt = DecisionTreeRegressor(maxDepth=3, **COMMON_COLS)

# Small seeded forest.
rf = RandomForestRegressor(numTrees=10, maxDepth=5, seed=42, **COMMON_COLS)

In [0]:
# Linear regression treats every input dimension as a continuous quantity.
# Feeding it the raw StringIndexer output would impose an arbitrary numeric
# ordering (the index is a frequency rank) on the event types, so the linear
# pipeline one-hot encodes the index before assembling its feature vector.
encoder = OneHotEncoder(
    inputCols=["event_type_index"],
    outputCols=["event_type_vec"],
)
lr_assembler = VectorAssembler(
    inputCols=["event_type_vec", "hour", "day_of_week"],
    outputCol="features",
)
lr_pipeline = Pipeline(stages=[indexer, encoder, lr_assembler, lr])

# The tree-based pipelines keep consuming the indexed column directly through
# the shared assembler; they are unchanged.
dt_pipeline = Pipeline(stages=[indexer, assembler, dt])
rf_pipeline = Pipeline(stages=[indexer, assembler, rf])

In [0]:
import mlflow
from pyspark.ml.evaluation import RegressionEvaluator

In [0]:
# Scores the "prediction" column against the "price" label using R².
evaluator = RegressionEvaluator(metricName="r2", labelCol="price", predictionCol="prediction")

In [0]:
# Display name -> pipeline. Insertion order is the order in which the
# pipelines are iterated.
models = dict(
    [
        ("Linear Regression", lr_pipeline),
        ("Decision Tree", dt_pipeline),
        ("Random Forest", rf_pipeline),
    ]
)

In [0]:
# Train, evaluate and log every pipeline in `models`, one MLflow run per entry.
for name, pipeline in models.items():
    # The run is named after the dictionary key so runs can be told apart.
    with mlflow.start_run(run_name=name):
        # Fit on the training split, then score the held-out split.
        model = pipeline.fit(train_df)
        preds = model.transform(test_df)
        # The metric computed here is whatever `evaluator` was configured with;
        # it is logged under the name "r2_score".
        r2 = evaluator.evaluate(preds)

        mlflow.log_metric("r2_score", r2)
        # Log the fitted pipeline model under the artifact path "model".
        # NOTE(review): dfs_tmpdir is presumably a staging location that must be
        # writable from the cluster — confirm against the MLflow docs.
        mlflow.spark.log_model(model, "model", dfs_tmpdir="/Volumes/workspace/default/ml_flow_models")

        print(f"{name} R²: {r2:.3f}")



Linear Regression R²: 0.000




Decision Tree R²: 0.001




Random Forest R²: 0.001
