In [0]:
from pyspark.sql.functions import col

In [0]:
# Purchase rows from the gold events table, reduced to the two columns used
# below; any row with a null in either column is discarded.
df = (
    spark.table("ecommerce.gold.gold_events")
    .filter(col("event_type") == "purchase")
    .select("price", "user_id")
    .dropna()
)

In [0]:
# Take a seeded ~10% sample on the Spark side, then collect it to the driver
# as a pandas DataFrame for scikit-learn.
sampled_purchases = df.sample(fraction=0.1, seed=42)
pdf = sampled_purchases.toPandas()

In [0]:
from sklearn.model_selection import train_test_split

# Target series and single-column feature frame (double brackets keep X 2-D).
# NOTE(review): user_id looks like an identifier rather than a quantity, so a
# linear fit on it may not be meaningful — confirm this is intended.
y = pdf["price"]
X = pdf[["user_id"]]

# Seeded 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [0]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares regressor with default settings, fitted on the
# training split. fit() trains in place; `model` is reused by later cells.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [0]:
import mlflow
import mlflow.sklearn

In [0]:
# Make this the active MLflow experiment for subsequent runs (the recorded
# output below shows it being created because it did not exist yet).
mlflow.set_experiment(experiment_name="/ecommerce-price-prediction")

2026/01/20 08:48:27 INFO mlflow.tracking.fluent: Experiment with name '/ecommerce-price-prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/4122105303777729', creation_time=1768898907633, experiment_id='4122105303777729', last_update_time=1768898907633, lifecycle_stage='active', name='/ecommerce-price-prediction', tags={'mlflow.experiment.sourceName': '/ecommerce-price-prediction',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'parthpatoliya001@gmail.com',
 'mlflow.ownerId': '77838657344022'}>

In [0]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Score the fitted model on the hold-out split and record parameters, the
# metric and the model itself inside a single MLflow run.
with mlflow.start_run():
    predictions = model.predict(X_test)

    # RMSE is the square root of the mean squared error on the test split.
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)

    # Run parameters: estimator type and the input column used.
    mlflow.log_param(key="model_type", value="LinearRegression")
    mlflow.log_param(key="features", value="user_id")

    # Evaluation metric.
    mlflow.log_metric(key="rmse", value=rmse)

    # Store the fitted estimator with the run as "price_model".
    mlflow.sklearn.log_model(model, "price_model")



In [0]:
from pyspark.sql.functions import hour, dayofweek, col
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [0]:
# Purchase rows from the silver events table with the timestamp, event type
# and price columns; rows containing a null in any of them are removed.
events = (
    spark.table("ecommerce.silver.silver_events")
    .filter(col("event_type") == "purchase")
    .select("event_time", "event_type", "price")
    .dropna()
)


In [0]:
# Derive two calendar features from the event timestamp: hour of day and
# day of week. withColumn replaces a same-named column, so re-running is safe.
events = (
    events
    .withColumn("hour", hour(col("event_time")))
    .withColumn("day_of_week", dayofweek(col("event_time")))
)


In [0]:
# Encode the string column `event_type` as a numeric index column.
# NOTE(review): `events` was filtered to event_type == "purchase" when it was
# loaded, so this index takes a single value on every row.
indexer = StringIndexer(
    inputCol="event_type",
    outputCol="event_type_index"
)

# This cell rebinds `events` to its own output, so on a re-run `events`
# already carries `event_type_index` and StringIndexer rejects an output
# column that already exists. Dropping it first makes the cell re-runnable;
# drop() is a no-op when the column is absent (first run).
events_unindexed = events.drop("event_type_index")
events = indexer.fit(events_unindexed).transform(events_unindexed)


In [0]:
# Pack the model inputs into a single vector column named "features".
# `event_type_index` is intentionally not included: `events` only contains
# rows with event_type == "purchase", so that index is identical on every row
# (zero variance) and cannot contribute to the regression.
assembler = VectorAssembler(
    inputCols=["hour", "day_of_week"],
    outputCol="features"
)

# Keep only what the regressor consumes: the feature vector and the label.
final_df = assembler.transform(events).select("features", "price")


In [0]:
# Seeded random split of the assembled data using 0.8 / 0.2 weights.
train_df, test_df = final_df.randomSplit(weights=[0.8, 0.2], seed=42)

In [0]:
# Spark ML linear regression reading the assembled "features" vector and
# using "price" as the label.
lr = LinearRegression(featuresCol="features", labelCol="price")

# Fit on the training split. Note that `model` is rebound here: from this
# point on it refers to the Spark model, not the scikit-learn estimator
# fitted earlier in the notebook.
model = lr.fit(train_df)

In [0]:
# R² evaluator comparing the "price" label with the "prediction" column.
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="r2")

# Score the held-out split; transform() appends the "prediction" column.
predictions = model.transform(test_df)

# NOTE(review): the recorded output shows R² of 0.000 with one identical
# prediction on every row — the chosen features appear to explain none of
# the variance in price; confirm before relying on this model.
r2 = evaluator.evaluate(predictions)
print(f"R² Score: {r2:.3f}")

R² Score: 0.000


In [0]:
# Show up to 20 rows of actual price next to the model's prediction.
display(predictions.select("price", "prediction").limit(20))

price,prediction
15.93,285.18964696650505
33.42,285.18964696650505
39.9,285.18964696650505
50.19,285.18964696650505
56.63,285.18964696650505
89.29,285.18964696650505
126.18,285.18964696650505
164.48,285.18964696650505
360.34,285.18964696650505
373.21,285.18964696650505
