In [0]:
# Load the ALS input from the `default.als_interactions` table.
# Later cells read the `user_index`, `product_index` and `interaction` columns from it.
final_als_df = spark.table("default.als_interactions")

In [0]:
# Randomly hold out roughly 20% of the interactions for evaluation.
# The fixed seed is there so the split can be repeated on the same input.
train_df, test_df = final_als_df.randomSplit(weights=[0.8, 0.2], seed=42)

In [0]:
# Report the size of each split; every count() triggers its own Spark job.
for _label, _split_df in (("Training rows:", train_df), ("Testing rows:", test_df)):
    print(_label, _split_df.count())

Training rows: 10645629
Testing rows: 2662324


In [0]:
from pyspark.ml.recommendation import ALS

In [0]:
als = ALS(
    # Column mapping into the interactions DataFrame.
    userCol="user_index",
    itemCol="product_index",
    ratingCol="interaction",
    # Implicit-feedback variant: `interaction` is treated as evidence of
    # preference (e.g. purchases), not as an explicit rating.
    implicitPrefs=True,
    alpha=1.0,                   # confidence scaling used by the implicit formulation
    # Model size and optimisation settings.
    rank=10,                     # number of latent factors
    maxIter=10,                  # training iterations
    regParam=0.1,                # regularization strength
    # Omit rows whose prediction would be NaN (users/items unseen in training).
    coldStartStrategy="drop",
)

In [0]:
# Fit the ALS estimator on the training split only; the test split is held back for scoring.
als_model = als.fit(train_df)

In [0]:
# Echo the fitted model so the notebook shows its uid and rank.
als_model

ALSModel: uid=ALS_3f584657913c, rank=10

In [0]:
# Score every (user, item) row of the held-out split; this adds a `prediction` column.
# The estimator was built with coldStartStrategy="drop", so rows whose prediction
# would be NaN are expected to be omitted from the result.
predictions = als_model.transform(test_df)

In [0]:
# NOTE(review): this only binds a Python variable named `coldStartStrategy`; it does not
# change the ALS estimator or the already-fitted model. The same setting is already passed
# to ALS(...) when the estimator is built, so this cell looks like a leftover — consider
# removing it.
coldStartStrategy="drop"

In [0]:
# Peek at the first few scored rows: observed interaction next to the model's prediction.
predictions.select(
    ["user_index", "product_index", "interaction", "prediction"]
).show(n=10)


+----------+-------------+-----------+----------+
|user_index|product_index|interaction|prediction|
+----------+-------------+-----------+----------+
|         0|           28|         18| 1.0326204|
|         0|           47|          2|0.42531893|
|         0|           53|          3| 1.0497887|
|         0|           95|          1|0.60361636|
|         0|          109|          7| 1.0525576|
|         0|          118|          3|0.42654157|
|         0|          130|         15| 0.7563789|
|         0|          142|          5|0.65933615|
|         0|          156|          9| 0.6843842|
|         0|          182|         13|0.90171415|
+----------+-------------+-----------+----------+
only showing top 10 rows


In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

In [0]:
# Regression-style evaluator: compares the observed `interaction` value with
# the model's `prediction` column using root-mean-squared error.
evaluator = RegressionEvaluator(
    labelCol="interaction",
    predictionCol="prediction",
    metricName="rmse",
)

In [0]:
# Compute RMSE over the scored test rows.
# NOTE(review): the model is trained with implicitPrefs=True and the displayed predictions
# sit around 0.4-1.2 while `interaction` goes up to 38, so this RMSE likely reflects the
# scale gap more than recommendation quality. Consider a ranking metric instead — TODO confirm.
rmse = evaluator.evaluate(predictions)
print(f"RMSE = {rmse}")

RMSE = 4.195985387874105


In [0]:
# Score a small subset of the test split for eyeballing.
# NOTE(review): limit(1000) keeps whichever 1000 rows Spark returns first; it is not a
# random sample, so the rows shown may cover only a handful of users.
sample_preds = als_model.transform(test_df.limit(1000))

In [0]:
# Show the ten highest-scoring rows of the subset (largest prediction first).
sample_preds.orderBy(sample_preds["prediction"].desc()).show(n=10)

+----------+-------------+-----------+----------+
|user_index|product_index|interaction|prediction|
+----------+-------------+-----------+----------+
|         9|           11|         38| 1.2260529|
|        27|           85|          1| 1.1950097|
|        16|           11|         15| 1.1224632|
|        22|            0|          9| 1.1038705|
|        19|          340|          6| 1.0908461|
|         0|          109|          7| 1.0525576|
|         0|           53|          3| 1.0497887|
|        32|           26|         10| 1.0489556|
|        10|           13|          4| 1.0399313|
|         0|           28|         18| 1.0326204|
+----------+-------------+-----------+----------+
only showing top 10 rows
