In [0]:
# Import necessary libraries
from pyspark.sql import functions as F

# Load October raw data
october_df = spark.read.csv("/Volumes/workspace/advecom/advecom_data/2019-Oct.csv", header=True, inferSchema=True)

# Create interaction DataFrame, comprising of user-product interactions
interaction_df = october_df.withColumn(
    "rating",
    F.when(F.col("event_type") == "purchase", 3)
     .when(F.col("event_type") == "cart", 2)
     .otherwise(1)
).select("user_id", "product_id", "rating")\
.groupBy("user_id", "product_id") \
.agg(F.sum("rating").alias("rating")) \
.dropna(subset=["user_id", "product_id", "rating"])

interaction_df.show(5)

+---------+----------+------+
|  user_id|product_id|rating|
+---------+----------+------+
|514336739|   1004321|     2|
|555447748|  26201858|     1|
|514805015|   1306185|     3|
|555462074|  12707756|     1|
|547469497|  28718532|     1|
+---------+----------+------+
only showing top 5 rows


In [0]:
# Import necessary libraries
from pyspark.ml.recommendation import ALS

# Create ALS model
als = ALS(
    userCol="user_id",
    itemCol="product_id",
    ratingCol="rating",
    coldStartStrategy="drop",
    implicitPrefs=False  # because we created ratings explicitly
)

# Fit ALS model
als_model = als.fit(interaction_df)

In [0]:
# Create users and items DataFrames
users = interaction_df.select("user_id").distinct().limit(10) # limit to 10 users for demo purposes. As it takes a lot of time to calculate top 5 recommendations due to the size of the dataset (users x items).

items = interaction_df.select("product_id").distinct()

# Create cross DataFrame
cross_df = users.crossJoin(items)

# Create predictions DataFrame
predictions = als_model.transform(cross_df)

display(cross_df)

user_id,product_id
538191958,1005159
530687399,5701087
518524779,17300014
549177765,11100315
515117112,8500290
513281828,21400652
519283918,9800341
513442181,45300001
555854167,27300009
515154404,5100572


In [0]:
# Import necessary libraries
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, desc

# Define the window specification
window_spec = Window.partitionBy("user_id").orderBy(desc("prediction"))

# Calculate the top 5 predictions for each user
top5 = predictions.withColumn(
    "rank",
    row_number().over(window_spec)
).filter("rank <= 5")

top5.show()

+---------+----------+----------+----+
|  user_id|product_id|prediction|rank|
+---------+----------+----------+----+
|513281828|   4500664| 13.151505|   1|
|513281828|  26403130| 11.950272|   2|
|513281828|  12700977| 11.044745|   3|
|513281828|  26300655| 10.295308|   4|
|513281828|  26009172| 10.171152|   5|
|513442181|   4500664| 19.531466|   1|
|513442181|  13100576|    9.4492|   2|
|513442181|  26403130|  8.597161|   3|
|513442181|  12709594|  7.556492|   4|
|513442181|  26009172|  7.324424|   5|
|515117112|   4500664| 39.938553|   1|
|515117112|  12706563| 20.078081|   2|
|515117112|  26019269| 17.751484|   3|
|515117112|  12709594| 15.612735|   4|
|515117112|  26403130|  14.99227|   5|
|518524779|   4500664| 23.996937|   1|
|518524779|  12706563| 14.593942|   2|
|518524779|  26403130|12.2832155|   3|
|518524779|  26403736| 10.401463|   4|
|518524779|  26009172|  9.458663|   5|
+---------+----------+----------+----+
only showing top 20 rows
