In [1]:
import os
import sys

# Point both PySpark interpreter settings at the Python executable that is
# running this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [2]:
from collections import Counter

from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_list, lit

In [3]:
# Reuse the active Spark session if one exists, otherwise start a new one
# registered under this notebook's application name.
spark = (
    SparkSession.builder
    .appName("CollaborativeBookSong")
    .getOrCreate()
)

In [4]:
# Load the matched user/book/song table from HDFS; later cells read its
# `user_id`, `book_id` and `song_id` columns.
df = spark.read.parquet(
    "hdfs://localhost:9000/golden/matched_user_book_song.parquet"
)

In [5]:
# Make the cell safe to re-run: `df` is overwritten below, so on a second
# execution it would already carry the index columns and StringIndexer would
# refuse to write to an output column that exists. Dropping a column that is
# not present is a no-op, so the first run is unaffected.
df = df.drop("userIndex", "bookIndex")

# Map the string identifiers to numeric indices (ALS needs numeric user/item columns).
user_indexer = StringIndexer(inputCol="user_id", outputCol="userIndex")
book_indexer = StringIndexer(inputCol="book_id", outputCol="bookIndex")

df = user_indexer.fit(df).transform(df)
df = book_indexer.fit(df).transform(df)

In [6]:
# Every row is treated as one unit of implicit positive feedback, so the
# "rating" fed to ALS is a constant 1.0.
df = df.withColumn("interaction", lit(1.0))

als = ALS(
    userCol="userIndex",
    itemCol="bookIndex",
    ratingCol="interaction",
    coldStartStrategy="drop",
    nonnegative=True,
    implicitPrefs=True,
    seed=42,  # pin the seed so the fitted factors are reproducible across kernel restarts
)
# NOTE(review): `als_model` is not referenced by any cell visible here --
# confirm it is used further down, otherwise this fit is wasted work.
als_model = als.fit(df)

In [7]:
def recommend_for_user_and_book(df_spark, target_user, target_book, top_n_users=3, top_n_songs=3):
    """Recommend songs for a book from the users most similar to ``target_user``.

    Similarity between two users is the number of distinct books they have in
    common. The ``top_n_users`` users with the largest overlap are selected,
    and the songs that appear on their rows for ``target_book`` are ranked by
    how often they occur.

    Args:
        df_spark: Spark DataFrame with ``user_id``, ``book_id`` and
            ``song_id`` columns.
        target_user: ``user_id`` value to produce recommendations for.
        target_book: ``book_id`` value the recommended songs must be attached to.
        top_n_users: Maximum number of similar users to consult.
        top_n_songs: Maximum number of songs to return.

    Returns:
        list: Up to ``top_n_songs`` ``song_id`` values, most frequent first.
        Empty when the user is unknown (a message is printed), when no other
        user shares a book with the target user, when none of the similar
        users has a song for ``target_book``, or when either ``top_n_*``
        argument is not positive.
    """
    user_books = df_spark.select("user_id", "book_id").distinct()

    # Only the target user's own rows are brought to the driver; the full
    # user/book table stays distributed.
    target_rows = user_books.filter(col("user_id") == target_user).collect()
    if not target_rows:
        print(f"User {target_user} not found.")
        return []

    target_books = [row["book_id"] for row in target_rows if row["book_id"] is not None]
    if not target_books or top_n_users <= 0 or top_n_songs <= 0:
        return []

    # Count shared books per other user. Users with no book in common never
    # reach the aggregation, so they cannot be picked as "similar". The
    # secondary sort key makes ties deterministic.
    similar_users = (
        user_books
        .filter((col("user_id") != target_user) & col("book_id").isin(target_books))
        .groupBy("user_id")
        .count()
        .orderBy(col("count").desc(), col("user_id").asc())
        .limit(top_n_users)
    )
    top_users = [row["user_id"] for row in similar_users.collect()]
    if not top_users:
        return []

    # Rank the songs those users attached to the target book by frequency;
    # null song ids are skipped so that None is never recommended.
    ranked_songs = (
        df_spark
        .filter(col("user_id").isin(top_users) & (col("book_id") == target_book))
        .filter(col("song_id").isNotNull())
        .groupBy("song_id")
        .count()
        .orderBy(col("count").desc(), col("song_id").asc())
        .limit(top_n_songs)
    )
    return [row["song_id"] for row in ranked_songs.collect()]

In [8]:
# Example query: one user and one book taken from the dataset.
target_user = "d902ea288f544c3a74f0fc247144223d"
target_book = "1934"

# Use the function's default neighbourhood size and number of songs.
recommendations = recommend_for_user_and_book(
    df_spark=df,
    target_user=target_user,
    target_book=target_book,
)
print(f"Recommended songs for user '{target_user}' and book '{target_book}': {recommendations}")

Recommended songs for user 'd902ea288f544c3a74f0fc247144223d' and book '1934': ['Happy X-mas', 'Guilty', 'White Mountain']
