In [1]:
import os
import sys

# Point both PySpark interpreter settings (worker and driver) at the Python
# executable that is running this kernel.
os.environ["PYSPARK_PYTHON"] = os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_list
from pyspark.ml.feature import StringIndexer
from collections import Counter

In [3]:
# Reuse an existing Spark session if one is active; otherwise start one
# under the application name "CollaborativeBookSong".
spark = (
    SparkSession.builder
    .appName("CollaborativeBookSong")
    .getOrCreate()
)

In [4]:
# Load the matched user/book/song table from HDFS as a Spark DataFrame.
df = (
    spark.read
    .parquet("hdfs://localhost:9000/golden/matched_user_book_song.parquet")
)

In [5]:
def recommend_for_user_and_book(df_spark, target_user, target_book, top_n_users=5, top_n_songs=5):
    """Recommend songs for a (user, book) pair from users with overlapping books.

    Users who share the most distinct books with ``target_user`` are taken as
    neighbours. The songs those neighbours have recorded for ``target_book``
    are then ranked by how often they occur.

    Args:
        df_spark: Spark DataFrame with at least the columns ``user_id``,
            ``book_id`` and ``song_id``.
        target_user: ``user_id`` value to recommend for.
        target_book: ``book_id`` value the returned songs must be recorded for.
        top_n_users: Maximum number of neighbours to use (``None`` = no limit).
        top_n_songs: Maximum number of songs to return (``None`` = no limit).

    Returns:
        list: Up to ``top_n_songs`` ``song_id`` values, most frequent first.
        The list is empty when ``target_user`` has no rows (a message is
        printed in that case), when no other user shares a book with the
        target, or when the neighbours have no songs for ``target_book``.
    """
    user_books_df = df_spark.select("user_id", "book_id").distinct()

    # Only the target user's own books are brought to the driver; the
    # comparison against every other user stays in Spark.
    target_rows = user_books_df.filter(col("user_id") == target_user).collect()
    if not target_rows:
        print(f"User {target_user} not found.")
        return []

    target_books = [row["book_id"] for row in target_rows if row["book_id"] is not None]
    if not target_books:
        return []

    # Keeping only rows whose book is one of the target's books means a user
    # appears here only if they share at least one book, so users with zero
    # overlap can never be selected as "similar".
    ranked_neighbours = (
        user_books_df
        .filter((col("user_id") != target_user) & col("book_id").isin(target_books))
        .groupBy("user_id")
        .count()
        # Secondary sort on user_id makes tie-breaking reproducible.
        .orderBy(col("count").desc(), col("user_id").asc())
    )
    if top_n_users is not None:
        ranked_neighbours = ranked_neighbours.limit(max(top_n_users, 0))

    top_users = [row["user_id"] for row in ranked_neighbours.collect()]
    if not top_users:
        return []

    song_rows = (
        df_spark
        .filter(col("user_id").isin(top_users) & (col("book_id") == target_book))
        .select("song_id")
        .collect()
    )
    # A null song_id is not a usable recommendation, so drop it.
    songs = [row["song_id"] for row in song_rows if row["song_id"] is not None]

    # Counter.most_common orders equal counts by first insertion; sorting the
    # songs first makes ties independent of Spark's row order.
    ranked_songs = Counter(sorted(songs, key=str)).most_common(top_n_songs)
    return [song for song, _ in ranked_songs]

In [6]:
# Example query: one user / book pair run through the recommender.
target_user, target_book = "d8f55c9e774ddb880968a1ee57e3b86d", "37470"

recommendations = recommend_for_user_and_book(
    df_spark=df,
    target_user=target_user,
    target_book=target_book,
)
print(f"Recommended songs for user '{target_user}' and book '{target_book}': {recommendations}")

Recommended songs for user 'd8f55c9e774ddb880968a1ee57e3b86d' and book '37470': ['Speakeasy', 'Money On Fleek', 'Sweet Thing', 'Guilty', 'You']
