In [1]:
import os
import sys

# Point both PySpark interpreter settings at the Python executable running this notebook.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_list
from collections import Counter

In [3]:
# Reuse the active Spark session if one exists; otherwise create one named for this notebook.
spark = (
    SparkSession.builder
    .appName("CollaborativeBookSong")
    .getOrCreate()
)

In [4]:
# Load the matched user/book/song Parquet dataset from HDFS.
df = spark.read.format("parquet").load("hdfs://localhost:9000/golden/matched_user_book_song.parquet")

In [5]:
def recommend_collaborative_for_book(df_spark, target_book, top_n_users=3, top_n_songs=3):
    """Recommend songs for a book from the users who read it and overlap most with each other.

    Steps:
      1. Find every user that has a row for ``target_book``.
      2. Score each of those users by the total number of books they share
         with the other users who read ``target_book``.
      3. Keep the ``top_n_users`` highest-scoring users.
      4. Count the songs those users have on rows for ``target_book`` and
         return the ``top_n_songs`` most frequent ones.

    Args:
        df_spark: Spark DataFrame with at least the columns ``user_id``,
            ``book_id`` and ``song_id``.
        target_book: Value compared against ``book_id``.
        top_n_users: Number of best-overlapping users to take songs from.
        top_n_songs: Maximum number of songs to return.

    Returns:
        A list of ``song_id`` values, most frequent first. Empty when no row
        matches ``target_book``. Ties (between users or between songs) keep
        the order in which rows were collected, which Spark does not
        guarantee to be stable between runs.
    """
    target_rows = df_spark.filter(col("book_id") == target_book)

    reader_rows = target_rows.select("user_id").distinct().collect()
    user_ids = [row["user_id"] for row in reader_rows]
    if not user_ids:
        # Nobody read this book: nothing to recommend, and no need to launch
        # the remaining Spark jobs.
        return []

    # Only the readers of the target book take part in the scoring, so
    # restrict the data to them *before* collecting. Collecting the book sets
    # of every user in the dataset would pull the whole table onto the driver.
    known_ids = [u for u in user_ids if u is not None]
    is_reader = col("user_id").isin(known_ids)
    if None in user_ids:
        # isin() never matches NULL, so keep a NULL reader explicitly;
        # otherwise the lookup below would raise KeyError for it.
        is_reader = is_reader | col("user_id").isNull()

    reader_books = (
        df_spark.filter(is_reader)
        .select("user_id", "book_id")
        .distinct()
        .groupBy("user_id")
        .agg(collect_list("book_id").alias("books"))
    )
    user_books_dict = {row["user_id"]: set(row["books"]) for row in reader_books.collect()}

    # For each book, how many readers have it. A user's overlap with all the
    # *other* readers is then sum(count[book] - 1) over the user's own books,
    # which equals the sum of pairwise intersection sizes without the
    # quadratic loop over user pairs.
    book_reader_counts = Counter(
        book for books in user_books_dict.values() for book in books
    )
    similarities = [
        (u, sum(book_reader_counts[book] - 1 for book in user_books_dict[u]))
        for u in user_ids
    ]

    ranked = sorted(similarities, key=lambda pair: pair[1], reverse=True)
    top_users = [u for u, _ in ranked[:top_n_users]]

    # Read the column straight from the collected rows; no RDD round trip
    # through a Python lambda is needed for a single column.
    song_rows = (
        target_rows.filter(col("user_id").isin(top_users))
        .select("song_id")
        .collect()
    )
    songs = [row["song_id"] for row in song_rows]

    top_songs = [s for s, _ in Counter(songs).most_common(top_n_songs)]
    return top_songs

In [6]:
# Try the recommender on a single book and print the songs it suggests.
book = "1934"
recommendations = recommend_collaborative_for_book(df, book)
print(f"Recommended songs for the book '{book}': {recommendations}")

Recommended songs for the book '1934': ['Dominican Mami', 'Happy X-mas', 'Speakeasy']
