In [1]:
import os
import sys

# Make Spark's worker and driver processes use the same interpreter that is
# running this kernel.
os.environ.update(
    {
        "PYSPARK_PYTHON": sys.executable,
        "PYSPARK_DRIVER_PYTHON": sys.executable,
    }
)

In [2]:
from collections import Counter

from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_list, lit

In [3]:
# Obtain the Spark session for this notebook (an existing one is reused).
spark = (
    SparkSession.builder
    .appName("CollaborativeBookSong")
    .getOrCreate()
)

In [4]:
# Location of the matched user/book/song dataset on HDFS.
BOOK_SONG_PARQUET = "hdfs://localhost:9000/golden/matched_user_book_song.parquet"

df = spark.read.parquet(BOOK_SONG_PARQUET)

In [5]:
# Make the cell safe to re-run: StringIndexer refuses to write to an output
# column that already exists, and `df` is overwritten in place below. Dropping
# columns that are not present is a no-op, so the first run is unaffected.
df = df.drop("userIndex", "songIndex", "interaction")

# Map the string ids onto the numeric index columns that the ALS model is
# configured with (userIndex / songIndex).
user_indexer = StringIndexer(inputCol="user_id", outputCol="userIndex")
song_indexer = StringIndexer(inputCol="song_id", outputCol="songIndex")
df = user_indexer.fit(df).transform(df)
df = song_indexer.fit(df).transform(df)

# Every row is one observed (user, song) interaction, so the implicit
# feedback strength is a constant 1.0.
df = df.withColumn("interaction", lit(1.0))

In [6]:
# Implicit-feedback ALS over the indexed (user, song) interactions.
als = ALS(
    userCol="userIndex",
    itemCol="songIndex",
    ratingCol="interaction",
    coldStartStrategy="drop",
    nonnegative=True,
    implicitPrefs=True,
    # Fix the seed so the randomly initialised factors (and therefore the
    # recommendations printed further down) are reproducible across runs.
    seed=42,
)
als_model = als.fit(df)

In [7]:
def hybrid_recommendation(df_spark, als_model, user_id_str, target_book_id, top_n_users=5, top_n_songs=5):
    """Recommend songs for a user and a book by intersecting two signals.

    1. User-based: find the ``top_n_users`` other users sharing the most books
       with ``user_id_str`` and take the songs they have most often paired
       with ``target_book_id`` (up to ``2 * top_n_songs`` candidates).
    2. ALS: take the model's top ``3 * top_n_songs`` songs for the user.

    Only songs present in both candidate sets are returned, in ALS rank order.

    Args:
        df_spark: Spark DataFrame with the columns ``user_id``, ``book_id``,
            ``song_id``, ``userIndex`` and ``songIndex``.
        als_model: Fitted ALS model whose user column is ``userIndex`` and
            whose recommendations carry a ``songIndex`` field.
        user_id_str: Value of ``user_id`` to recommend for.
        target_book_id: Value of ``book_id`` the songs should accompany.
        top_n_users: Number of most-similar users to consult.
        top_n_songs: Maximum number of songs to return.

    Returns:
        A list of at most ``top_n_songs`` ``song_id`` values. The list is empty
        when the user is unknown or the ALS model has no recommendations for
        them (a message is printed in those cases).
    """
    # Build {user_id: set(book_id)} on the driver. NOTE: this collects every
    # distinct (user, book) pair, so it assumes they fit in driver memory.
    user_books = (
        df_spark.select("user_id", "book_id")
        .distinct()
        .groupBy("user_id")
        .agg(collect_list("book_id").alias("books"))
    )
    user_books_dict = {row["user_id"]: set(row["books"]) for row in user_books.collect()}

    if user_id_str not in user_books_dict:
        print(f"User {user_id_str} not found.")
        return []

    # Similarity = number of books in common with the target user.
    target_books = user_books_dict[user_id_str]
    similarities = [
        (other_user, len(target_books & books))
        for other_user, books in user_books_dict.items()
        if other_user != user_id_str
    ]
    top_users = [u for u, _ in sorted(similarities, key=lambda x: x[1], reverse=True)[:top_n_users]]

    # Songs the similar users attached to the target book, most frequent first.
    neighbour_rows = (
        df_spark.filter(col("user_id").isin(top_users) & (col("book_id") == target_book_id))
        .select("song_id")
        .collect()
    )
    song_counts = Counter(row["song_id"] for row in neighbour_rows)
    user_based_top_songs = {s for s, _ in song_counts.most_common(top_n_songs * 2)}

    # Lookup used to translate ALS item indices back to song ids.
    index_to_id = {
        row["songIndex"]: row["song_id"]
        for row in df_spark.select("song_id", "songIndex").distinct().collect()
    }

    # Derive the single-user frame from df_spark itself instead of building it
    # through a notebook-global SparkSession, so the function depends only on
    # its arguments.
    user_df = df_spark.filter(col("user_id") == user_id_str).select("userIndex").distinct()
    if not user_df.take(1):
        print(f"User {user_id_str} not found in ALS index.")
        return []

    # The model may return no row for this user (e.g. it was fitted on data
    # that did not contain them); guard instead of indexing into an empty list.
    als_rows = als_model.recommendForUserSubset(user_df, top_n_songs * 3).collect()
    if not als_rows:
        print(f"No ALS recommendations for user {user_id_str}.")
        return []

    rec_indexes = [r["songIndex"] for r in als_rows[0]["recommendations"]]
    als_song_ids = [index_to_id[idx] for idx in rec_indexes if idx in index_to_id]

    # Keep ALS order, restricted to songs the similar users also picked.
    return [song for song in als_song_ids if song in user_based_top_songs][:top_n_songs]


In [8]:
# Example query: which songs suit this user while reading this book?
target_user = "d902ea288f544c3a74f0fc247144223d"
target_book = "1934"

recommendations = hybrid_recommendation(
    df_spark=df,
    als_model=als_model,
    user_id_str=target_user,
    target_book_id=target_book,
)
print(f"Recommended songs for user '{target_user}' and book '{target_book}': {recommendations}")

Recommended songs for user 'd902ea288f544c3a74f0fc247144223d' and book '1934': ['Guilty', 'White Mountain']
