In [28]:
import faiss
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

# 1. Start Spark
spark = SparkSession.builder \
    .master("local[4]") \
    .appName("hdfs_test") \
    .config("spark.executor.memory", "8g") \
    .config("spark.executor.cores", "4") \
    .config("spark.task.cpus", "4") \
    .config("spark.rapids.sql.enabled", "true") \
    .config("spark.sql.execution.arrow.enabled", "true") \
    .config("spark.rapids.memory.gpu.pool.size", "6G") \
    .config("spark.driver.memory", "12g") \
    .config("spark.driver.maxResultSize", "16g") \
    .getOrCreate()

In [31]:
hdfs_path = "hdfs://localhost:9009/"
book_reviews = spark.read.parquet(f"{hdfs_path}golden/sampled_review_info_embedded")
songs = spark.read.parquet(f"{hdfs_path}golden/song_info_embedded")
songs.show(10)
book_reviews.show(10)

                                                                                

+--------------------+-------------------+-------+------------------+--------------------+-----+--------+------+------------+------------+-----------+--------+------------+----------------+--------------------+--------+
|              artist|               song|emotion|          variance|               Genre|Tempo|Loudness|Energy|Danceability|Positiveness|Speechiness|Liveness|Acousticness|Instrumentalness|     embedded_vector|faiss_id|
+--------------------+-------------------+-------+------------------+--------------------+-----+--------+------+------------+------------+-----------+--------+------------+----------------+--------------------+--------+
|  (Alla Pugacheva...|Позови меня P.I.M.P|  anger|0.8335142456515902|Unknown,Unknown,U...|  175|   -7.15|    91|          42|          32|          8|      38|           0|              84|[-0.023154184, 0....|       0|
|         (Ani Lorak)|         Shady Lady|    joy|0.8335142456515902|Unknown,Unknown,U...|  128|   -13.0|    70|        

[Stage 23:>                                                         (0 + 1) / 1]

+--------+--------------------+--------------------+------------+
| book_id|             user_id|text_embedded_vector|    faiss_id|
+--------+--------------------+--------------------+------------+
|23341863|81ec8d766fc3a5ebd...|[0.036919385, -0....|523986010112|
|17315867|81ec8d766fc3a5ebd...|[-0.07486253, 0.0...|523986010113|
|18165974|81ec8d766fc3a5ebd...|[-0.044150714, -0...|523986010114|
|17345242|81ec8d766fc3a5ebd...|[-0.019383084, 0....|523986010115|
|15838920|81ec8d766fc3a5ebd...|[0.06310062, -0.0...|523986010116|
| 7940988|81ec8d766fc3a5ebd...|[-0.036082026, 0....|523986010117|
| 1162543|81ec8d766fc3a5ebd...|[0.022156745, -0....|523986010118|
|10382150|81ec8d766fc3a5ebd...|[0.014546832, -0....|523986010119|
|  818056|81ec8d766fc3a5ebd...|[-0.067639105, -4...|523986010120|
|15790895|81ec8d766fc3a5ebd...|[-0.016151592, -0...|523986010121|
+--------+--------------------+--------------------+------------+
only showing top 10 rows



                                                                                

In [32]:
import faiss
import numpy as np
import os
h_path = os.path.expanduser("~/silver_layer/")
song_index = faiss.read_index(f"{h_path}faiss_song_index.index")

In [33]:
# faiss_id -> song_id
song_faiss_map = songs.select("faiss_id", "song").rdd.collectAsMap()

# take embeddings and info from sampled reviews
review_rows = book_reviews.select("user_id", "book_id", "text_embedded_vector").collect()

25/05/06 17:15:04 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.
                                                                                

In [37]:
results = []
batch_size = 10000

for i in range(0, len(review_rows), batch_size):
    batch = review_rows[i:i+batch_size]
    
    user_ids = [row["user_id"] for row in batch]
    book_ids = [row["book_id"] for row in batch]

    vectors = np.array([row["text_embedded_vector"] for row in batch], dtype=np.float32)
    
    vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)  # normalize tuta
    
    D, I = song_index.search(vectors, 1)
    
    for j in range(len(batch)):
        matched_faiss_id = I[j][0]
        matched_song_id = song_faiss_map.get(matched_faiss_id)
        if matched_song_id:
            results.append((user_ids[j], book_ids[j], matched_song_id))


In [39]:
# result DataFrame
from pyspark.sql.types import StructType, StructField, StringType

schema = StructType([
    StructField("user_id", StringType(), True),
    StructField("book_id", StringType(), True),
    StructField("song_id", StringType(), True)
])

matched_df = spark.createDataFrame(results, schema)
matched_df.count()
matched_df.show(truncate=False)



+--------------------------------+--------+---------------------------------------------+
|user_id                         |book_id |song_id                                      |
+--------------------------------+--------+---------------------------------------------+
|81ec8d766fc3a5ebdc900d7d89f93ab6|18165974|Set in Stone                                 |
|81ec8d766fc3a5ebdc900d7d89f93ab6|17345242|Tired                                        |
|81ec8d766fc3a5ebdc900d7d89f93ab6|15838920|323 Go Crazy                                 |
|81ec8d766fc3a5ebdc900d7d89f93ab6|7940988 |Just Friends                                 |
|81ec8d766fc3a5ebdc900d7d89f93ab6|818056  |Figure It Out                                |
|81ec8d766fc3a5ebdc900d7d89f93ab6|9998705 |Finally Home                                 |
|81ec8d766fc3a5ebdc900d7d89f93ab6|104378  |Speakeasy                                    |
|81ec8d766fc3a5ebdc900d7d89f93ab6|1480129 |Blue Trail Of Sorrow                         |
|6464c2c43

                                                                                

In [41]:
matched_df.write.mode("overwrite").parquet(f"{hdfs_path}golden/matched_user_book_song.parquet")

                                                                                