In [1]:
# HERE ARE JUST GENERAL FUNCTIONS THAT WORK AND YOU CAN USE THEM IN YOUR CODE

In [2]:
# PySpark sessions for CPU and GPU

from pyspark.sql import SparkSession

# for CPU
"""
spark = SparkSession.builder \
	.master("local").appName("hdfs_test")\
    .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9009") \
    .getOrCreate()
"""

# for GPU

#"""
spark = SparkSession.builder \
    .master("local[4]") \
    .appName("hdfs_test") \
    .config("spark.executor.memory", "8g") \
    .config("spark.executor.cores", "4") \
    .config("spark.task.cpus", "4") \
    .config("spark.rapids.sql.enabled", "true") \
    .config("spark.sql.execution.arrow.enabled", "true") \
    .config("spark.rapids.memory.gpu.pool.size", "2G") \
    .getOrCreate()
#"""

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/03 22:08:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# CALCULATE EMBEDDINGS

# load the model only once

from sentence_transformers import SentenceTransformer
import torch


device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

from pyspark.sql.functions import pandas_udf
import pandas as pd

@pandas_udf("array<float>")
def get_embeddings_udf(title_series: pd.Series) -> pd.Series:
    return pd.Series(model.encode(title_series.tolist(), batch_size=64).tolist())


# example of usage
# df = df.withColumn("embedding", get_embeddings_udf(df["title"]))

# with batch and cuda it will be up to 1k times quicker

# make sure you install requirements.txt

2025-05-03 22:08:07.824890: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-03 22:08:07.824938: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-03 22:08:07.825848: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-03 22:08:07.831252: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
25/05/03 22:08:10 WARN SQLConf: The SQL config 'spark

In [4]:
# CALCULATE EMBEDDINGS FOR FEW COLUMNS WITH WEIGHTEM SUM COMBINATION (I.E. GENRE CARRIES MORE MEANING THAN TITLE FOR OUR CASE => IT GETS HIGHER PRIORITY)

@pandas_udf("array<float>")
def get_weighted_embeddings_udf(title: pd.Series, genre: pd.Series, emotion: pd.Series) -> pd.Series:
    title_embeddings = model.encode(title.fillna("").tolist(), batch_size=64)
    genre_embeddings = model.encode(genre.fillna("").tolist(), batch_size=64)
    emotion_embeddings = model.encode(emotion.fillna("").tolist(), batch_size=64)
    
    # Define weights
    w_title = 0.3
    w_genre = 0.2
    w_emotion = 0.5

    # Weighted sum
    combined = [
        (w_title * t + w_genre * g + w_emotion * e).tolist()
        for t, g, e in zip(title_embeddings, genre_embeddings, emotion_embeddings)
    ]
    return pd.Series(combined)


In [5]:
# EXAMPLE OF HOW IT IS RUN

import os
from pyspark.sql.functions import monotonically_increasing_id

silver_layer_path = "file:///" + os.path.join(os.getcwd(), "silver_layer")
genres_roi_loaded = spark.read.parquet(f"{silver_layer_path}/genres/genres")
genres_roi_loaded.show(3)

# EMBEDDING FOR 1 COL
genres_with_embedding_from_1_col = genres_roi_loaded.withColumn(
    "embedding", get_embeddings_udf(genres_roi_loaded["genre"])
) \
.withColumn(
    "faiss_id", monotonically_increasing_id() # to map FAISS BACK
)


genres_with_embedding_from_1_col.show(3, truncate=False)

# EMBEDDING FOR WEIGHTED FEW COLS
genres_with_embedding_from_multi_col = genres_roi_loaded \
    .withColumn(
        "weighted_multi_embedding", get_weighted_embeddings_udf(
            genres_roi_loaded["genre"], #make sure to change to your column namers (different, here it is same only for example)
            genres_roi_loaded["genre"],
            genres_roi_loaded["genre"]
        )
    ) \
    .withColumn(
        "faiss_id", monotonically_increasing_id() # to map FAISS BACK
    )
genres_with_embedding_from_multi_col.show(3, truncate=False)

+--------+------------------+-----------+--------+
|   genre|       average_roi|movie_count|genre_id|
+--------+------------------+-----------+--------+
|  Horror| 94855.14997248157|       1065|       0|
|TV Movie| 49215.98103159122|         21|       1|
| Mystery|1302.9860563666039|        774|       2|
+--------+------------------+-----------+--------+
only showing top 3 rows



2025-05-03 22:08:19.629903: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-03 22:08:19.629951: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-03 22:08:19.630876: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-03 22:08:19.635886: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
                                                     

+--------+------------------+-----------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

2025-05-03 22:08:25.745049: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-03 22:08:25.745098: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-03 22:08:25.745935: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-03 22:08:25.750941: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


+--------+------------------+-----------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [6]:
# CONVERT TO LIST OF VECTORS TO STORE IN FAISS

EMD_COL_NAME = "weighted_multi_embedding"

embedding_rows = genres_with_embedding_from_multi_col.select(EMD_COL_NAME).collect()
embedding_vectors = [row[EMD_COL_NAME] for row in embedding_rows]

2025-05-03 22:08:31.441734: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-03 22:08:31.441776: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-03 22:08:31.442626: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-03 22:08:31.447555: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
                                                     

In [7]:
# STORE IN FAISS (FOR PRODUCTION-READY QUICK SEARCH AND RETRIEVAL)

import faiss
import numpy as np

# Normalize vectors to unit length (to emulate cosine distance, unsupported by FAISS but needed for us)
def normalize(vectors):
    return vectors / np.linalg.norm(vectors, axis=1, keepdims=True)

# Convert to NumPy array (float32, required by FAISS)
embedding_matrix = np.array(embedding_vectors).astype("float32")

normalized_vectors = normalize(embedding_matrix)

# Use inner product for cosine similarity
index = faiss.IndexFlatIP(normalized_vectors.shape[1])
index.add(normalized_vectors)

print(f"FAISS index created with {index.ntotal} vectors.")

FAISS index created with 19 vectors.


In [8]:
# SAVE PERMANENTLY / LOAD FAISS

# save
faiss.write_index(index, "faiss_genre_index.index")
print("FAISS index saved to 'faiss_genre_index.index'")

# load
index = faiss.read_index("faiss_genre_index.index")

FAISS index saved to 'faiss_genre_index.index'


In [10]:
# SEARCH WITH FAISS

from pyspark.sql.functions import col

query_vector = embedding_matrix[0].reshape(1, -1) # replace with embedding for new book for example
normalized_query = normalize(query_vector)

k = 5  # number of nearest neighbors - top N songs to find
distances, indices = index.search(normalized_query.astype(np.float32), k)

print("Top 5 nearest neighbors:", indices)


# LOAD BACK ROWS
def select_entries_by_row_id(row_ids, genres_with_embedding_from_multi_col):
    """
    Select rows from the dataframe based on the provided list of row_ids.

    Parameters:
    - row_ids: A list of row_ids to filter by.
    - genres_with_embedding_from_multi_col: DataFrame to select from.

    Returns:
    - A DataFrame containing the rows that match the row_ids.
    """
    # Convert the list of row_ids to a set for fast lookup
    row_id_set = set(row_ids)

    # Filter the DataFrame to select rows where row_id is in the given set
    selected_entries = genres_with_embedding_from_multi_col.filter(col('faiss_id').isin(row_id_set))

    return selected_entries

[indices] = indices
selected_entries_df = select_entries_by_row_id(indices, genres_with_embedding_from_multi_col)

selected_entries_df.show()

Top 5 nearest neighbors: [[ 0 13  2  8  3]]


+---------+------------------+-----------+--------+------------------------+--------+
|    genre|       average_roi|movie_count|genre_id|weighted_multi_embedding|faiss_id|
+---------+------------------+-----------+--------+------------------------+--------+
|   Horror| 94855.14997248157|       1065|       0|    [-0.022796463, 0....|       0|
|  Mystery|1302.9860563666039|        774|       2|    [-0.06938475, 0.0...|       2|
|   Comedy|1183.8108191182107|       3397|       3|    [-0.04261127, -0....|       3|
|Adventure|15.809311218959344|       1531|       8|    [0.0013455322, 0....|       8|
| Thriller| 5.422435835286275|       2226|      13|    [-0.0518381, -0.0...|      13|
+---------+------------------+-----------+--------+------------------------+--------+

