In [None]:
# HERE ARE JUST GENERAL FUNCTIONS THAT WORK AND YOU CAN USE THEM IN YOUR CODE

In [1]:
# PySpark sessions for CPU and GPU

from pyspark.sql import SparkSession

# for CPU
"""
spark = SparkSession.builder \
	.master("local").appName("hdfs_test")\
    .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9009") \
    .getOrCreate()
"""

# for GPU

#"""
spark = SparkSession.builder \
    .master("local[4]") \
    .appName("hdfs_test") \
    .config("spark.executor.memory", "8g") \
    .config("spark.executor.cores", "4") \
    .config("spark.task.cpus", "4") \
    .config("spark.rapids.sql.enabled", "true") \
    .config("spark.sql.execution.arrow.enabled", "true") \
    .config("spark.rapids.memory.gpu.pool.size", "2G") \
    .getOrCreate()
#"""

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/03 21:14:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# CALCULATE EMBEDDINGS

# load the model only once

from sentence_transformers import SentenceTransformer
import torch


device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

from pyspark.sql.functions import pandas_udf
import pandas as pd

@pandas_udf("array<float>")
def get_embeddings_udf(title_series: pd.Series) -> pd.Series:
    return pd.Series(model.encode(title_series.tolist(), batch_size=64).tolist())


# example of usage
# df = df.withColumn("embedding", get_embeddings_udf(df["title"]))

# with batch and cuda it will be up to 1k times quicker

# make sure you install requirements.txt

2025-05-03 21:15:05.994958: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-03 21:15:05.995006: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-03 21:15:05.995861: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-03 21:15:06.000828: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
25/05/03 21:15:08 WARN SQLConf: The SQL config 'spark

In [5]:
# CALCULATE EMBEDDINGS FOR FEW COLUMNS WITH WEIGHTEM SUM COMBINATION (I.E. GENRE CARRIES MORE MEANING THAN TITLE FOR OUR CASE => IT GETS HIGHER PRIORITY)

@pandas_udf("array<float>")
def get_weighted_embeddings_udf(title: pd.Series, genre: pd.Series, emotion: pd.Series) -> pd.Series:
    title_embeddings = model.encode(title.fillna("").tolist(), batch_size=64)
    genre_embeddings = model.encode(genre.fillna("").tolist(), batch_size=64)
    emotion_embeddings = model.encode(emotion.fillna("").tolist(), batch_size=64)
    
    # Define weights
    w_title = 0.3
    w_genre = 0.2
    w_emotion = 0.5

    # Weighted sum
    combined = [
        (w_title * t + w_genre * g + w_emotion * e).tolist()
        for t, g, e in zip(title_embeddings, genre_embeddings, emotion_embeddings)
    ]
    return pd.Series(combined)


In [9]:
# EXAMPLE OF HOW IT IS RUN

import os

silver_layer_path = "file:///" + os.path.join(os.getcwd(), "silver_layer")
genres_roi_loaded = spark.read.parquet(f"{silver_layer_path}/genres/genres")
genres_roi_loaded.show(3)

# EMBEDDING FOR 1 COL
genres_with_embedding_from_1_col = genres_roi_loaded.withColumn(
    "embedding", get_embeddings_udf(genres_roi_loaded["genre"])
)
genres_with_embedding_from_1_col.show(3, truncate=False)

# EMBEDDING FOR WEIGHTED FEW COLS
genres_with_embedding_from_multi_col = genres_roi_loaded.withColumn(
    "weighted_multi_embedding", get_weighted_embeddings_udf(
        genres_roi_loaded["genre"], #make sure to change to your column namers (different, here it is same only for example)
        genres_roi_loaded["genre"],
        genres_roi_loaded["genre"]
    )
)
genres_with_embedding_from_multi_col.show(3, truncate=False)

+--------+------------------+-----------+--------+
|   genre|       average_roi|movie_count|genre_id|
+--------+------------------+-----------+--------+
|  Horror| 94855.14997248157|       1065|       0|
|TV Movie| 49215.98103159122|         21|       1|
| Mystery|1302.9860563666039|        774|       2|
+--------+------------------+-----------+--------+
only showing top 3 rows



2025-05-03 21:25:05.117481: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-03 21:25:05.117523: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-03 21:25:05.118390: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-03 21:25:05.123435: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
                                                     

+--------+------------------+-----------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

2025-05-03 21:25:10.525134: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-03 21:25:10.525172: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-03 21:25:10.526047: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-03 21:25:10.531105: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


+--------+------------------+-----------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                