In [None]:
# General-purpose helper functions that are known to work; feel free to reuse them in your own code.

In [1]:
# PySpark sessions for CPU and GPU

from pyspark.sql import SparkSession

# for CPU: uncomment this block and comment out the GPU block below
# spark = (
#     SparkSession.builder
#     .master("local")
#     .appName("hdfs_test")
#     .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9009")
#     .getOrCreate()
# )

# for GPU
#
# NOTE(review): the spark.rapids.* settings presumably only take effect when the
# RAPIDS Accelerator plugin/jar is configured elsewhere -- confirm.
# NOTE(review): with master "local[4]" and spark.task.cpus=4, each task
# presumably reserves all four cores, so tasks run one at a time -- confirm
# this is intended.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("hdfs_test")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.cores", "4")
    .config("spark.task.cpus", "4")
    .config("spark.rapids.sql.enabled", "true")
    # "spark.sql.execution.arrow.enabled" is deprecated since Spark 3.0;
    # this is the replacement key and avoids the SQLConf deprecation warning.
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.rapids.memory.gpu.pool.size", "2G")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/03 21:14:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# CALCULATE EMBEDDINGS

# load the model only once

from sentence_transformers import SentenceTransformer
import torch


# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Instantiate the sentence-embedding model on the selected device.
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

from pyspark.sql.functions import pandas_udf
import pandas as pd

@pandas_udf("array<float>")
def get_embeddings_udf(title_series: pd.Series) -> pd.Series:
    """Encode one batch of strings into sentence embeddings.

    Args:
        title_series: The batch of text values passed in by Spark.

    Returns:
        A Series with one entry per input row, in the same order: a list of
        floats for each non-null input and None for each null input.
    """
    # Null rows cannot be encoded; remember where they are so the output
    # stays positionally aligned with the input.
    has_text = title_series.notna().tolist()
    texts = [text for text, keep in zip(title_series.tolist(), has_text) if keep]

    # Skip the model call entirely when the batch contains no text.
    encoded = model.encode(texts, batch_size=64).tolist() if texts else []
    vectors = iter(encoded)

    return pd.Series(
        [next(vectors) if keep else None for keep in has_text],
        dtype=object,
    )


# example of usage
# df = df.withColumn("embedding", get_embeddings_udf(df["title"]))

# with batching and CUDA this can be up to 1k times quicker

# make sure you install the packages listed in requirements.txt

2025-05-03 21:15:05.994958: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-03 21:15:05.995006: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-03 21:15:05.995861: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-03 21:15:06.000828: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
25/05/03 21:15:08 WARN SQLConf: The SQL config 'spark

In [3]:
# EXAMPLE OF HOW IT IS RUN

import os

# Build a file:// location for the "silver_layer" directory under the current
# working directory.
silver_layer_path = "file:///" + os.path.join(os.getcwd(), "silver_layer")

# Read the genres parquet data and print a preview.
genres_roi_loaded = spark.read.parquet(
    f"{silver_layer_path}/genres/genres"
)
genres_roi_loaded.show()

# Add an "embedding" column computed from the "genre" column, then preview it.
genres_with_embeddings = genres_roi_loaded.withColumn(
    "embedding",
    get_embeddings_udf(genres_roi_loaded["genre"]),
)
genres_with_embeddings.show()

+---------------+------------------+-----------+--------+
|          genre|       average_roi|movie_count|genre_id|
+---------------+------------------+-----------+--------+
|         Horror| 94855.14997248157|       1065|       0|
|       TV Movie| 49215.98103159122|         21|       1|
|        Mystery|1302.9860563666039|        774|       2|
|         Comedy|1183.8108191182107|       3397|       3|
|          Drama| 629.6503820668494|       4476|       4|
|        Romance| 620.3787291653363|       1750|       5|
|         Family|154.58040832077367|        918|       6|
|    Documentary| 49.30017380135712|        192|       7|
|      Adventure|15.809311218959344|       1531|       8|
|         Action| 10.67295903929723|       2375|       9|
|          Crime| 9.468052719953409|       1389|      10|
|        Fantasy|   9.0084334325443|        858|      11|
|          Music| 8.623972214554092|        362|      12|
|       Thriller| 5.422435835286275|       2226|      13|
|Science Ficti

2025-05-03 21:15:20.549986: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-03 21:15:20.550033: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-03 21:15:20.550868: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-03 21:15:20.555730: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


+---------------+------------------+-----------+--------+--------------------+
|          genre|       average_roi|movie_count|genre_id|           embedding|
+---------------+------------------+-----------+--------+--------------------+
|         Horror| 94855.14997248157|       1065|       0|[-0.022796463, 0....|
|       TV Movie| 49215.98103159122|         21|       1|[-0.1153389, -9.2...|
|        Mystery|1302.9860563666039|        774|       2|[-0.06938475, 0.0...|
|         Comedy|1183.8108191182107|       3397|       3|[-0.04261127, -0....|
|          Drama| 629.6503820668494|       4476|       4|[0.011846957, -0....|
|        Romance| 620.3787291653363|       1750|       5|[-0.029753814, 3....|
|         Family|154.58040832077367|        918|       6|[-0.09010321, 0.0...|
|    Documentary| 49.30017380135712|        192|       7|[-0.08694082, 0.0...|
|      Adventure|15.809311218959344|       1531|       8|[0.0013455322, 0....|
|         Action| 10.67295903929723|       2375|    

                                                                                