In [52]:
from pyspark.sql import SparkSession  


spark = SparkSession.builder \
    .master("local[4]") \
    .appName("golden") \
    .config("spark.executor.memory", "12g") \
    .config("spark.driver.memory", "12g") \
    .config("spark.executor.cores", "8") \
    .config("spark.task.cpus", "4") \
    .config("spark.rapids.sql.enabled", "true") \
    .config("spark.sql.execution.arrow.enabled", "true") \
    .config("spark.rapids.memory.gpu.pool.size", "12G") \
    .getOrCreate()

In [53]:
from sentence_transformers import SentenceTransformer

import torch
from pyspark.sql.functions import pandas_udf
import pandas as pd
from setuptools._distutils import *


device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)



@pandas_udf("array<float>")
def get_embeddings_udf(title_series: pd.Series) -> pd.Series:
    return pd.Series(model.encode(title_series.tolist(), batch_size=2048).tolist())

cuda


In [54]:
@pandas_udf("array<float>")
def get_weighted_embeddings_udf(title: pd.Series, genre: pd.Series, emotion: pd.Series) -> pd.Series:
    title_embeddings = model.encode(title.fillna("").tolist(), batch_size=64)
    genre_embeddings = model.encode(genre.fillna("").tolist(), batch_size=64)
    emotion_embeddings = model.encode(emotion.fillna("").tolist(), batch_size=64)
    
    # Define weights
    w_title = 0.3
    w_genre = 0.2
    w_emotion = 0.5

    # Weighted sum
    combined = [
        (w_title * t + w_genre * g + w_emotion * e).tolist()
        for t, g, e in zip(title_embeddings, genre_embeddings, emotion_embeddings)
    ]
    return pd.Series(combined)

In [None]:
import os
from pyspark.sql.functions import monotonically_increasing_id

data_path = "file:///home/hd/golden_layer"
songs_df = spark.read.parquet(f"{data_path}/song_info").cache()
songs_df.count()
songs_df.show(3)




25/05/05 21:54:02 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.
                                                                                

+------+--------------------+-------+-------------------+-----+-----+--------+------+------------+------------+-----------+--------+------------+----------------+
|artist|                song|emotion|           variance|Genre|Tempo|Loudness|Energy|Danceability|Positiveness|Speechiness|Liveness|Acousticness|Instrumentalness|
+------+--------------------+-------+-------------------+-----+-----+--------+------+------------+------------+-----------+--------+------------+----------------+
|  ABBA|She's My Kind Of ...|    joy| 0.4476190476190476|  pop|  128|    -6.0|    78|          56|          60|          3|      31|           7|               0|
|  ABBA|    Andante, Andante|   love|0.20222222222222216|  pop|  102|  -10.72|    36|          52|          38|          2|       7|          68|               0|
|  ABBA|      As Good As New|sadness| 0.3008807588075881|  pop|  139|    -5.7|    78|          85|          97|          3|       8|          20|               2|
+------+--------------

In [60]:
EMD_COL_NAME = "embedded_vector"
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

window = Window.orderBy("artist") 
songs_df_embedded = songs_df \
    .withColumn(
        EMD_COL_NAME, get_weighted_embeddings_udf(
            songs_df["song"], 
            songs_df["genre"],
            songs_df["emotion"]
        )
    ) \
    .withColumn(
        "faiss_id", row_number().over(window) - 1
    )
songs_df_embedded.show(10, truncate=False)

25/05/05 22:04:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/05 22:04:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/05 22:04:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.

+----------------------------+-------------------+-------+------------------+-----------------------+-----+--------+------+------------+------------+-----------+--------+------------+----------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [61]:
songs_df_embedded.orderBy("Acousticness").show(1000)

25/05/05 22:05:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/05 22:05:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/05 22:05:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/05 22:07:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/05 22:07:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

+--------------------+--------------------+--------+--------------------+--------------------+-----+--------+------+------------+------------+-----------+--------+------------+----------------+--------------------+--------+
|              artist|                song| emotion|            variance|               Genre|Tempo|Loudness|Energy|Danceability|Positiveness|Speechiness|Liveness|Acousticness|Instrumentalness|     embedded_vector|faiss_id|
+--------------------+--------------------+--------+--------------------+--------------------+-----+--------+------+------------+------------+-----------+--------+------------+----------------+--------------------+--------+
|                 AFI|         Miss Murder|    love|  0.8335142456515902|         alternative|  144|   -4.54|    87|          31|          69|          5|      10|           0|               0|[-0.07694392, 0.0...|    4235|
|              Accept|   Balls to the Wall|   anger|  0.8335142456515902|               metal|  118|   -

In [62]:


songs_df_embedded.write.mode("overwrite").parquet(f"{data_path}/song_info_embedded")


25/05/05 22:07:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/05 22:07:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/05 22:07:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/05 22:09:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/05 22:09:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

In [12]:
songs_df_test=spark.read.parquet(f"{data_path}/song_info_embedded")
songs_df_test=songs_df_test.orderBy(songs_df_test["song"])
songs_df_test.show(truncate=False)
original_count = songs_df_embedded.count()

# Прочитайте збережені дані назад
saved_count = songs_df_test.count()

print(f"Оригінальних рядків: {original_count}")
print(f"Збережених рядків: {saved_count}")
print(f"Дані {'повні' if original_count == saved_count else 'неповні'}!")





+-----------------------+--------------------------------+-------+------------------+-----------------------+-----+--------+------+------------+------------+-----------+--------+------------+----------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [None]:
import os
from pyspark.sql.functions import monotonically_increasing_id

data_path = "file:///home/hd/golden_layer"
review_df = spark.read.parquet(f"{data_path}/rewiew_info/rewiew_info").sample(withReplacement=False, fraction=0.07, seed=42).cache()
review_df.show(10)



25/05/05 21:16:10 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.
25/05/05 21:16:12 WARN MemoryStore: Not enough space to cache rdd_6_3 in memory! (computed 65.0 MiB so far)
25/05/05 21:16:13 WARN MemoryStore: Not enough space to cache rdd_6_4 in memory! (computed 64.9 MiB so far)
25/05/05 21:16:13 WARN MemoryStore: Not enough space to cache rdd_6_5 in memory! (computed 65.5 MiB so far)
25/05/05 21:16:14 WARN MemoryStore: Not enough space to cache rdd_6_6 in memory! (computed 65.3 MiB so far)
25/05/05 21:16:15 WARN MemoryStore: Not enough space to cache rdd_6_8 in memory! (computed 65.1 MiB so far)
25/05/05 21:16:17 WARN MemoryStore: Not enough space to cache rdd_6_10 in memory! (computed 65.2 MiB so far)
25/05/05 21:16:18 WARN MemoryStore: Not enough space to cache rdd_6_12 in memory! (computed 64.9 MiB so far)
25/05/05 21:16:19 WARN Memor

+--------+--------------------+--------------------+
| book_id|         review_text|             user_id|
+--------+--------------------+--------------------+
|31290539|I'm getting so ex...|16c6b7c60483be209...|
|30788942|*Full Review* \n ...|16c6b7c60483be209...|
|11832081|Cute, quick read....|16c6b7c60483be209...|
|22393715|*Full Review Now*...|16c6b7c60483be209...|
|29070867|I have been debat...|16c6b7c60483be209...|
|29535971|Such a sweet stor...|16c6b7c60483be209...|
|20671912|I started this qu...|16c6b7c60483be209...|
|22779154|"With Every Breat...|16c6b7c60483be209...|
|27503614|A fun romantic no...|16c6b7c60483be209...|
| 6335685|I really enjoyed ...|16c6b7c60483be209...|
+--------+--------------------+--------------------+
only showing top 10 rows



                                                                                

In [46]:
print(review_df.count())



1101910


                                                                                

In [47]:
EMD_COL_NAME = "text_embedded_vector"


review_df_embedded = review_df \
    .withColumn(
        EMD_COL_NAME, get_embeddings_udf(review_df['review_text'])
    ) \
    .withColumn(
        "faiss_id", monotonically_increasing_id() 
    ).drop(review_df['review_text'])
review_df_embedded.show(10, truncate=False)

[Stage 22:>                                                         (0 + 1) / 1]

+--------+--------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [48]:

review_df_embedded.printSchema()

root
 |-- book_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- text_embedded_vector: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- faiss_id: long (nullable = false)



In [49]:
review_df_embedded.write.mode("overwrite").parquet(f"{data_path}/sampled_review_info_embedded")

                                                                                

In [None]:
spark.stop()