# NÊN SỬA LẠI INPUT LÀ USER VECTOR, OUTPUT LÀ CÁC ITEM TƯƠNG TỰ

In [2]:
import os
import sys
import pyspark

# Set PYSPARK_PYTHON to the current Python executable
os.environ["PYSPARK_PYTHON"] = sys.executable
print("PYSPARK_PYTHON:", os.environ["PYSPARK_PYTHON"])
print("PySpark version:", pyspark.__version__)


from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import when, col, lit, isnan, sum
from pyspark.ml.recommendation import ALS
import itertools
import math
import pandas as pd


spark = (
    SparkSession.builder
    .appName("LSH")
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "4g") \
    .getOrCreate()
)


PYSPARK_PYTHON: c:\utecode\bdp\aws-review23\.venv\Scripts\python.exe
PySpark version: 3.5.6


In [3]:
# Load data
item_factors = spark.read.parquet("embeddings/item_factors.parquet")
user_factors = spark.read.parquet("embeddings/user_factors.parquet")
item_index = spark.read.parquet("mappings/item_index.parquet")
user_index = spark.read.parquet("mappings/user_index.parquet")

# Show first 3 rows sorted by id
item_factors.show(3, truncate=False)
user_factors.show(3, truncate=False)
item_index.show(3, truncate=False)
user_index.show(3, truncate=False)

# Print schema
item_factors.printSchema()


+---+-------------------------------------------------------------------------------------------------------------------------------------------------+
|id |features                                                                                                                                         |
+---+-------------------------------------------------------------------------------------------------------------------------------------------------+
|0  |[4.9745788E-5, -1.2412119E-4, -1.6477374E-5, -9.531968E-5, -1.6796927E-5, 5.582382E-5, -5.2143587E-5, -8.3885425E-5, -7.837608E-6, -2.3614797E-5]|
|10 |[0.0037201846, -4.6003016E-4, -0.0030187091, 0.002095444, 0.0017704803, -0.001103599, -5.1323477E-5, -0.0028090628, 9.320244E-4, 0.001393206]    |
|20 |[-9.2256545E-5, 8.281473E-5, -1.6367284E-4, 5.301488E-4, -3.2991575E-5, -2.7330144E-4, 2.9878456E-5, 1.950615E-4, 1.4138107E-4, -3.117971E-4]    |
+---+-----------------------------------------------------------------------------------

In [4]:
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf, col

# --- Chuẩn bị vector dạng dense ---
to_vector_udf = udf(lambda x: Vectors.dense(x), VectorUDT())
item_vecs = item_factors.withColumn("features_vec", to_vector_udf("features"))
user_vecs = user_factors.withColumn("features_vec", to_vector_udf("features"))


item_vecs.show(3)
user_vecs.show(3)
print(item_vecs.count())
print(user_vecs.count())

item_vecs.printSchema()
user_vecs.printSchema()



+---+--------------------+--------------------+
| id|            features|        features_vec|
+---+--------------------+--------------------+
|  0|[4.9745788E-5, -1...|[4.97457876917906...|
| 10|[0.0037201846, -4...|[0.00372018455527...|
| 20|[-9.2256545E-5, 8...|[-9.2256545030977...|
+---+--------------------+--------------------+
only showing top 3 rows

+---+--------------------+--------------------+
| id|            features|        features_vec|
+---+--------------------+--------------------+
|  0|[-2.554666E-6, 9....|[-2.5546660253894...|
| 10|[4.8176673E-5, -9...|[4.81766728626098...|
| 20|[-1.1695193E-4, 2...|[-1.1695193097693...|
+---+--------------------+--------------------+
only showing top 3 rows

78947
182663
root
 |-- id: integer (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- features_vec: vector (nullable = true)

root
 |-- id: integer (nullable = true)
 |-- features: array (nullable = true)
 |    |-- eleme

In [5]:
# --- Gắn asin thật vào embedding ---
item_vecs = item_vecs.join(item_index, item_vecs.id == item_index.itemIndex, "inner").select("id", "parent_asin", "features_vec")
user_vecs = user_vecs.join(user_index, user_vecs.id == user_index.userIndex, "inner").select("id", "user_id", "features_vec")




In [6]:
item_vecs.show(3)
user_vecs.show(3, truncate=False)


+---+-----------+--------------------+
| id|parent_asin|        features_vec|
+---+-----------+--------------------+
|  0| 0307449440|[4.97457876917906...|
| 10| 048682862X|[0.00372018455527...|
| 20| 0764350226|[-9.2256545030977...|
+---+-----------+--------------------+
only showing top 3 rows

+---+----------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |user_id                     |features_vec                                                                                                                                                                                                                 |
+---+----------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------

## Thêm thông tin sản phẩm vào items

In [7]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType

# Read each line as plain text
raw_df = spark.read.text("meta_Arts_Crafts_and_Sewing.jsonl.gz")

# Define schema — skip problematic keys
# Keep only relevant top-level fields
schema = StructType([
    StructField("parent_asin", StringType()),
    StructField("title", StringType()),
    StructField("main_category", StringType()),
    StructField("store", StringType()),
])


# Parse JSON manually
products_df = raw_df.select(from_json(col("value"), schema).alias("data")).select("data.*")
products_df.printSchema()


root
 |-- parent_asin: string (nullable = true)
 |-- title: string (nullable = true)
 |-- main_category: string (nullable = true)
 |-- store: string (nullable = true)



In [8]:

item_vecs_enriched = (
    item_vecs
    .join(products_df.select("parent_asin", "title", "main_category", "store"), on="parent_asin", how="left")
)

In [9]:
user_vecs.cache()
item_vecs_enriched.cache()

DataFrame[parent_asin: string, id: int, features_vec: vector, title: string, main_category: string, store: string]

In [10]:
item_vecs_enriched.show(3, truncate=False)
print(f"Number of rows in item_vecs_enriched: {item_vecs_enriched.count()}")

+-----------+---+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+-----------------------------------------------+
|parent_asin|id |features_vec                                                                                                                                                                                                                   |title                                                                                                                                                                       |main_category|store                                          |
+-----------+---+-----------------------------

In [11]:
# item_vecs có cột: id, parent_asin, features_vec
brp = BucketedRandomProjectionLSH(
    inputCol="features_vec",
    outputCol="hashes",
    bucketLength=2,
    numHashTables=3
)

# Fit model
lsh_model = brp.fit(item_vecs_enriched)


In [12]:

# Transform item vectors để tạo cột hashes
item_hashed = lsh_model.transform(item_vecs_enriched)
item_hashed.show(3, truncate=False)
item_hashed.count()

+-----------+---+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+-----------------------------------------------+------------------------+
|parent_asin|id |features_vec                                                                                                                                                                                                                   |title                                                                                                                                                                       |main_category|store                                          |hashes                

78947

In [13]:
def get_vector_by_id(df, id_col, target_id):
    """
    Get the features vector for a given user_id or item_id from a PySpark DataFrame.

    Args:
        df (DataFrame): PySpark DataFrame containing 'features_vec' column.
        id_col (str): Column name for ID ('user_id' or 'parent_asin').
        target_id (str): The ID value to look up.

    Returns:
        vector (pyspark.ml.linalg.Vector) or None if ID not found
    """
    row = df.filter(df[id_col] == target_id).select("features_vec").first()
    
    if row:
        return row["features_vec"]
    else:
        print(f"{id_col} '{target_id}' not found.")
        return None



In [14]:
# For a user
user_vector = get_vector_by_id(user_vecs, "user_id", "AE22236AFRRSMQIKGG7TPTB75QEA")
print(user_vector)

# For an item
item_vector = get_vector_by_id(item_vecs, "parent_asin", "0307449440")
print(item_vector)


[-2.5546660253894515e-06,9.937423783412669e-06,4.635908680938883e-06,-2.1633726646541618e-05,-1.0289136298524681e-05,1.0973762982757762e-06,3.737606220965972e-06,1.9055776647292078e-05,-8.370523573830724e-06,-4.855136285186745e-05]
[4.974578769179061e-05,-0.00012412118667270988,-1.6477373719681054e-05,-9.531967953080311e-05,-1.6796926502138376e-05,5.5823820730438456e-05,-5.214358679950237e-05,-8.388542482862249e-05,-7.837607881810982e-06,-2.3614797100890428e-05]


In [15]:
# tìm 5 items gần user_vector
k = 5
nearest_user_items = lsh_model.approxNearestNeighbors(item_hashed, user_vector, numNearestNeighbors=k, distCol="EuclideanDistance")
nearest_item_items = lsh_model.approxNearestNeighbors(item_hashed, item_vector, numNearestNeighbors=k, distCol="EuclideanDistance")


In [16]:
nearest_user_items.show(k)


+-----------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|parent_asin|   id|        features_vec|               title|       main_category|               store|              hashes|   EuclideanDistance|
+-----------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| B073J5S95M|35663|[-3.5969478631159...|Hotop 120 Set Lea...|Arts, Crafts & Se...|               Hotop|[[0.0], [0.0], [0...|2.553734306223562...|
| B07Y6QT9JS|55646|[-1.0403470696473...|19 Gauge Cotton C...|Industrial & Scie...|Humboldt Haberdas...|[[0.0], [0.0], [0...| 2.62297338951367E-5|
| 1892214997|  400|[-1.4254364032240...|How To Carve Leather|               Books|Al Stohlman (Author)|[[0.0], [0.0], [0...|2.847312447239994...|
| B07CM9RTZ3|42016|[-9.3778162408852...|Norberg & Linden ...|     Office Products|    Norberg & Linden|[[-1.0], [0.0], [...|

In [17]:
nearest_item_items.show(k)

+-----------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|parent_asin|   id|        features_vec|               title|       main_category|               store|              hashes|   EuclideanDistance|
+-----------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| 0307449440|    0|[4.97457876917906...|Socks from the To...|               Books|Wendy D. Johnson ...|[[0.0], [0.0], [-...|                 0.0|
| B008ROX1E8|13748|[2.64627451542764...|Metal Earth Light...|        Toys & Games|         Metal Earth|[[0.0], [0.0], [-...| 7.90120134715847E-5|
| B0BK7T8C2M|85002|[2.13108633033698...|Ergonomic Crochet...|Arts, Crafts & Se...|              Lewhoo|[[0.0], [0.0], [-...|9.429850156483452E-5|
| B01HPRVDK6|29386|[1.14014619612134...|SAKURA Cray-Pas S...|Arts, Crafts & Se...|              Sakura|[[0.0], [0.0], [-...|

In [20]:
import time

k = 5

# --- đo thời gian tìm items gần user_vector ---
start = time.time()
nearest_user_items = lsh_model.approxNearestNeighbors(
    item_hashed, user_vector, numNearestNeighbors=k, distCol="EuclideanDistance"
)
nearest_user_items.show(k)
end = time.time()
print(f"Time to find {k} nearest items for user_vector: {end - start:.4f} seconds")


# --- đo thời gian tìm items gần item_vector ---
start = time.time()
nearest_item_items = lsh_model.approxNearestNeighbors(
    item_hashed, item_vector, numNearestNeighbors=k, distCol="EuclideanDistance"
)
nearest_item_items.show(k)
end = time.time()
print(f"Time to find {k} nearest items for item_vector: {end - start:.4f} seconds")



+-----------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|parent_asin|   id|        features_vec|               title|       main_category|               store|              hashes|   EuclideanDistance|
+-----------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| B073J5S95M|35663|[-3.5969478631159...|Hotop 120 Set Lea...|Arts, Crafts & Se...|               Hotop|[[0.0], [0.0], [0...|2.553734306223562...|
| B07Y6QT9JS|55646|[-1.0403470696473...|19 Gauge Cotton C...|Industrial & Scie...|Humboldt Haberdas...|[[0.0], [0.0], [0...| 2.62297338951367E-5|
| 1892214997|  400|[-1.4254364032240...|How To Carve Leather|               Books|Al Stohlman (Author)|[[0.0], [0.0], [0...|2.847312447239994...|
| B07CM9RTZ3|42016|[-9.3778162408852...|Norberg & Linden ...|     Office Products|    Norberg & Linden|[[-1.0], [0.0], [...|