In [18]:
from pyspark.sql.functions import col, abs, sum as spark_sum, pow, sqrt, udf, expr
from pyspark.sql.types import FloatType
import numpy as np
from pyspark.sql import functions as F
import builtins
from pyspark.sql.window import Window
from pyspark.sql.functions import rand, struct, collect_list, col, sqrt
from pyspark.sql import SparkSession

In [19]:
spark = SparkSession.builder \
    .appName("MinIO with Delta Lakeeeeeeeeeeeee") \
    .config("spark.jars", "jars/hadoop-aws-3.3.4.jar,jars/spark-sql-kafka-0-10_2.12-3.2.1.jar,jars/aws-java-sdk-bundle-1.12.262.jar,jars/delta-core_2.12-2.2.0.jar,jars/delta-storage-2.2.0.jar") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000") \
    .config("spark.hadoop.fs.s3a.access.key", "conbo123") \
    .config("spark.hadoop.fs.s3a.secret.key", "123conbo") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore") \
    .config("delta.enable-non-concurrent-writes", "true") \
    .config("spark.sql.warehouse.dir", "s3a://lakehouse/") \
    .getOrCreate()

In [20]:
sc = spark.sparkContext

In [21]:
CB = spark.read.format("delta").load("s3a://lakehouse/data/all_movies_delta_IDF")
filtered_CB = CB.filter(CB["id"].isNotNull())

In [22]:
movie_vec_dict = {int(row.id): row.vecs for row in filtered_CB.select("id", "vecs").collect()}
b_movie_vec = sc.broadcast(movie_vec_dict)
ratings = spark.read.format("delta").load("s3a://lakehouse/silver/ratings")
ratings = ratings.select("userId", "movieId", "rating")
ratings = ratings.withColumn('userId', ratings['userId'].cast('int')).\
withColumn('movieId', ratings['movieId'].cast('int')).withColumn('rating', ratings['rating'].cast('float'))
train, validation, test = ratings.randomSplit([0.8, 0.1, 0.1], seed=0)
test = test.limit(500000)

In [23]:
user_hist = (test
             .groupBy("userId")
             .agg(F.collect_list(F.struct("movieId","rating")).alias("history")))

In [24]:
def predict_rating(user_history, target_id):
    vec_dict = b_movie_vec.value
    target = vec_dict.get(target_id)
    if target is None: return 0.0
    num, den = 0.0, 0.0
    for mid, r in user_history:
        v = vec_dict.get(mid)
        if v is None: continue
        # cosine similarity
        sim = float(np.dot(target, v) / (np.linalg.norm(target)*np.linalg.norm(v))) if np.linalg.norm(target)*np.linalg.norm(v)>0 else 0.0
        num += sim * r
        den += builtins.abs(sim)
    return float(num/den) if den>0 else 0.0

predict_udf = F.udf(predict_rating, FloatType())

In [25]:
ratings_in_cb = (
    test
    .join(CB.select(F.col("id").alias("cb_id")), test.movieId == F.col("cb_id"), "inner")
    .select("userId", "movieId", "rating")
    .cache()  # Cache dữ liệu để sử dụng lại
)

# Tối ưu hóa việc phân vùng trước khi groupBy
train_hist = (
    ratings_in_cb
    .repartition("userId")  # Đảm bảo phân vùng hợp lý khi groupBy theo userId
    .groupBy("userId")
    .agg(
        collect_list(struct("movieId", "rating")).alias("history")
    )
)

# Kết hợp sample_ratings với lịch sử train
eval_df = (
    ratings_in_cb
    .join(train_hist, on="userId", how="left")
)

# Dự đoán và tính bình phương sai số, tối ưu hoá phép toán dự đoán
eval_df = (
    eval_df
    .withColumn("predicted_rating", predict_udf("history", "movieId"))
    .withColumn("squared_error", (col("predicted_rating") - col("rating"))**2)
)

# Tính RMSE trực tiếp trong bước tính toán
rmse = eval_df.agg(
    sqrt(F.avg("squared_error")).alias("rmse")
).collect()[0]["rmse"]

print(f"RMSE trên ratings = {rmse:.4f}")

RMSE trên ratings = 0.2904


In [77]:
predict_rating(eval_df.select('history').take(1),96821)

0.0

In [79]:
eval_df.select('history').take(1)

[Row(history=[Row(movieId=96821, rating=5.0), Row(movieId=2762, rating=4.5), Row(movieId=147, rating=4.5), Row(movieId=2959, rating=4.0), Row(movieId=858, rating=5.0), Row(movieId=1968, rating=4.0), Row(movieId=1246, rating=5.0), Row(movieId=54503, rating=3.5), Row(movieId=4226, rating=4.0), Row(movieId=58559, rating=4.0), Row(movieId=110, rating=1.0)])]

In [82]:
vv = eval_df.select('history').take(1)
vv

[Row(history=[Row(movieId=96821, rating=5.0), Row(movieId=2762, rating=4.5), Row(movieId=147, rating=4.5), Row(movieId=2959, rating=4.0), Row(movieId=858, rating=5.0), Row(movieId=1968, rating=4.0), Row(movieId=1246, rating=5.0), Row(movieId=54503, rating=3.5), Row(movieId=4226, rating=4.0), Row(movieId=58559, rating=4.0), Row(movieId=110, rating=1.0)])]

In [109]:
vec_dict = b_movie_vec.value

target = vec_dict.get(147)
print(target)
if target is None: re = 0.0
num, den = 0.0, 0.0
for mid, r in vv[0].history:
    v = vec_dict.get(mid)
    if v is None: continue
    # cosine similarity
    sim = float(np.dot(target, v) / (np.linalg.norm(target)*np.linalg.norm(v))) if np.linalg.norm(target)*np.linalg.norm(v)>0 else 0.0
    num += sim * r
    den += builtins.abs(sim)
C =  float(num/den) if den>0 else 0.0

[-0.004987547054256806, -0.007048711687435999, -0.0020082383895950284, -0.04742521244813414, 0.0032200708382708184, 0.026919821441611824, 0.0015610888667117037, -0.01393796325496891, 0.0062209684401750565, -0.009778990882777554, -0.039882565026774126, -0.00022151310900774072, -0.03062977734953165, 0.021649353151374003, 0.02263843553031192, -0.011469860489288454, 0.01451953437508029, -0.04132008873035802, 0.029998779406442362, 0.05683883016600328, -0.011933329282328486, -0.01909188923480756, -0.02031881389934022, -0.01670564037254628, 0.03186120921471978, -0.04048683603896814, 0.027679304010234773, 0.0042531426776857935, -0.014645636629532366, -0.012813672560321935, 0.030011408551431754, -0.01661553411908886, -0.01982816686744199, -0.02952092428527334, 0.03528837217976723, 0.020873948397553143, -0.0005107921931673499, 3.884684102719321e-05, 0.012430900933823603, 0.028060641495839637, -0.027300839017912307, -0.026436581385924536, 0.010153024210868512, -0.025497245040776974, -0.0165084844

In [111]:
C

3.936687137226912

In [68]:
ratings_in_cb.filter(ratings_in_cb["userId"] == 1).show()


+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|  96821|   5.0|
|     1|   2762|   4.5|
|     1|    147|   4.5|
|     1|   2959|   4.0|
|     1|    858|   5.0|
|     1|   1968|   4.0|
|     1|   1246|   5.0|
|     1|  54503|   3.5|
|     1|   4226|   4.0|
|     1|  58559|   4.0|
|     1|    110|   1.0|
+------+-------+------+



In [52]:
ratings_in_cb.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|  96821|   5.0|
|     1|   2762|   4.5|
|     1|    147|   4.5|
|     1|   2959|   4.0|
|     1|    858|   5.0|
|     1|   1968|   4.0|
|     1|   1246|   5.0|
|     1|  54503|   3.5|
|     1|   4226|   4.0|
|     1|  58559|   4.0|
|     1|    110|   1.0|
+------+-------+------+



In [59]:
print(calculate_weighted_sim_for_user_movie(1,58559))

4.011539388193611


In [32]:
def cosine(v1, v2):
    import numpy as np
    v1, v2 = np.array(v1), np.array(v2)
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return float(np.dot(v1, v2) / denom) if denom > 0 else 0.0

def getvec(id):
    row = CB.filter(CB['id'] == id).select("vecs").head()
    return row["vecs"] if row else None
def calculate_weighted_sim_for_user_movie(user_id, movie_id):
    # Lấy các phim mà user đã rating
    filterRating = ratings.filter(ratings["userId"] == user_id).collect()

    target_vec = getvec(movie_id)  # vector của phim cần dự đoán

    numerator = 0.0
    denominator = 0.0

    for row in filterRating:
        rated_movie_id = row["movieId"]
        rating = row["rating"]

        rated_vec = getvec(rated_movie_id)
        if rated_vec is None or target_vec is None:
            continue

        sim = cosine(target_vec, rated_vec)

        numerator += sim * rating
        denominator += builtins.abs(sim)

    predicted_rating = numerator / denominator if denominator > 0 else 0.0
    return predicted_rating