In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType, StructType, StructField, IntegerType
from pyspark.ml.recommendation import ALSModel
import numpy as np
import pandas as pd
from pyspark.sql.functions import col, lit, collect_list, struct,sqrt,avg
from pyspark.sql import types as T
import builtins

In [2]:
spark = SparkSession.builder \
    .appName("HybridRecommenderMaxPerformance") \
    .master("spark://spark-master:7077") \
    .config("spark.executor.memory", "13g") \         # Dành phần lớn RAM cho executor
    .config("spark.driver.memory", "2g") \            # Dành một phần cho driver
    .config("spark.executor.cores", "4") \            # Dùng toàn bộ 4 cores
    .config("spark.executor.instances", "1") \        # Dùng 1 executor (vì có 1 worker)
    .config("spark.default.parallelism", "8") \       # Gợi ý phân chia task (>= cores * 2)
    .config("spark.sql.shuffle.partitions", "8") \ 
    .config("spark.jars", "jars/hadoop-aws-3.3.4.jar,jars/spark-sql-kafka-0-10_2.12-3.2.1.jar,jars/aws-java-sdk-bundle-1.12.262.jar,jars/delta-core_2.12-2.2.0.jar,jars/delta-storage-2.2.0.jar") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000") \
    .config("spark.hadoop.fs.s3a.access.key", "conbo123") \
    .config("spark.hadoop.fs.s3a.secret.key", "123conbo") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore") \
    .config("delta.enable-non-concurrent-writes", "true") \
    .config("spark.sql.warehouse.dir", "s3a://lakehouse/") \
    .getOrCreate()
sc = spark.sparkContext

In [3]:
CB = spark.read.format("delta").load("s3a://lakehouse/data/hehe123")
filtered_CB = CB.filter(CB["id"].isNotNull())

In [4]:
# movie_vec_dict = {int(row.id): row.vecs for row in filtered_CB.select("id", "vecs").collect()}
# b_movie_vec = sc.broadcast(movie_vec_dict)
# ratings = spark.read.format("delta").load("s3a://lakehouse/silver/ratings")
# ratings = ratings.select("userId", "movieId", "rating")
# ratings = ratings.withColumn('userId', ratings['userId'].cast('int')).\
# withColumn('movieId', ratings['movieId'].cast('int')).withColumn('rating', ratings['rating'].cast('float'))
# train, validation, test = ratings.randomSplit([0.8, 0.1, 0.1], seed=0)
movie_vec_dict = {int(row.id): row.vecs for row in filtered_CB.select("id", "vecs").collect()}
b_movie_vec = sc.broadcast(movie_vec_dict)
ratings = spark.read.format("delta").load("s3a://lakehouse/silver/ratings")
ratings = ratings.dropna(subset=['userId', 'movieId'])
ratings = ratings.select("userId", "movieId", "rating")
ratings = ratings.withColumn('userId', ratings['userId'].cast('int')).\
withColumn('movieId', ratings['movieId'].cast('int')).withColumn('rating', ratings['rating'].cast('float'))
train, validation, test = ratings.randomSplit([0.75, 0.15, 0.15], seed=0)

In [5]:
test = test.limit(50000)
train = train.limit(50000)

In [6]:
from pyspark.ml.recommendation import ALSModel
CF_model = ALSModel.load("s3a://lakehouse/CF/als_best_model")
PredcitCF = CF_model.transform(ratings)

In [7]:
def predict_rating(user_history, target_id):
    vec_dict = b_movie_vec.value
    target = vec_dict.get(target_id)
    if target is None: return 0.0
    num, den = 0.0, 0.0
    for mid, r in user_history:
        v = vec_dict.get(mid)
        if v is None: continue
        # cosine similarity
        sim = float(np.dot(target, v) / (np.linalg.norm(target)*np.linalg.norm(v))) if np.linalg.norm(target)*np.linalg.norm(v)>0 else 0.0
        num += sim * r
        den += builtins.abs(sim)
    return float(num/den) if den>0 else 0.0

In [8]:
predict_udf = F.udf(predict_rating, FloatType())
ratings_in_cb = (
    train
    .join(CB.select(F.col("id").alias("cb_id")), train.movieId == F.col("cb_id"), "inner")
    .select("userId", "movieId", "rating")
)
# 4. Gom lịch sử train thành array per user
train_hist = (
    ratings_in_cb
    .groupBy("userId")
    .agg(
        collect_list(
            struct("movieId","rating")
        ).alias("history")
    )
)

# 5. Kết hợp sample_ratings (test) với history train
eval_df = (
    ratings_in_cb
    .join(train_hist, on="userId", how="left")
)

# 6. Dự đoán và tính bình phương sai số
eval_df = (
    eval_df
    .withColumn("predicted_rating_CT",
                predict_udf("history", "movieId"))
    .join(
            PredcitCF.select("userId", "movieId", "prediction").withColumnRenamed("prediction", "predicted_rating_CF"),
            on=["userId", "movieId"],
            how="inner"
        )
)

In [9]:
eval_df.count()

48230

In [10]:
# Chỉ giữ lại các cột cần thiết
eval_df = eval_df.select("predicted_rating_CT", "predicted_rating_CF", "rating")

# Cache để tránh tính lại nhiều lần
eval_df.cache()
eval_df.count()  # Action để trigger cache

# Hàm tính RMSE cho mỗi alpha
def compute_rmse(alpha):
    hybrid_df = eval_df.withColumn(
        "predicted_hybrid",
        lit(alpha) * col("predicted_rating_CT") + (1 - alpha) * col("predicted_rating_CF")
    )
    rmse = hybrid_df.withColumn(
        "squared_error", (col("predicted_hybrid") - col("rating")) ** 2
    ).agg(sqrt(avg("squared_error")).alias("rmse")).collect()[0]["rmse"]
    return rmse

# Tìm best alpha
best_alpha = None
best_rmse = float("inf")
alpha_rmse_list = []

for alpha in np.arange(0.0, 1.05, 0.05):
    rmse = compute_rmse(alpha)
    alpha_rmse_list.append((alpha, rmse))
    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha

# In kết quả
print("Alpha - RMSE:")
for alpha, rmse in alpha_rmse_list:
    print(f"{alpha:.2f} -> {rmse:.4f}")

print(f"\n✅ Best alpha = {best_alpha:.2f} with RMSE = {best_rmse:.4f}")

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 60638)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.10/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.10/socketserver.py", line 747, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 281, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 253, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 257, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/

Py4JError: An error occurred while calling o191.count