In [5]:
import os

import numpy as np
import pandas as pd
from pyspark.ml.recommendation import ALSModel
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.functions import col, lit, collect_list, struct,sqrt,avg
from pyspark.sql.types import FloatType, StructType, StructField, IntegerType


In [6]:
spark = SparkSession.builder \
    .appName("HybridRecommenderMaxPerformance") \
    .master("spark://spark-master:7077") \
    .config("spark.executor.memory", "6g") \
    .config("spark.driver.memory", "6g") \
    .config("spark.executor.cores", "4") \
    .config("spark.executor.instances", "1") \
    .config("spark.default.parallelism", "8") \
    .config("spark.sql.shuffle.partitions", "8") \
    .config("spark.jars", "jars/hadoop-aws-3.3.4.jar,jars/spark-sql-kafka-0-10_2.12-3.2.1.jar,jars/aws-java-sdk-bundle-1.12.262.jar,jars/delta-core_2.12-2.2.0.jar,jars/delta-storage-2.2.0.jar") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000") \
    .config("spark.hadoop.fs.s3a.access.key", "conbo123") \
    .config("spark.hadoop.fs.s3a.secret.key", "123conbo") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore") \
    .config("delta.enable-non-concurrent-writes", "true") \
    .config("spark.sql.warehouse.dir", "s3a://lakehouse/") \
    .getOrCreate()
sc = spark.sparkContext



25/05/17 18:23:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [7]:
CB = spark.read.format("delta").load("s3a://lakehouse/data/hehe123")
filtered_CB = CB.filter(CB["id"].isNotNull())

25/05/17 18:23:28 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
25/05/17 18:23:36 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

In [8]:
movie_vec_dict = {int(row.id): row.vecs for row in filtered_CB.select("id", "vecs").collect()}
b_movie_vec = sc.broadcast(movie_vec_dict)
ratings = spark.read.format("delta").load("s3a://lakehouse/silver/ratings")
ratings = ratings.select("userId", "movieId", "rating")
ratings = ratings.dropna(subset=['userId', 'movieId'])

ratings = ratings.withColumn('userId', ratings['userId'].cast('int')).\
withColumn('movieId', ratings['movieId'].cast('int')).withColumn('rating', ratings['rating'].cast('float'))
train, validation, test = ratings.randomSplit([0.7, 0.2, 0.1], seed=0)
test = test.limit(100000)

                                                                                

In [9]:
from pyspark.ml.recommendation import ALSModel
CF_model = ALSModel.load("s3a://lakehouse/CF/als_best_model")
PredcitCF = CF_model.transform(test)

In [10]:
def predict_rating(user_history, target_id):
    vec_dict = b_movie_vec.value
    target = vec_dict.get(target_id)
    if target is None: return 0.0
    num, den = 0.0, 0.0
    for mid, r in user_history:
        v = vec_dict.get(mid)
        if v is None: continue
        # cosine similarity
        sim = float(np.dot(target, v) / (np.linalg.norm(target)*np.linalg.norm(v))) if np.linalg.norm(target)*np.linalg.norm(v)>0 else 0.0
        num += sim * r
        den += builtins.abs(sim)
    return float(num/den) if den>0 else 0.0

In [11]:
# Thay vì lấy lịch sử từ ratings_in_cb (dựa trên test), ta lấy từ train
predict_udf = F.udf(predict_rating, FloatType())

# 1. ratings_in_cb từ test (giữ nguyên để đánh giá)
ratings_in_cb = (
    test
    .join(CB.select(F.col("id").alias("cb_id")), test.movieId == F.col("cb_id"), "inner")
    .select("userId", "movieId", "rating")
)

# 2. Tạo ratings_in_cb_train từ train để gom lịch sử user
ratings_in_cb_train = (
    train
    .join(CB.select(F.col("id").alias("cb_id")), train.movieId == F.col("cb_id"), "inner")
    .select("userId", "movieId", "rating")
)

# 3. Gom lịch sử train thành array per user (dùng từ ratings_in_cb_train)
train_hist = (
    ratings_in_cb_train
    .groupBy("userId")
    .agg(
        collect_list(
            struct("movieId","rating")
        ).alias("history")
    )
)

# 4. Kết hợp sample_ratings (test) với history train (giữ nguyên)
eval_df = (
    ratings_in_cb
    .join(train_hist, on="userId", how="inner")
)


# 5. Dự đoán content-based rating bằng UDF và thêm cột predicted_rating_CT
eval_df = (
    eval_df
    .withColumn("predicted_rating_CT", predict_udf("history", "movieId"))
    # Join với CF prediction
    .join(
        PredcitCF.select("userId", "movieId", "prediction").withColumnRenamed("prediction", "predicted_rating_CF"),
        on=["userId", "movieId"],
        how="inner"
    )
)

In [None]:
# Chỉ giữ lại các cột cần thiết
eval_df = eval_df.select("predicted_rating_CT", "predicted_rating_CF", "rating")

# Cache để tránh tính lại nhiều lần
eval_df.cache()
eval_df.count()  # Action để trigger cache

# Hàm tính RMSE cho mỗi alpha
def compute_rmse(alpha):
    hybrid_df = eval_df.withColumn(
        "predicted_hybrid",
        lit(alpha) * col("predicted_rating_CT") + (1 - alpha) * col("predicted_rating_CF")
    )
    rmse = hybrid_df.withColumn(
        "squared_error", (col("predicted_hybrid") - col("rating")) ** 2
    ).agg(sqrt(avg("squared_error")).alias("rmse")).collect()[0]["rmse"]
    return rmse

# Tìm best alpha
best_alpha = None
best_rmse = float("inf")
alpha_rmse_list = []

for alpha in np.arange(0.0, 1.05, 0.05):
    rmse = compute_rmse(alpha)
    alpha_rmse_list.append((alpha, rmse))
    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha

# In kết quả
print("Alpha - RMSE:")
for alpha, rmse in alpha_rmse_list:
    print(f"{alpha:.2f} -> {rmse:.4f}")

print(f"\n✅ Best alpha = {best_alpha:.2f} with RMSE = {best_rmse:.4f}")

