In [9]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType, StructType, StructField, IntegerType
from pyspark.ml.recommendation import ALSModel
import numpy as np
import pandas as pd
from pyspark.sql.functions import col, lit, collect_list, struct
from pyspark.sql import types as T

spark = SparkSession.builder \
    .appName("MinIO with Delta Lakeeeeeeeeeeeee") \
    .config("spark.jars", "jars/hadoop-aws-3.3.4.jar,jars/spark-sql-kafka-0-10_2.12-3.2.1.jar,jars/aws-java-sdk-bundle-1.12.262.jar,jars/delta-core_2.12-2.2.0.jar,jars/delta-storage-2.2.0.jar") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000") \
    .config("spark.hadoop.fs.s3a.access.key", "conbo123") \
    .config("spark.hadoop.fs.s3a.secret.key", "123conbo") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore") \
    .config("delta.enable-non-concurrent-writes", "true") \
    .config("spark.sql.warehouse.dir", "s3a://lakehouse/") \
    .getOrCreate()
sc = spark.sparkContext

# 1. Chuẩn bị dữ liệu và broadcast movie vectors
CB = spark.read.format("delta").load("s3a://lakehouse/data/all_movies_delta")
filtered_CB = CB.filter(CB["id"].isNotNull())
movie_vec_dict = {int(row.id): row.vecs for row in filtered_CB.select("id", "vecs").collect()}
b_movie_vec = sc.broadcast(movie_vec_dict)

ratings = spark.read.format("delta").load("s3a://lakehouse/silver/ratings")
ratings = ratings.select("userId", "movieId", "rating")
ratings = ratings.withColumn('userId', ratings['userId'].cast('int'))\
                 .withColumn('movieId', ratings['movieId'].cast('int'))\
                 .withColumn('rating', ratings['rating'].cast('float'))
train, validation, test = ratings.randomSplit([0.8, 0.1, 0.1], seed=0)
# 2. Tải mô hình ALS và dự đoán CF
CF_model = ALSModel.load("s3a://lakehouse/CF/als_best_model")
PredcitCF = CF_model.transform(ratings)
test = test.limit(20000)
# 3. Lọc ratings chỉ giữ phim có trong CB
ratings_in_cb = test.join(CB.select(F.col("id").alias("cb_id")), 
                              test.movieId == F.col("cb_id"), "inner")\
                       .select("userId", "movieId", "rating")
# 4. Gom lịch sử người dùng thành array

ratings_in_cb.count()

8735

In [10]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType, StructType, StructField, IntegerType
from pyspark.ml.recommendation import ALSModel
import numpy as np
import pandas as pd
from pyspark.sql.functions import col, lit, collect_list, struct,sqrt,avg
from pyspark.sql import types as T
import builtins

In [11]:
spark = SparkSession.builder \
    .appName("MinIO with Delta Lakeeeeeeeeeeeee") \
    .config("spark.jars", "jars/hadoop-aws-3.3.4.jar,jars/spark-sql-kafka-0-10_2.12-3.2.1.jar,jars/aws-java-sdk-bundle-1.12.262.jar,jars/delta-core_2.12-2.2.0.jar,jars/delta-storage-2.2.0.jar") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000") \
    .config("spark.hadoop.fs.s3a.access.key", "conbo123") \
    .config("spark.hadoop.fs.s3a.secret.key", "123conbo") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore") \
    .config("delta.enable-non-concurrent-writes", "true") \
    .config("spark.sql.warehouse.dir", "s3a://lakehouse/") \
    .getOrCreate()
sc = spark.sparkContext

In [12]:
CB = spark.read.format("delta").load("s3a://lakehouse/data/haha")
filtered_CB = CB.filter(CB["id"].isNotNull())

In [13]:
movie_vec_dict = {int(row.id): row.vecs for row in filtered_CB.select("id", "vecs").collect()}
b_movie_vec = sc.broadcast(movie_vec_dict)
ratings = spark.read.format("delta").load("s3a://lakehouse/silver/ratings")
ratings = ratings.select("userId", "movieId", "rating")
ratings = ratings.withColumn('userId', ratings['userId'].cast('int')).\
withColumn('movieId', ratings['movieId'].cast('int')).withColumn('rating', ratings['rating'].cast('float'))
train, validation, test = ratings.randomSplit([0.8, 0.1, 0.1], seed=0)


In [14]:
from pyspark.ml.recommendation import ALSModel
CF_model = ALSModel.load("s3a://lakehouse/CF/als_best_model")
PredcitCF = CF_model.transform(ratings)

In [15]:
def predict_rating(user_history, target_id):
    vec_dict = b_movie_vec.value
    target = vec_dict.get(target_id)
    if target is None: return 0.0
    num, den = 0.0, 0.0
    for mid, r in user_history:
        v = vec_dict.get(mid)
        if v is None: continue
        # cosine similarity
        sim = float(np.dot(target, v) / (np.linalg.norm(target)*np.linalg.norm(v))) if np.linalg.norm(target)*np.linalg.norm(v)>0 else 0.0
        num += sim * r
        den += builtins.abs(sim)
    return float(num/den) if den>0 else 0.0

In [16]:
predict_udf = F.udf(predict_rating, FloatType())
ratings_in_cb = (
    test
    .join(CB.select(F.col("id").alias("cb_id")), test.movieId == F.col("cb_id"), "inner")
    .select("userId", "movieId", "rating")

)
# 4. Gom lịch sử train thành array per user
train_hist = (
    ratings_in_cb
    .groupBy("userId")
    .agg(
        collect_list(
            struct("movieId","rating")
        ).alias("history")
    )
)

# 5. Kết hợp sample_ratings (test) với history train
eval_df = (
    ratings_in_cb
    .join(train_hist, on="userId", how="left")
)

# 6. Dự đoán và tính bình phương sai số
eval_df = (
    eval_df
    .withColumn("predicted_rating_CT",
                predict_udf("history", "movieId"))
    .join(
            PredcitCF.select("userId", "movieId", "prediction").withColumnRenamed("prediction", "predicted_rating_CF"),
            on=["userId", "movieId"],
            how="left"
        )
)

In [17]:
# Chỉ giữ lại các cột cần thiết
eval_df = eval_df.select("predicted_rating_CT", "predicted_rating_CF", "rating")

# Cache để tránh tính lại nhiều lần
eval_df.cache()
eval_df.count()  # Action để trigger cache

# Hàm tính RMSE cho mỗi alpha
def compute_rmse(alpha):
    hybrid_df = eval_df.withColumn(
        "predicted_hybrid",
        lit(alpha) * col("predicted_rating_CT") + (1 - alpha) * col("predicted_rating_CF")
    )
    rmse = hybrid_df.withColumn(
        "squared_error", (col("predicted_hybrid") - col("rating")) ** 2
    ).agg(sqrt(avg("squared_error")).alias("rmse")).collect()[0]["rmse"]
    return rmse

# Tìm best alpha
best_alpha = None
best_rmse = float("inf")
alpha_rmse_list = []

for alpha in np.arange(0.0, 1.05, 0.05):
    rmse = compute_rmse(alpha)
    alpha_rmse_list.append((alpha, rmse))
    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha

# In kết quả
print("Alpha - RMSE:")
for alpha, rmse in alpha_rmse_list:
    print(f"{alpha:.2f} -> {rmse:.4f}")

print(f"\n✅ Best alpha = {best_alpha:.2f} with RMSE = {best_rmse:.4f}")

Alpha - RMSE:
0.00 -> 0.7403
0.05 -> 0.7365
0.10 -> 0.7338
0.15 -> 0.7322
0.20 -> 0.7317
0.25 -> 0.7324
0.30 -> 0.7342
0.35 -> 0.7371
0.40 -> 0.7411
0.45 -> 0.7462
0.50 -> 0.7524
0.55 -> 0.7596
0.60 -> 0.7678
0.65 -> 0.7770
0.70 -> 0.7871
0.75 -> 0.7981
0.80 -> 0.8100
0.85 -> 0.8228
0.90 -> 0.8363
0.95 -> 0.8506
1.00 -> 0.8656

✅ Best alpha = 0.20 with RMSE = 0.7317
