In [1]:
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.types import StructType, StructField, BooleanType, StringType, IntegerType, DateType, FloatType,DoubleType,ArrayType,LongType
import logging
import sys
import traceback
import ast
import json
import os
import sys
import traceback
import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr,when,to_date ,udf, concat_ws,posexplode, from_json
from pyspark.sql.types import StructType, StructField, BooleanType, StringType, IntegerType, DateType, FloatType,DoubleType
from pyspark.sql import functions as F

In [2]:



# Build (or reuse) the Spark session used by the rest of the notebook.
# It is configured for Delta Lake tables stored behind an S3-compatible
# endpoint (MinIO) accessed through the s3a:// filesystem.
#
# The MinIO credentials are read from the MINIO_ACCESS_KEY / MINIO_SECRET_KEY
# environment variables when they are set. The previous hard-coded values are
# kept only as fallbacks so the notebook keeps working unchanged in the
# existing environment.
# NOTE(review): drop the fallback literals once the variables are provisioned;
# secrets should not live in a shared notebook.
_minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "conbo123")
_minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "123conbo")

spark = (
    SparkSession.builder
    .appName("MinIO with Delta Lake")
    # Driver / executor sizing.
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "5g")
    .config("spark.executor.cores", "3")
    # Dynamic allocation is enabled but bounded to one executor (min = max = initial = 1).
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "1")
    .config("spark.dynamicAllocation.maxExecutors", "1")
    .config("spark.dynamicAllocation.initialExecutors", "1")
    .config("spark.sql.shuffle.partitions", "12")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "1g")
    # Jars are resolved relative to the working directory of the kernel.
    .config("spark.jars", "jars/hadoop-aws-3.3.4.jar,jars/spark-sql-kafka-0-10_2.12-3.2.1.jar,jars/aws-java-sdk-bundle-1.12.262.jar,jars/delta-core_2.12-2.2.0.jar,jars/delta-storage-2.2.0.jar")
    # S3A (MinIO) connection settings.
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", _minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", _minio_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Delta Lake integration.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
    .config("delta.enable-non-concurrent-writes", "true")
    .config('spark.sql.warehouse.dir', "s3a://lakehouse/")
    .config("spark.sql.pivotMaxValues", "100000")
    .getOrCreate()
)

In [3]:
from pyspark.sql import functions as F
from pyspark.ml.recommendation import ALS
import numpy as np
import time
import numpy as np
from pyspark.sql.functions import col, mean, lit, udf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.ml.linalg import Vectors
from pyspark.ml.recommendation import ALSModel


In [4]:
# Read the source tables from Delta.
# Movie dimension: keep only the id (exposed as "movieId") and the title.
movies = (
    spark.read.format("delta")
    .load("s3a://lakehouse/gold/dim_movie")
    .select("id", "title")
    .withColumnRenamed("id", "movieId")
)

# Ratings: keep only the three columns used by the recommender.
ratings = (
    spark.read.format("delta")
    .load("s3a://lakehouse/silver/ratings")
    .select("userId", "movieId", "rating")
)
# ratings = ratings.limit(1000000)

# Cast the columns to the types shown by printSchema below:
# integer ids and a float rating.
df = ratings.select(
    F.col("userId").cast("int").alias("userId"),
    F.col("movieId").cast("int").alias("movieId"),
    F.col("rating").cast("float").alias("rating"),
)
df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: float (nullable = true)



In [6]:

# RMSE helper
def RMSE(predictions):
    """Return the root-mean-squared error of a predictions DataFrame.

    Args:
        predictions: Spark DataFrame with numeric "rating" and "prediction"
            columns.

    Returns:
        The square root of the mean of (rating - prediction)^2, taken from the
        single row produced by the aggregation.
    """
    squared_error = (F.col("rating") - F.col("prediction")) ** 2
    scored = predictions.withColumn("squared_error", squared_error)
    result_row = scored.agg(F.sqrt(F.mean("squared_error"))).collect()[0]
    return result_row[0]

In [10]:
# Notebook-level state shared by the helper functions through `global`:
#   df_merge - reassigned by get_cb_recommendations on each call
#   df_list  - Delta DataFrame loaded by get_vecs
#   flag2    - set to True by get_vecs once the vectors have been loaded
#   flag1    - not referenced by any cell visible here
df_merge = df_list = None
flag1 = flag2 = False

In [11]:
def get_vecs():
    """Return the list of (movie id, dense feature vector) pairs.

    On the first call the Delta table at s3a://lakehouse/data/all_movies_delta
    is read, collected to the driver and converted; the result is stored in the
    global `all_vecs` and `flag2` is set so later calls return the cached list
    without reading again.

    Returns:
        list of (row.id, Vectors.dense(row.vecs)) tuples, one per table row.
    """
    global df_list, all_vecs, flag2

    # Already loaded: hand back the cached list.
    if flag2:
        return all_vecs

    df_list = spark.read.format("delta").load("s3a://lakehouse/data/all_movies_delta")
    collected_rows = df_list.collect()

    pairs = []
    for record in collected_rows:
        pairs.append((record.id, Vectors.dense(record.vecs)))

    all_vecs = pairs
    flag2 = True
    return all_vecs

In [12]:
# Cosine similarity (content-based)
def CosineSim(vec1, vec2):
    """Return the cosine similarity of two vectors as a Python float.

    Args:
        vec1, vec2: vectors accepted by ``np.dot`` (same length).

    Returns:
        float: dot(vec1, vec2) / (|vec1| * |vec2|), or 0.0 when either vector
        has zero norm. The zero-norm case returns the float 0.0 (not the int
        0) so that every value of a score column built from this function has
        the same Python type.
    """
    numerator = np.dot(vec1, vec2)
    denominator = np.sqrt(np.dot(vec1, vec1)) * np.sqrt(np.dot(vec2, vec2))
    if denominator == 0:
        return 0.0
    return float(numerator / denominator)

In [13]:
def get_cb_recommendations(m_title, sim_mov_limit=50):
    """Return the movies most similar to `m_title` by cosine similarity.

    The merged movie table is re-read from Delta on every call and stored in
    the global `df_merge`. The feature vectors come from `get_vecs()`.

    Args:
        m_title: exact title to look up in the "title" column.
        sim_mov_limit: maximum number of similar movies to return.

    Returns:
        A Spark DataFrame with columns "movieId", "score" and
        "input_movies_id" (the id of the looked-up movie), holding the
        `sim_mov_limit` highest-scoring movies other than the input one.
        A message string is returned instead when the title is not in the
        table or when no feature vector is stored for it.
    """
    global df_merge
    df_merge = spark.read.format("delta").load("s3a://lakehouse/merge_data-movies/merged_data")

    # One lookup answers both "does the title exist?" and "what is its id?".
    matched_row = df_merge.filter(F.col("title") == m_title).select(F.col("id")).first()
    if matched_row is None:
        return "Sorry! The movie you searched is not in our database. Please check the spelling or try another movie."
    m_id = matched_row[0]

    # Feature vectors of all movies, as (id, vector) pairs.
    all_movies_vecs = get_vecs()

    # Vector of the requested movie; it may be absent from the vector table.
    input_vec = next((vec for movie_id, vec in all_movies_vecs if movie_id == m_id), None)
    if input_vec is None:
        return "Sorry! No feature vector is available for the movie you searched, so similar movies cannot be computed."

    # Similarities are computed on the driver, then distributed as an RDD.
    similar_movies_rdd = spark.sparkContext.parallelize(
        [(movie_id, CosineSim(input_vec, vec)) for movie_id, vec in all_movies_vecs]
    )

    # Drop the input movie itself, keep the top scores, tag each row with the
    # input id, and expose the id as "movieId" to match the CF DataFrame.
    similar_movies_df = (
        spark.createDataFrame(similar_movies_rdd, ["movies_id", "score"])
        .filter(F.col("movies_id") != m_id)
        .orderBy(F.col("score").desc())
        .limit(sim_mov_limit)
        .withColumn("input_movies_id", F.lit(m_id))
        .withColumnRenamed("movies_id", "movieId")
    )

    return similar_movies_df

In [16]:



# def hybrid_recommendation(user_id, m_title, sim_mov_limit, sim_weight, cf_weight):
#     # Gợi ý Content-Based
#     cb_recommendations = get_cb_recommendations(m_title, sim_mov_limit)

#     # Lấy thông tin tên phim từ bảng movies
#     movies_df = spark.read.format("delta").load("s3a://lakehouse/gold/dim_movie").select(
#         F.col("id").alias("movieId"), "title"
#     )

#     # Kiểm tra nếu cb_recommendations trả về thông báo lỗi
#     if isinstance(cb_recommendations, str):
#         return cb_recommendations

#     # Kiểm tra user có tồn tại trong tập train hay không
#     user_in_train = train.filter(train['userId'] == user_id).count() > 5
#     if not user_in_train:
#         print(f"User {user_id} không không đủ điều kiện => sử dụng Content-Based.")
#         return cb_recommendations.join(movies_df, on="movieId", how="inner") \
#                                  .select("movieId", "title", F.col("score").alias("hybrid_score"))

#     # Lấy danh sách các phim chưa xem (dựa trên tập train)
#     all_movies = df.select('movieId').distinct()
#     user_movies = train.filter(train['userId'] == user_id).select('movieId').distinct()
#     movies_to_recommend = all_movies.subtract(user_movies)

#     # Dự đoán với CF
#     recommendations_cf = final_model.transform(movies_to_recommend.withColumn('userId', F.lit(user_id)))
#     recommendations_cf = recommendations_cf.filter(F.col('prediction').isNotNull() & (F.col('prediction') > 0))

#     # Gán tên cột để phân biệt điểm số
#     cb_recommendations = cb_recommendations.withColumn("score_cb", F.col("score"))
#     recommendations_cf = recommendations_cf.withColumn("score_cf", F.col("prediction"))

#     # Kết hợp hai nguồn
#     hybrid_recommendations = cb_recommendations.join(recommendations_cf, on="movieId", how="inner")

#     # Join với bảng movie để lấy tiêu đề
#     hybrid_recommendations = hybrid_recommendations.join(movies_df, on="movieId", how="inner")

#     # Tính điểm kết hợp
#     hybrid_recommendations = hybrid_recommendations.withColumn(
#         "hybrid_score",
#         sim_weight * F.col("score_cb") + cf_weight * F.col("score_cf")
#     )

#     # Trả kết quả
#     return hybrid_recommendations.select("movieId", "title", "hybrid_score") \
#                                  .orderBy(F.col("hybrid_score").desc())


def hybrid_recommendation(user_id, m_title, sim_weight, cf_weight):
    """Combine content-based and collaborative-filtering scores for one user.

    Args:
        user_id: id of the user to recommend for.
        m_title: title of the seed movie for the content-based part.
        sim_weight: weight applied to the content-based score ("score_cb").
        cf_weight: weight applied to the ALS prediction ("score_cf").

    Returns:
        * The message string from `get_cb_recommendations` when it reports
          an error.
        * For a user with 5 or fewer rows in `train`: the content-based
          candidates as (movieId, title, hybrid_score), where hybrid_score is
          the raw similarity score, sorted by score descending.
        * Otherwise: the 10 best content-based candidates ranked by
          hybrid_score, with columns (movieId, title, score_cb, score_cf,
          hybrid_score). hybrid_score is the weighted sum when an ALS
          prediction exists for the movie, else the content-based score alone.

    NOTE(review): `train` and `df` are notebook globals; `train` is not
    defined in any cell visible here, so it must exist in the kernel before
    this function is called.
    """
    cb_recommendations = get_cb_recommendations(m_title)

    # Error message from the content-based step: return it before doing any
    # further reads.
    if isinstance(cb_recommendations, str):
        return cb_recommendations

    movies_df = spark.read.format("delta").load("s3a://lakehouse/merge_data-movies/merged_data") \
        .select(F.col("id").cast("int").alias("movieId"), "title")

    # The user's rows in the training split; reused for both the eligibility
    # check and the "already rated" set.
    user_ratings = train.filter(train['userId'] == user_id)

    has_enough_ratings = user_ratings.count() > 5
    if not has_enough_ratings:
        print(f"User {user_id} không đủ điều kiện => sử dụng Content-Based.")
        return cb_recommendations.join(movies_df, on="movieId", how="left") \
                                 .select("movieId", "title", F.col("score").alias("hybrid_score")) \
                                 .orderBy(F.col("hybrid_score").desc())

    # Movies present in the ratings data that this user has not rated in train.
    all_movies = df.select('movieId').distinct()
    user_movies = user_ratings.select('movieId').distinct()
    movies_to_recommend = all_movies.subtract(user_movies)

    # Score the unseen movies with the persisted ALS model; keep only
    # non-null, strictly positive predictions.
    als_model = ALSModel.load("s3a://lakehouse/CF/als_best_model")
    recommendations_cf = als_model.transform(movies_to_recommend.withColumn('userId', F.lit(user_id)))
    recommendations_cf = recommendations_cf.filter(F.col('prediction').isNotNull() & (F.col('prediction') > 0)) \
                                           .withColumnRenamed("prediction", "score_cf")

    cb_recommendations = cb_recommendations.withColumnRenamed("score", "score_cb")
    cb_recommendations = cb_recommendations.withColumn("movieId", F.col("movieId").cast("int"))

    # Only content-based candidates are kept, so a left join is sufficient
    # (it yields the same rows as an outer join filtered on score_cb).
    hybrid_recommendations = cb_recommendations.join(recommendations_cf, on="movieId", how="left")
    hybrid_recommendations = hybrid_recommendations.filter(F.col("score_cb").isNotNull())

    # Weighted sum when both scores exist; otherwise fall back to the
    # content-based score on its own.
    hybrid_recommendations = hybrid_recommendations.withColumn(
        "hybrid_score",
        F.when(F.col("score_cf").isNotNull(),
               sim_weight * F.col("score_cb") + cf_weight * F.col("score_cf"))
         .otherwise(F.col("score_cb"))
    )

    hybrid_recommendations = hybrid_recommendations.join(movies_df, on="movieId", how="inner")
    return hybrid_recommendations.select("movieId", "title", "score_cb", "score_cf", "hybrid_score") \
                                 .orderBy(F.col("hybrid_score").desc()).limit(10)



In [17]:
# Step 5: show movie recommendations for a user
user_id = 25
movie_title = "The Box"  # Example of a movie the user likes

hybrid_recs = hybrid_recommendation(user_id, movie_title, sim_weight=0.4, cf_weight=0.6)

# hybrid_recommendation returns a message string (not a DataFrame) when the
# content-based step reports an error, so only call .show() on a DataFrame.
if isinstance(hybrid_recs, str):
    print(hybrid_recs)
else:
    hybrid_recs.show()

+-------+------------------+------------------+---------+------------------+
|movieId|             title|          score_cb| score_cf|      hybrid_score|
+-------+------------------+------------------+---------+------------------+
|    185|A Clockwork Orange|0.9406853119084989|3.1617157|2.2733035723189414|
|    170|     28 Days Later|0.9786354691088911|3.0811553|2.2401473677277846|
|   5876|          The Mist|0.9622204895486176|2.9214437|2.1377544162936903|
|    281|      Strange Days|0.9595120629139977| 2.793185|2.0597158225563463|
|  48385|Indestructible Man|0.9379811621588382|2.6539543| 1.967565025364634|
|    754|          Face/Off|0.9367997084469001| 2.583702|1.9249411358201662|
|   1562|    28 Weeks Later|0.9684669484449843| 2.519845|1.8992937846880522|
|   3509|  A Scanner Darkly|0.9540791103841367|2.4025085|1.8231367424965501|
|    167|             K-PAX|0.9466598252287608|2.3705876|1.8010164825054447|
|   3162|      Blackenstein|0.9419374601305377|2.2431684|1.7226759964728693|

In [18]:
import json

# Load raw JSON
# The metadata part file of the saved ALS model is read as text lines and
# collected to the driver; the first line is parsed as a JSON document.
metadata_path = "s3a://lakehouse/CF/als_best_model/metadata/part-00000"
metadata_raw = spark.sparkContext.textFile(metadata_path).collect()
metadata_dict = json.loads(metadata_raw[0])

# Print the metadata, pretty-printed with a 4-space indent
print(json.dumps(metadata_dict, indent=4))


{
    "class": "org.apache.spark.ml.recommendation.ALSModel",
    "timestamp": 1744621090009,
    "sparkVersion": "3.3.2",
    "uid": "ALS_bfa7aac9dc3e",
    "paramMap": {
        "userCol": "userId",
        "coldStartStrategy": "drop",
        "itemCol": "movieId",
        "blockSize": 4096
    },
    "defaultParamMap": {
        "userCol": "user",
        "coldStartStrategy": "nan",
        "itemCol": "item",
        "blockSize": 4096,
        "predictionCol": "prediction"
    },
    "rank": 12
}
