In [1]:
from pyspark.sql.functions import col, abs, sum as spark_sum, pow, sqrt, udf, expr
from pyspark.sql.types import FloatType
# import numpy as np
from pyspark.sql import functions as F
import builtins
from pyspark.sql.window import Window
from pyspark.sql.functions import rand, struct, collect_list, col, sqrt
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StructType, StructField, IntegerType, FloatType

In [2]:

spark = SparkSession.builder \
    .appName("HybridRecommenderMaxPerformance") \
    .master("spark://spark-master:7077") \
    .config("spark.executor.memory", "6g") \
    .config("spark.driver.memory", "6g") \
    .config("spark.executor.cores", "4") \
    .config("spark.executor.cores", "4") \
    .config("spark.executor.instances", "1") \
    .config("spark.default.parallelism", "8") \
    .config("spark.sql.shuffle.partitions", "8") \
    .config("spark.memory.offHeap.enabled", "true") \
    .config("spark.memory.offHeap.size", "4g") \
    .config("spark.jars", "jars/hadoop-aws-3.3.4.jar,jars/spark-sql-kafka-0-10_2.12-3.2.1.jar,jars/aws-java-sdk-bundle-1.12.262.jar,jars/delta-core_2.12-2.2.0.jar,jars/delta-storage-2.2.0.jar") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000") \
    .config("spark.hadoop.fs.s3a.access.key", "conbo123") \
    .config("spark.hadoop.fs.s3a.secret.key", "123conbo") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore") \
    .config("delta.enable-non-concurrent-writes", "true") \
    .config("spark.sql.warehouse.dir", "s3a://lakehouse/") \
    .getOrCreate()



25/05/30 02:05:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
df_merge = spark.read.format("delta").load("s3a://lakehouse/bronze/Bronze_Crews_API").cache()
# df_merge.count()

25/05/30 02:06:27 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

In [5]:
df_merge.show(5,False)

[Stage 8:>                                                          (0 + 1) / 1]

+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [9]:
df_merge = spark.read.format("delta").load("s3a://lakehouse/bronze/Bronze_Movies_API").cache()
df_merge.count()

                                                                                

40

In [10]:
df_merge.show(5,False)

+-------+---------------------------+---------------------------+-----------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+------------------------------------+--------+---------------------------------------------------------+--------+----------+--------+------------+----------+------------+------------------------------------------------+-----------------------+
|id     |original_title             |title                      |original_language|overview                                                                                                            

In [None]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import FloatType, IntegerType, ArrayType
from pyspark.sql.window import Window
from pyspark.sql.functions import pandas_udf, when, array, col
import pandas as pd
import numpy as np
import time



# ------------------------ Load Metadata ------------------------
df_merge = spark.read.format("delta").load("s3a://lakehouse/gold/MergeData").cache()
df_meta = df_merge.select("id", "genres", "keyword_names", "cast_names", "director_names").cache()

# ------------------------ Cosine Similarity UDF ------------------------
@pandas_udf(FloatType())
def cosine_similarity_pd(v1: pd.Series, v2: pd.Series) -> pd.Series:
    return pd.Series([
        float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))) if np.linalg.norm(a) and np.linalg.norm(b) else 0.0
        for a, b in zip(v1, v2)
    ])

# ------------------------ Recommend Function ------------------------
def recommend_movies_bert(path, fixed_ids):
    df_vec = spark.read.format("delta").load(path).select("id", "vecs") \
        .withColumn("id", F.col("id").cast(IntegerType())) \
        .filter(F.col("vecs").isNotNull()).cache()

    input_df = df_vec.filter(F.col("id").isin(fixed_ids)) \
        .withColumnRenamed("id", "input_movie_id") \
        .withColumnRenamed("vecs", "input_vec")

    candidates = df_vec.withColumnRenamed("id", "candidate_movie_id") \
                       .withColumnRenamed("vecs", "candidate_vec")

    joined = input_df.crossJoin(candidates) \
        .filter("input_movie_id != candidate_movie_id") \
        .withColumn("score", cosine_similarity_pd("input_vec", "candidate_vec"))

    window = Window.partitionBy("input_movie_id").orderBy(F.desc("score"))
    top_k = joined.withColumn("rank", F.row_number().over(window)).filter("rank <= 10") \
                  .withColumn("model_type", F.lit("bert"))

    return top_k

# ------------------------ Enrich Metadata + Score ------------------------
def enrich_and_score(df_topk, df_meta):
    df_input_meta = df_meta.withColumnRenamed("id", "input_movie_id") \
        .withColumnRenamed("genres", "genres_input") \
        .withColumnRenamed("keyword_names", "keywords_input") \
        .withColumnRenamed("cast_names", "cast_names_input") \
        .withColumnRenamed("director_names", "director_name_input")

    df_cand_meta = df_meta.withColumnRenamed("id", "candidate_movie_id") \
        .withColumnRenamed("genres", "genres_candidate") \
        .withColumnRenamed("keyword_names", "keywords_candidate") \
        .withColumnRenamed("cast_names", "cast_names_candidate") \
        .withColumnRenamed("director_names", "director_name_candidate")

    enriched = df_topk.join(df_input_meta, "input_movie_id", "left") \
                      .join(df_cand_meta, "candidate_movie_id", "left")

    for col_name in [
        "genres_input", "genres_candidate",
        "keywords_input", "keywords_candidate",
        "cast_names_input", "cast_names_candidate"
    ]:
        enriched = enriched.withColumn(col_name, when(col(col_name).isNull(), array()).otherwise(col(col_name)))

    return enriched \
        .withColumn("has_common_genre", F.expr("arrays_overlap(genres_input, genres_candidate)")) \
        .withColumn("has_common_keyword", F.expr("arrays_overlap(keywords_input, keywords_candidate)")) \
        .withColumn("has_common_cast", F.expr("arrays_overlap(cast_names_input, cast_names_candidate)")) \
        .withColumn("has_common_director", F.col("director_name_input") == F.col("director_name_candidate")) \
        .withColumn("content_match_score",
            F.col("has_common_genre").cast("int") +
            F.col("has_common_keyword").cast("int") +
            F.col("has_common_cast").cast("int") +
            F.col("has_common_director").cast("int")
        )

# ------------------------ Run for Single Movie ------------------------
start = time.time()
print("\n🚀 Running BERT Recommendation for 1 movie...")

test_movie_id = 862  # 🔍 Replace with movie ID you want to test (e.g. 603 for The Matrix)
bert_path = "s3a://lakehouse/data/bert"

top_k_single = recommend_movies_bert(bert_path, [test_movie_id])
final_df_single = enrich_and_score(top_k_single, df_meta)

# Thông tin phim gốc
print(f"\n🎬 Original Movie Info (ID: {test_movie_id}):")
df_meta.filter(F.col("id") == test_movie_id).show(truncate=False)

# Hiển thị 10 phim gợi ý kèm điểm
print(f"\n🔍 Top 10 Similar Movies to ID {test_movie_id}:\n")
final_df_single.select(
    "candidate_movie_id",
    "score",
    "content_match_score",
    "genres_candidate",
    "keywords_candidate",
    "cast_names_candidate",
    "director_name_candidate",
    "has_common_genre",
    "has_common_keyword",
    "has_common_cast",
    "has_common_director"
).orderBy(F.desc("score")).show(10, truncate=False)

print(f"\n⏱️ Elapsed Time: {time.time() - start:.2f}s")


In [4]:
pip install streamlit

Defaulting to user installation because normal site-packages is not writeable
Collecting streamlit
  Downloading streamlit-1.45.1-py3-none-any.whl (9.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m487.2 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting gitpython!=3.1.19,<4,>=3.0.7
  Downloading GitPython-3.1.44-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.6/207.6 kB[0m [31m788.7 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tenacity<10,>=8.1.0
  Downloading tenacity-9.1.2-py3-none-any.whl (28 kB)
Collecting pillow<12,>=7.1.0
  Downloading pillow-11.2.1-cp39-cp39-manylinux_2_28_x86_64.whl (4.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m734.8 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting blinker<2,>=1.5.0
  Downloading blinker-1.9.0-py3-none-any.whl (8.5 kB)
Collecting altair<6,>=4.0
  Downloading altair-5.5

In [3]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import FloatType, IntegerType, ArrayType
from pyspark.sql.window import Window
from pyspark.sql.functions import pandas_udf
import pandas as pd
import numpy as np
import time

# ------------------------ Spark Session (bạn cần chạy từ Jupyter Local hoặc Docker) ------------------------
# spark = SparkSession.builder...  # Nếu bạn chưa tạo, đảm bảo session có cấu hình MinIO & Delta như trước

# ------------------------ 1. Load Metadata ------------------------
df_merge = spark.read.format("delta").load("s3a://lakehouse/gold/MergeData").cache()
fixed_movie_ids = df_merge.select("id").limit(100).toPandas()["id"].tolist()

df_meta = df_merge.select("id", "genres", "keyword_names", "cast_names", "director_names").cache()





25/05/25 07:57:44 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
25/05/25 07:57:51 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

In [4]:
from pyspark.sql.functions import pandas_udf, when, array, col

# ------------------------ 2. Cosine Similarity: Pandas UDF ------------------------
@pandas_udf(FloatType())
def cosine_similarity_pd(v1: pd.Series, v2: pd.Series) -> pd.Series:
    return pd.Series([
        float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))) if np.linalg.norm(a) and np.linalg.norm(b) else 0.0
        for a, b in zip(v1, v2)
    ])

# ------------------------ 3. Recommend Function ------------------------
def recommend_movies_bert(path, fixed_ids):
    df_vec = spark.read.format("delta").load(path).select("id", "vecs") \
        .withColumn("id", F.col("id").cast(IntegerType())) \
        .filter(F.col("vecs").isNotNull()).cache()  # ✅ thêm dòng này

    input_df = df_vec.filter(F.col("id").isin(fixed_ids)) \
        .withColumnRenamed("id", "input_movie_id") \
        .withColumnRenamed("vecs", "input_vec")

    candidates = df_vec.withColumnRenamed("id", "candidate_movie_id") \
                       .withColumnRenamed("vecs", "candidate_vec")

    joined = input_df.crossJoin(candidates) \
        .filter("input_movie_id != candidate_movie_id") \
        .withColumn("score", cosine_similarity_pd("input_vec", "candidate_vec"))

    window = Window.partitionBy("input_movie_id").orderBy(F.desc("score"))
    top_k = joined.withColumn("rank", F.row_number().over(window)).filter("rank <= 3") \
                  .withColumn("model_type", F.lit("bert"))

    return top_k

# ------------------------ 4. Enrich with Metadata & Compute Match Score ------------------------
def enrich_and_score(df_topk, df_meta):
    df_input_meta = df_meta.withColumnRenamed("id", "input_movie_id") \
        .withColumnRenamed("genres", "genres_input") \
        .withColumnRenamed("keyword_names", "keywords_input") \
        .withColumnRenamed("cast_names", "cast_names_input") \
        .withColumnRenamed("director_names", "director_name_input")

    df_cand_meta = df_meta.withColumnRenamed("id", "candidate_movie_id") \
        .withColumnRenamed("genres", "genres_candidate") \
        .withColumnRenamed("keyword_names", "keywords_candidate") \
        .withColumnRenamed("cast_names", "cast_names_candidate") \
        .withColumnRenamed("director_names", "director_name_candidate")

    enriched = df_topk.join(df_input_meta, "input_movie_id", "left") \
                      .join(df_cand_meta, "candidate_movie_id", "left")

    # Fill nulls in array columns
    for col_name in [
        "genres_input", "genres_candidate",
        "keywords_input", "keywords_candidate",
        "cast_names_input", "cast_names_candidate"
    ]:
        enriched = enriched.withColumn(col_name, when(col(col_name).isNull(), array()).otherwise(col(col_name)))

    return enriched \
        .withColumn("has_common_genre", F.expr("arrays_overlap(genres_input, genres_candidate)")) \
        .withColumn("has_common_keyword", F.expr("arrays_overlap(keywords_input, keywords_candidate)")) \
        .withColumn("has_common_cast", F.expr("arrays_overlap(cast_names_input, cast_names_candidate)")) \
        .withColumn("has_common_director", F.col("director_name_input") == F.col("director_name_candidate")) \
        .withColumn("content_match_score",
            F.col("has_common_genre").cast("int") +
            F.col("has_common_keyword").cast("int") +
            F.col("has_common_cast").cast("int") +
            F.col("has_common_director").cast("int")
        )

In [5]:

# ------------------------ 5. Run & Summary ------------------------
start = time.time()
print("\n🚀 Running BERT Recommendation...")

bert_path = "s3a://lakehouse/data/bert"
top_k = recommend_movies_bert(bert_path, fixed_movie_ids)
final_df = enrich_and_score(top_k, df_meta)

summary = final_df.groupBy("input_movie_id").agg(
    F.coalesce(F.avg(F.col("has_common_genre").cast("int")), F.lit(0.0)).alias("avg_common_genre"),
    F.coalesce(F.avg(F.col("has_common_keyword").cast("int")), F.lit(0.0)).alias("avg_common_keyword"),
    F.coalesce(F.avg(F.col("has_common_cast").cast("int")), F.lit(0.0)).alias("avg_common_cast"),
    F.coalesce(F.avg(F.col("has_common_director").cast("int")), F.lit(0.0)).alias("avg_common_director"),
    F.coalesce(F.avg("content_match_score"), F.lit(0.0)).alias("avg_content_match_score")
)

# print("\n📊 BERT Summary:")
# summary.orderBy("input_movie_id").show(truncate=False)

overall = summary.select(
    F.lit("Bert").alias("model_type"),
    F.avg("avg_common_genre").alias("overall_genre"),
    F.avg("avg_common_keyword").alias("overall_keyword"),
    F.avg("avg_common_cast").alias("overall_cast"),
    F.avg("avg_common_director").alias("overall_director"),
    F.avg("avg_content_match_score").alias("overall_match_score")
)

print("\n✅ BERT Overall Performance:")
overall.show(truncate=False)
print(f"\n⏱️ Elapsed Time: {time.time() - start:.2f}s")


🚀 Running BERT Recommendation...


                                                                                


✅ BERT Overall Performance:


[Stage 17:=>(40 + 8) / 64][Stage 18:>   (0 + 0) / 8][Stage 19:>   (0 + 0) / 8]

25/05/25 04:30:55 WARN BlockManagerMaster: Failed to remove broadcast 13 with removeFromMaster = true - Cannot receive any reply from /172.18.0.5:40946 in 120 seconds. This timeout is controlled by spark.rpc.askTimeout
org.apache.spark.rpc.RpcTimeoutException: Cannot receive any reply from /172.18.0.5:40946 in 120 seconds. This timeout is controlled by spark.rpc.askTimeout
	at org.apache.spark.rpc.RpcTimeout.org$apache$spark$rpc$RpcTimeout$$createRpcTimeoutException(RpcTimeout.scala:47)
	at org.apache.spark.rpc.RpcTimeout$$anonfun$addMessageIfTimeout$1.applyOrElse(RpcTimeout.scala:62)
	at org.apache.spark.rpc.RpcTimeout$$anonfun$addMessageIfTimeout$1.applyOrElse(RpcTimeout.scala:58)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at scala.util.Failure.recover(Try.scala:234)
	at scala.concurrent.Future.$anonfun$recover$1(Future.scala:395)
	at scala.concurrent.impl.Promise.liftedTree1$1(Promise.scala:33)
	at scala.concurrent.impl.Promise.$anonfun$transf

[Stage 23:>                                                         (0 + 2) / 2]

+----------+------------------+------------------+------------+-------------------+-------------------+
|model_type|overall_genre     |overall_keyword   |overall_cast|overall_director   |overall_match_score|
+----------+------------------+------------------+------------+-------------------+-------------------+
|Bert      |0.9566666666666669|0.4833333333333335|0.14        |0.10333333333333332|1.685000000000001  |
+----------+------------------+------------------+------------+-------------------+-------------------+


⏱️ Elapsed Time: 654.18s


                                                                                

In [6]:

# ------------------------ 5. Run & Summary ------------------------
start = time.time()
print("\n🚀 Running Word2vec Recommendation...")

bert_path = "s3a://lakehouse/data/word2vec"
top_k = recommend_movies_bert(bert_path, fixed_movie_ids)
final_df = enrich_and_score(top_k, df_meta)

summary = final_df.groupBy("input_movie_id").agg(
    F.coalesce(F.avg(F.col("has_common_genre").cast("int")), F.lit(0.0)).alias("avg_common_genre"),
    F.coalesce(F.avg(F.col("has_common_keyword").cast("int")), F.lit(0.0)).alias("avg_common_keyword"),
    F.coalesce(F.avg(F.col("has_common_cast").cast("int")), F.lit(0.0)).alias("avg_common_cast"),
    F.coalesce(F.avg(F.col("has_common_director").cast("int")), F.lit(0.0)).alias("avg_common_director"),
    F.coalesce(F.avg("content_match_score"), F.lit(0.0)).alias("avg_content_match_score")
)

# print("\n📊 WORD2vec Summary:")
# summary.orderBy("input_movie_id").show(truncate=False)

overall = summary.select(
    F.lit("Word2vec").alias("model_type"),
    F.avg("avg_common_genre").alias("overall_genre"),
    F.avg("avg_common_keyword").alias("overall_keyword"),
    F.avg("avg_common_cast").alias("overall_cast"),
    F.avg("avg_common_director").alias("overall_director"),
    F.avg("avg_content_match_score").alias("overall_match_score")
)

print("\n✅ Word2vec Overall Performance:")
overall.show(truncate=False)
print(f"\n⏱️ Elapsed Time: {time.time() - start:.2f}s")


🚀 Running Word2vec Recommendation...


                                                                                


✅ Word2vec Overall Performance:


[Stage 53:>                                                         (0 + 2) / 2]

+----------+------------------+------------------+-------------------+-------------------+-------------------+
|model_type|overall_genre     |overall_keyword   |overall_cast       |overall_director   |overall_match_score|
+----------+------------------+------------------+-------------------+-------------------+-------------------+
|Word2vec  |0.9633333333333333|0.4000000000000001|0.06666666666666665|0.04666666666666667|1.4783333333333337 |
+----------+------------------+------------------+-------------------+-------------------+-------------------+


⏱️ Elapsed Time: 160.01s


                                                                                

In [5]:

# ------------------------ 5. Run & Summary ------------------------
start = time.time()
print("\n🚀 Running Tfidf Recommendation...")

bert_path = "s3a://lakehouse/data/all_movies_delta_IDF"
top_k = recommend_movies_bert(bert_path, fixed_movie_ids)
final_df = enrich_and_score(top_k, df_meta)

summary = final_df.groupBy("input_movie_id").agg(
    F.coalesce(F.avg(F.col("has_common_genre").cast("int")), F.lit(0.0)).alias("avg_common_genre"),
    F.coalesce(F.avg(F.col("has_common_keyword").cast("int")), F.lit(0.0)).alias("avg_common_keyword"),
    F.coalesce(F.avg(F.col("has_common_cast").cast("int")), F.lit(0.0)).alias("avg_common_cast"),
    F.coalesce(F.avg(F.col("has_common_director").cast("int")), F.lit(0.0)).alias("avg_common_director"),
    F.coalesce(F.avg("content_match_score"), F.lit(0.0)).alias("avg_content_match_score")
)

# print("\n📊 Tfidf Summary:")
# summary.orderBy("input_movie_id").show(truncate=False)

overall = summary.select(
    F.lit("TFIDF").alias("model_type"),
    F.avg("avg_common_genre").alias("overall_genre"),
    F.avg("avg_common_keyword").alias("overall_keyword"),
    F.avg("avg_common_cast").alias("overall_cast"),
    F.avg("avg_common_director").alias("overall_director"),
    F.avg("avg_content_match_score").alias("overall_match_score")
)

print("\n✅ TF-IDF Overall Performance:")
overall.show(truncate=False)
print(f"\n⏱️ Elapsed Time: {time.time() - start:.2f}s")


🚀 Running Tfidf Recommendation...


                                                                                


✅ TF-IDF Overall Performance:




+----------+------------------+---------------+-------------------+-------------------+-------------------+
|model_type|overall_genre     |overall_keyword|overall_cast       |overall_director   |overall_match_score|
+----------+------------------+---------------+-------------------+-------------------+-------------------+
|TFIDF     |0.6233333333333333|0.32           |0.17000000000000004|0.10333333333333333|1.1766666666666667 |
+----------+------------------+---------------+-------------------+-------------------+-------------------+


⏱️ Elapsed Time: 429.87s


                                                                                

In [5]:

# ------------------------ 5. Run & Summary ------------------------
start = time.time()
print("\n🚀 Running Tfidf Recommendation...")

bert_path = "s3a://lakehouse/data/tfidf"
top_k = recommend_movies_bert(bert_path, fixed_movie_ids)
final_df = enrich_and_score(top_k, df_meta)

summary = final_df.groupBy("input_movie_id").agg(
    F.coalesce(F.avg(F.col("has_common_genre").cast("int")), F.lit(0.0)).alias("avg_common_genre"),
    F.coalesce(F.avg(F.col("has_common_keyword").cast("int")), F.lit(0.0)).alias("avg_common_keyword"),
    F.coalesce(F.avg(F.col("has_common_cast").cast("int")), F.lit(0.0)).alias("avg_common_cast"),
    F.coalesce(F.avg(F.col("has_common_director").cast("int")), F.lit(0.0)).alias("avg_common_director"),
    F.coalesce(F.avg("content_match_score"), F.lit(0.0)).alias("avg_content_match_score")
)

# print("\n📊 Tfidf Summary:")
# summary.orderBy("input_movie_id").show(truncate=False)

overall = summary.select(
    F.lit("TFIDF").alias("model_type"),
    F.avg("avg_common_genre").alias("overall_genre"),
    F.avg("avg_common_keyword").alias("overall_keyword"),
    F.avg("avg_common_cast").alias("overall_cast"),
    F.avg("avg_common_director").alias("overall_director"),
    F.avg("avg_content_match_score").alias("overall_match_score")
)

print("\n✅ TF-IDF Overall Performance:")
overall.show(truncate=False)
print(f"\n⏱️ Elapsed Time: {time.time() - start:.2f}s")


🚀 Running Tfidf Recommendation...


                                                                                


✅ TF-IDF Overall Performance:


[Stage 17:=>(40 + 8) / 64][Stage 18:>   (0 + 0) / 8][Stage 19:>   (0 + 0) / 8]

25/05/25 07:59:49 ERROR TaskSchedulerImpl: Lost executor 0 on 172.18.0.3: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
25/05/25 07:59:49 WARN TaskSetManager: Lost task 42.0 in stage 17.0 (TID 247) (172.18.0.3 executor 0): ExecutorLostFailure (executor 0 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
25/05/25 07:59:49 WARN TaskSetManager: Lost task 44.0 in stage 17.0 (TID 249) (172.18.0.3 executor 0): ExecutorLostFailure (executor 0 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
25/05/25 07:59:49 WARN TaskSetManager: Lost task 47.0 in stage 17.0 (TID 252) (172.18.0.3 executor 0): ExecutorLostFailure (executor 0 exited caus

[Stage 17:==(66 + 8) / 64][Stage 18:>   (0 + 0) / 8][Stage 19:>   (0 + 0) / 8]

25/05/25 08:02:49 ERROR TaskSchedulerImpl: Lost executor 1 on 172.18.0.3: worker lost
25/05/25 08:02:49 WARN TaskSetManager: Lost task 49.0 in stage 17.0 (TID 280) (172.18.0.3 executor 1): ExecutorLostFailure (executor 1 exited caused by one of the running tasks) Reason: worker lost
25/05/25 08:02:49 WARN TaskSetManager: Lost task 48.0 in stage 17.0 (TID 279) (172.18.0.3 executor 1): ExecutorLostFailure (executor 1 exited caused by one of the running tasks) Reason: worker lost
25/05/25 08:02:49 WARN TaskSetManager: Lost task 51.0 in stage 17.0 (TID 282) (172.18.0.3 executor 1): ExecutorLostFailure (executor 1 exited caused by one of the running tasks) Reason: worker lost


[Stage 17:==(66 + 6) / 64][Stage 18:>   (0 + 0) / 8][Stage 19:>   (0 + 0) / 8]

25/05/25 08:05:14 WARN TaskSetManager: Lost task 50.0 in stage 17.0 (TID 281) (172.18.0.3 executor 1): ExecutorLostFailure (executor 1 exited caused by one of the running tasks) Reason: worker lost
25/05/25 08:05:14 ERROR TaskSchedulerImpl: Lost executor 2 on 172.18.0.3: worker lost
25/05/25 08:05:14 WARN TaskSetManager: Lost task 47.1 in stage 17.0 (TID 254) (172.18.0.3 executor 2): ExecutorLostFailure (executor 2 exited caused by one of the running tasks) Reason: worker lost
25/05/25 08:05:14 WARN TaskSetManager: Lost task 41.1 in stage 17.0 (TID 253) (172.18.0.3 executor 2): ExecutorLostFailure (executor 2 exited caused by one of the running tasks) Reason: worker lost
25/05/25 08:05:14 WARN TaskSetManager: Lost task 42.1 in stage 17.0 (TID 256) (172.18.0.3 executor 2): ExecutorLostFailure (executor 2 exited caused by one of the running tasks) Reason: worker lost
25/05/25 08:05:14 WARN TaskSetManager: Lost task 44.1 in stage 17.0 (TID 255) (172.18.0.3 executor 2): ExecutorLostFailure

[Stage 23:>                                                         (0 + 2) / 2]

+----------+------------------+------------------+-------------------+--------------------+-------------------+
|model_type|overall_genre     |overall_keyword   |overall_cast       |overall_director    |overall_match_score|
+----------+------------------+------------------+-------------------+--------------------+-------------------+
|TFIDF     |0.7966666666666669|0.4200000000000001|0.04666666666666667|0.026666666666666665|1.3233333333333328 |
+----------+------------------+------------------+-------------------+--------------------+-------------------+


⏱️ Elapsed Time: 623.49s


                                                                                

In [7]:
from pyspark.sql.functions import (
    regexp_replace, broadcast, col, collect_set, concat_ws,
    sort_array
)
fact_movies = spark.read.format("delta").load("s3a://lakehouse/gold/fact_movies")
dim_movie = spark.read.format("delta").load("s3a://lakehouse/gold/dim_movie")
dim_cast = broadcast(spark.read.format("delta").load("s3a://lakehouse/gold/dim_cast"))
movie_cast = spark.read.format("delta").load("s3a://lakehouse/gold/movie_cast")
movie_genre = spark.read.format("delta").load("s3a://lakehouse/gold/movie_genre")
dim_genre = broadcast(spark.read.format("delta").load("s3a://lakehouse/gold/dim_genre"))
movie_crew = spark.read.format("delta").load("s3a://lakehouse/gold/movie_crew")
dim_crew = broadcast(spark.read.format("delta").load("s3a://lakehouse/gold/dim_crew"))
dim_keyword = broadcast(spark.read.format("delta").load("s3a://lakehouse/gold/dim_keyword"))
movie_keyword = spark.read.format("delta").load("s3a://lakehouse/gold/movie_keyword")

                                                                                

In [9]:
cast_agg = (
    movie_cast.alias("mca")
    .filter(F.col("mca.order") < 5)
    .join(dim_cast.alias("dca"), F.col("mca.cast_id") == F.col("dca.id"), "inner")
    .groupBy(F.col("mca.movie_id"))
    .agg(F.sort_array(F.collect_set(F.col("dca.name"))).alias("cast_names"))
    .withColumnRenamed("movie_id", "id")
)

# keyword_agg: gom keyword, giữ nguyên tên
keyword_agg = (
    movie_keyword.alias("mk")
    .join(dim_keyword.alias("dk"), F.col("mk.keyword_id") == F.col("dk.id"), "inner")
    .groupBy(F.col("mk.id"))   # CHỈNH SỬA: group by movie_id, không phải id
    .agg(F.sort_array(F.collect_set(F.col("dk.name"))).alias("keyword_names"))
)

# crew_agg: lấy director, giữ nguyên tên
crew_agg = (
    movie_crew.alias("mcr")
    .filter(F.col("mcr.job") == "Director")
    .join(dim_crew.alias("dcr"), F.col("mcr.crew_id") == F.col("dcr.id"), "inner")
    .groupBy(F.col("mcr.movie_id"))
    .agg(F.sort_array(F.collect_set(F.col("dcr.name"))).alias("director_names"))
    .withColumnRenamed("movie_id", "id")
)

genre_agg = (
    movie_genre.alias("mg")
    .join(dim_genre.alias("dg"), col("mg.genres_id") == col("dg.id"), "inner")
    .groupBy(col("mg.id"))
    .agg(sort_array(collect_set(col("dg.name"))).alias("genres"))
)

# ====== 4. Join với fact_movies và xử lý dữ liệu tổng hợp ======
result_df = (
    fact_movies.alias("fm")
    .join(cast_agg.alias("ca"), col("fm.id") == col("ca.id"), "left")  
    .join(crew_agg.alias("cr"), col("fm.id") == col("cr.id"), "left")  
    .join(genre_agg.alias("ga"), col("fm.id") == col("ga.id"), "left")
    .join(keyword_agg.alias("ka"), col("fm.id") == col("ka.id"), "left")
    .join(dim_movie.alias("dm"), col("fm.id") == col("dm.id"),"left")
    .select(
        col("fm.id"),col("dm.title"), col("fm.budget"), col("fm.popularity"), col("fm.revenue"),
        col("fm.vote_average"), col("fm.vote_count"), col("fm.date_id"),
        col("ca.cast_names"), col("cr.director_names"),
        col("ga.genres"), col("ka.keyword_names"),col("dm.overview")
    )
)

result_df.show(10,False)

                                                                                

+---+----------------------------------+--------+----------+-----------+------------+----------+--------+-------------------------------------------------------------------------------+----------------+-------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [12]:
df_merge.select("cast_names","director_names","id").show(10,False)

+-----------------------------------------------+---------------------------+---+
|cast_names                                     |director_names             |id |
+-----------------------------------------------+---------------------------+---+
|[MattiPellonpää, SusannaHaavisto, TuroPajala]  |[AkiKaurismäki]            |2  |
|[MichaelHynson, RobertAugust]                  |[BruceBrown]               |21 |
|[GeoffreyRush, JohnnyDepp, OrlandoBloom]       |[GoreVerbinski]            |22 |
|[LucyLiu, UmaThurman, VivicaA.Fox]             |[QuentinTarantino]         |24 |
|[MarlonBrando, MartinSheen, RobertDuvall]      |[FrancisFordCoppola]       |28 |
|[DanCastellaneta, JulieKavner, NancyCartwright]|[DavidSilverman]           |35 |
|[GaelGarcíaBernal, GoyaToledo, VanessaBauche]  |[AlejandroGonzálezIñárritu]|55 |
|[HeidiHayes, MariaBello, ViggoMortensen]       |[DavidCronenberg]          |59 |
|[BradPitt, BruceWillis, MadeleineStowe]        |[TerryGilliam]             |63 |
|[ClintEastwood,