In [4]:
from pyspark.sql.functions import col, abs, sum as spark_sum, pow, sqrt, udf, expr
from pyspark.sql.types import FloatType
import numpy as np
from pyspark.sql import functions as F
import builtins
from pyspark.sql import SparkSession

In [5]:
# Build (or reuse) the SparkSession used by every cell below.
# The session is wired for Delta Lake tables stored in an S3-compatible
# MinIO bucket that is reached through the hadoop-aws S3A connector.
import os  # cell-local import: only needed to read the connection overrides

# Connection settings can be overridden from the environment so that real
# credentials never have to be committed with the notebook. The fallbacks are
# the values this notebook has always used, so existing setups keep working.
# NOTE(review): the fallback secrets should be removed once every environment
# exports MINIO_ACCESS_KEY / MINIO_SECRET_KEY.
spark = (
    SparkSession.builder
    .appName("MinIO with Delta Lakeeeeeeeeeeeee")
    .config("spark.jars", "jars/hadoop-aws-3.3.4.jar,jars/spark-sql-kafka-0-10_2.12-3.2.1.jar,jars/aws-java-sdk-bundle-1.12.262.jar,jars/delta-core_2.12-2.2.0.jar,jars/delta-storage-2.2.0.jar")
    .config("spark.hadoop.fs.s3a.endpoint", os.environ.get("MINIO_ENDPOINT", "http://minio:9000"))
    .config("spark.hadoop.fs.s3a.access.key", os.environ.get("MINIO_ACCESS_KEY", "conbo123"))
    .config("spark.hadoop.fs.s3a.secret.key", os.environ.get("MINIO_SECRET_KEY", "123conbo"))
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
    .config("delta.enable-non-concurrent-writes", "true")
    .config("spark.sql.warehouse.dir", "s3a://lakehouse/")
    .getOrCreate()
)

In [13]:
# Load the content-based movie table (columns used here: `id`, `vecs`).
CB = spark.read.format("delta").load("s3a://lakehouse/data/all_movies_delta")

# Materialise every movie vector on the driver as {id: vecs}. This collects
# the whole table into driver memory.
movie_vec_dict = {row.id: row.vecs for row in CB.select("id", "vecs").collect()}

# Ship the lookup table to the executors once. `sc` is never defined in this
# notebook, so take the SparkContext from the session instead of relying on
# a kernel-provided global.
b_movie_vec = spark.sparkContext.broadcast(movie_vec_dict)

In [15]:
# Collapse the ratings into one row per user whose `history` column is the
# list of (movieId, rating) structs for that user.
# NOTE: `ratings` is created by a cell that appears further down this file,
# so that cell has to be executed before this one.
user_hist = ratings.groupBy("userId").agg(
    F.collect_list(
        F.struct(F.col("movieId"), F.col("rating"))
    ).alias("history")
)

root
 |-- id: string (nullable = true)
 |-- vecs: array (nullable = true)
 |    |-- element: double (containsNull = true)



In [7]:
# Load the silver-layer ratings and keep only the three columns used below,
# cast to the numeric types the recommenders work with.
ratings = spark.read.format("delta").load("s3a://lakehouse/silver/ratings")
ratings = ratings.select("userId", "movieId", "rating")
ratings = (
    ratings
    .withColumn("userId", ratings["userId"].cast("int"))
    .withColumn("movieId", ratings["movieId"].cast("int"))
    .withColumn("rating", ratings["rating"].cast("float"))
)

# 80/10/10 split with a fixed seed so the partitions are reproducible.
train, ratings_test, validation = ratings.randomSplit([0.8, 0.1, 0.1], seed=0)

# Cap the test set at 20000 rows. The original line was indented by a stray
# space (IndentationError) and called the non-existent `litmit` method.
ratings_test = ratings_test.limit(20000)

In [8]:
from pyspark.ml.recommendation import ALSModel
# Load the previously saved ALS collaborative-filtering model from the lakehouse.
CF_model = ALSModel.load("s3a://lakehouse/CF/als_best_model")
# Score the held-out ratings; the resulting DataFrame is only displayed here,
# it is not assigned to a variable.
CF_model.transform(ratings_test)

DataFrame[userId: int, movieId: int, rating: float, prediction: float]

In [28]:
# Inspect the movie-vector table schema (string `id`, array<double> `vecs`).
CB.printSchema()

root
 |-- id: string (nullable = true)
 |-- vecs: array (nullable = true)
 |    |-- element: double (containsNull = true)



In [30]:
# Confirm the casts applied to the ratings columns (int, int, float).
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: float (nullable = true)



In [None]:
def _lookup_movie_vec(vec_dict, movie_id):
    """Return the vector stored for ``movie_id``, or None when there is none.

    The movie table stores ``id`` as a string while ratings carry integer
    ``movieId`` values, so the key is tried as given and then in its string
    form. NOTE(review): assumes string ids are the plain decimal form of the
    integer id (e.g. "110") - confirm against the movie table.
    """
    vec = vec_dict.get(movie_id)
    if vec is None:
        vec = vec_dict.get(str(movie_id))
    return vec


def predict_rating(user_history, target_id, vec_dict=None):
    """Predict a user's rating for one movie from content similarity.

    The prediction is the similarity-weighted mean of the user's past
    ratings: ``sum(sim_i * rating_i) / sum(|sim_i|)``, where ``sim_i`` is the
    cosine similarity between the target movie vector and the vector of the
    i-th rated movie.

    Args:
        user_history: iterable of ``(movieId, rating)`` pairs.
        target_id: id of the movie to predict a rating for.
        vec_dict: optional ``{movie id: vector}`` mapping. Defaults to the
            broadcast ``b_movie_vec`` so existing two-argument callers (the
            Spark UDF) are unaffected.

    Returns:
        The predicted rating as a float, or 0.0 when the target movie has no
        usable vector or none of the rated movies contributes a similarity.
    """
    if vec_dict is None:
        vec_dict = b_movie_vec.value
    if not user_history:
        return 0.0

    target_raw = _lookup_movie_vec(vec_dict, target_id)
    if target_raw is None:
        return 0.0

    # The target vector and its norm do not change inside the loop.
    target = np.asarray(target_raw, dtype=float)
    target_norm = np.linalg.norm(target)
    if not target_norm > 0:
        # Every similarity would be 0, so the weighted mean is undefined.
        return 0.0

    num, den = 0.0, 0.0
    for mid, r in user_history:
        if r is None:
            continue
        v_raw = _lookup_movie_vec(vec_dict, mid)
        if v_raw is None:
            continue
        v = np.asarray(v_raw, dtype=float)
        scale = target_norm * np.linalg.norm(v)
        if not scale > 0:
            # Zero-length vector: similarity is treated as 0 and adds nothing.
            continue
        sim = float(np.dot(target, v) / scale)
        num += sim * r
        # `abs` is shadowed by pyspark.sql.functions.abs at the top of the
        # notebook, which cannot be applied to a Python float.
        den += builtins.abs(sim)
    return float(num / den) if den > 0 else 0.0

# Expose predict_rating to Spark SQL as a UDF that yields a FloatType column.
predict_udf = F.udf(f=predict_rating, returnType=FloatType())

In [None]:
# Pair every user's history with every candidate movie id.
# The DDL type string "int" replaces IntegerType(), which this notebook never
# imports. A list of ints with an atomic schema yields a single column named
# "value", which toDF then renames to "movieId".
# NOTE(review): `target_movie_ids` is not defined in any visible cell - it
# must be a list of integer movie ids created before this cell runs.
# The cross join produces (users x target movies) rows, so keep the list small.
to_predict = user_hist.crossJoin(
    spark.createDataFrame(target_movie_ids, "int").toDF("movieId")
)

In [None]:
# Append the content-based prediction for every (user history, movie) pair.
result = to_predict.withColumn(
    "predicted_rating", predict_udf(F.col("history"), F.col("movieId"))
)

In [26]:
# Peek at the first rating row.
# NOTE(review): the saved output below lists 20 rows, which does not match
# head(1) - it looks stale; re-run the cell to refresh it.
ratings.head(1)

[Row(userId=1, movieId=110, rating=1.0),
 Row(userId=1, movieId=147, rating=4.5),
 Row(userId=1, movieId=858, rating=5.0),
 Row(userId=1, movieId=1221, rating=5.0),
 Row(userId=1, movieId=1246, rating=5.0),
 Row(userId=1, movieId=1968, rating=4.0),
 Row(userId=1, movieId=2762, rating=4.5),
 Row(userId=1, movieId=2918, rating=5.0),
 Row(userId=1, movieId=2959, rating=4.0),
 Row(userId=1, movieId=4226, rating=4.0),
 Row(userId=1, movieId=4878, rating=5.0),
 Row(userId=1, movieId=5577, rating=5.0),
 Row(userId=1, movieId=33794, rating=4.0),
 Row(userId=1, movieId=54503, rating=3.5),
 Row(userId=1, movieId=58559, rating=4.0),
 Row(userId=1, movieId=59315, rating=5.0),
 Row(userId=1, movieId=68358, rating=5.0),
 Row(userId=1, movieId=69844, rating=5.0),
 Row(userId=1, movieId=73017, rating=5.0),
 Row(userId=1, movieId=81834, rating=5.0)]

In [50]:
def cosine(v1, v2):
    """Cosine similarity of two equal-length numeric sequences.

    Returns 0.0 when the product of the two norms is not positive (for
    example when either vector is all zeros), so the division is never
    attempted on a zero denominator.
    """
    import numpy as np

    a = np.array(v1)
    b = np.array(v2)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product > 0:
        return float(np.dot(a, b) / norm_product)
    return 0.0

def getvec(id):
    """Fetch the `vecs` value of the first CB row whose `id` equals ``id``.

    Each call filters the CB DataFrame and pulls one row to the driver.
    Returns None when no row matches.
    """
    matching = CB.filter(CB['id'] == id)
    first_row = matching.select("vecs").head()
    if first_row:
        return first_row["vecs"]
    return None
def calculate_weighted_sim_for_user_movie(user_id, movie_id):
    """Predict ``user_id``'s rating for ``movie_id`` from content similarity.

    Driver-side version of the similarity-weighted mean:
    ``sum(sim_i * rating_i) / sum(|sim_i|)`` over the movies the user rated,
    where ``sim_i`` is the cosine similarity between the target movie vector
    and the rated movie vector.

    Args:
        user_id: value matched against the ``userId`` column of ``ratings``.
        movie_id: id of the movie to predict a rating for.

    Returns:
        The predicted rating, or 0.0 when the target movie has no vector or
        no rated movie contributes a similarity.
    """
    # Vector of the movie to predict. Without it every row would be skipped,
    # so return before collecting ratings or issuing per-movie lookups.
    target_vec = getvec(movie_id)
    if target_vec is None:
        return 0.0

    # Movies this user has already rated.
    user_rows = ratings.filter(ratings["userId"] == user_id).collect()

    numerator = 0.0
    denominator = 0.0

    for row in user_rows:
        rating = row["rating"]
        if rating is None:
            # The rating column is nullable; a null cannot be weighted.
            continue

        # getvec issues one Spark lookup per rated movie.
        rated_vec = getvec(row["movieId"])
        if rated_vec is None:
            continue

        sim = cosine(target_vec, rated_vec)

        numerator += sim * rating
        # builtins.abs: the bare name `abs` is pyspark.sql.functions.abs here.
        denominator += builtins.abs(sim)

    return numerator / denominator if denominator > 0 else 0.0

27