In [1]:
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.types import StructType, StructField, BooleanType, StringType, IntegerType, DateType, FloatType,DoubleType,ArrayType,LongType
import logging
import sys
import traceback
import ast
import json
import os
import sys
import traceback
import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr,when,to_date ,udf, concat_ws,posexplode, from_json
from pyspark.sql.types import StructType, StructField, BooleanType, StringType, IntegerType, DateType, FloatType,DoubleType
from pyspark.sql import functions as F

In [2]:
# Build (or reuse) the Spark session used by every cell below.
#
# The MinIO endpoint and credentials are read from the environment so that
# secrets do not have to live in the notebook. The fallbacks keep the previous
# development values so the notebook still runs unchanged on the original setup.
# NOTE(review): drop the credential fallbacks once the environment variables
# are provisioned wherever this notebook runs.
spark = (
    SparkSession.builder
    .appName("MinIO with Delta CB")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.cores", "4")
    .config("spark.executor.cores", "4")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "1")
    .config("spark.dynamicAllocation.maxExecutors", "1")
    .config("spark.dynamicAllocation.initialExecutors", "1")
    .config("spark.sql.shuffle.partitions", "12")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "1g")
    .config("spark.jars", "jars/hadoop-aws-3.3.4.jar,jars/spark-sql-kafka-0-10_2.12-3.2.1.jar,jars/aws-java-sdk-bundle-1.12.262.jar,jars/delta-core_2.12-2.2.0.jar,jars/delta-storage-2.2.0.jar")
    # S3A access to the MinIO object store.
    .config("spark.hadoop.fs.s3a.endpoint", os.environ.get("MINIO_ENDPOINT", "http://minio:9000"))
    .config("spark.hadoop.fs.s3a.access.key", os.environ.get("MINIO_ACCESS_KEY", "conbo123"))
    .config("spark.hadoop.fs.s3a.secret.key", os.environ.get("MINIO_SECRET_KEY", "123conbo"))
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Delta Lake integration.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
    .config("delta.enable-non-concurrent-writes", "true")
    .config('spark.sql.warehouse.dir', "s3a://lakehouse/")
    .getOrCreate()
)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import IDF
from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import StopWordsRemover, VectorAssembler
from pyspark.ml.feature import RegexTokenizer, CountVectorizer
from pyspark.ml import PipelineModel


In [5]:
# Source table for the Word2Vec pipeline, read from the gold layer.
spark_df = (
    spark.read
    .format("delta")
    .load("s3a://lakehouse/gold/machineData")
)

In [None]:
# --- Text pipeline: tokenise -> drop stop words -> Word2Vec -----------------
# gaps=False makes `pattern` describe the tokens themselves rather than the
# separators. The pattern is a raw string so that `\w` reaches the regex
# engine untouched instead of being parsed as an (invalid) Python escape.
regexTokenizer = RegexTokenizer(
    gaps=False, pattern=r'\w+', inputCol='comb', outputCol='token')
stopWordsRemover = StopWordsRemover(
    inputCol='token', outputCol='nostopwrd')
# A fixed seed keeps the learned vectors reproducible between runs.
word2Vec = Word2Vec(vectorSize=150, minCount=3, windowSize=10,
                    inputCol='nostopwrd', outputCol='word_vec', seed=123)

# Fit on the full table and persist the fitted model to the lakehouse.
pipeline = Pipeline(stages=[regexTokenizer, stopWordsRemover, word2Vec])
pipeline_model = pipeline.fit(spark_df)
pipeline_model.write().overwrite().save(
    "s3a://lakehouse/model/" + 'pipeline_model')

# Reload the model from storage and compute one vector per row of spark_df.
pipeline_mdl = PipelineModel.load("s3a://lakehouse/model/" + 'pipeline_model')
new_df = pipeline_mdl.transform(spark_df)

# Pull (id, vector) pairs to the driver and turn each ML vector into a plain
# list of Python floats so it fits the ArrayType(DoubleType()) column below.
all_movies_vecs = new_df.select('id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()
data = [(id, [float(x) for x in vec]) for id, vec in all_movies_vecs]

schema = StructType([
    StructField("id", StringType(), True),
    StructField("vecs", ArrayType(DoubleType()), True)
])

# Persist the vectors as a Delta table, replacing any previous export.
all_movies_df = spark.createDataFrame(data, schema)
all_movies_df.write.format("delta").mode("overwrite").save("s3a://lakehouse/data/all_movies_delta")

In [None]:


# Re-export the document vectors using the pipeline model already saved in the
# lakehouse (no re-training in this cell).
pipeline_mdl = PipelineModel.load("s3a://lakehouse/model/" + 'pipeline_model')
new_df = pipeline_mdl.transform(spark_df)

# Collect (id, vector) pairs on the driver, then flatten every vector into a
# list of Python floats.
all_movies_vecs = (
    new_df.select('id', 'word_vec')
    .rdd
    .map(lambda row: (row[0], row[1]))
    .collect()
)
data = [(movie_id, list(map(float, vector))) for movie_id, vector in all_movies_vecs]

# Target layout: string id plus an array<double> column.
schema = (
    StructType()
    .add("id", StringType(), True)
    .add("vecs", ArrayType(DoubleType()), True)
)

# Overwrite the Delta table holding the exported vectors.
all_movies_df = spark.createDataFrame(data, schema)
(
    all_movies_df.write
    .format("delta")
    .mode("overwrite")
    .save("s3a://lakehouse/data/all_movies_delta")
)



In [13]:
# Read the exported vectors back and report how many rows were written.
df_raw = (
    spark.read
    .format("delta")
    .load("s3a://lakehouse/data/all_movies_delta")
)
df_raw.count()

45433

In [8]:
# Inspect the column layout of the word2vec Delta table.
df_list = (
    spark.read
    .format("delta")
    .load("s3a://lakehouse/data/word2vec")
)
df_list.printSchema()

root
 |-- id: string (nullable = true)
 |-- vecs: array (nullable = true)
 |    |-- element: double (containsNull = true)



In [None]:
import numpy as np
from pyspark.sql.functions import col, mean, lit, udf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.ml.linalg import Vectors

# Shared state for the helper functions below.
# df_merge / df_list hold the loaded Delta tables (nothing loaded yet).
df_merge = df_list = None
# flag1 is set by get_titles() once df_merge is loaded; flag2 is set by get_vecs().
flag1 = flag2 = False


def CosineSim(vec1, vec2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Args:
        vec1: First vector (anything ``np.dot`` accepts).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        float: ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when either
        vector has zero norm (the similarity is undefined in that case).
    """
    numerator = np.dot(vec1, vec2)
    denominator = np.sqrt(np.dot(vec1, vec1)) * np.sqrt(np.dot(vec2, vec2))
    # Always hand back a float: a bare int 0 here would mix ints and floats
    # in the score column built from these values.
    return float(numerator / denominator) if denominator != 0 else 0.0

def get_titles():
    """Return every value of the ``title`` column of the merged table as a list.

    The Delta table is loaded on the first call only; ``flag1`` records that
    ``df_merge`` has been populated so later calls reuse it.
    """
    global df_merge, flag1
    if not flag1:
        delta_reader = spark.read.format("delta")
        df_merge = delta_reader.load("s3a://lakehouse/gold/MergeData")
        flag1 = True
    titles_pdf = df_merge.select("title").toPandas()
    return list(titles_pdf["title"])

def get_vecs():
    """Return all stored vectors as a list of ``(int id, DenseVector)`` pairs.

    The Delta table is read and collected to the driver on the first call
    only; ``flag2`` marks the cache as populated, mirroring how ``flag1`` is
    used in ``get_titles``. Set ``flag2 = False`` to force a reload after the
    table has been rewritten.

    Returns:
        list[tuple[int, DenseVector]]: one entry per row of the table.

    Raises:
        ValueError / TypeError: if a row's ``id`` cannot be converted to int.
    """
    global df_list, all_vecs, flag2
    if not flag2:
        # NOTE(review): this path differs from the table written by the
        # training cell (s3a://lakehouse/data/all_movies_delta) - confirm it
        # is the intended source.
        df_list = spark.read.format("delta").load("s3a://lakehouse/data/hehe123")
        data_original = df_list.collect()
        all_vecs = [(int(row.id), Vectors.dense(row.vecs)) for row in data_original]
        flag2 = True
    return all_vecs

def recommendation(m_title, sim_mov_limit=10):
    """Recommend the movies whose vectors are most similar to ``m_title``.

    Args:
        m_title: Exact title to look up in the merged table.
        sim_mov_limit: Maximum number of recommendations to return.

    Returns:
        pandas.DataFrame with columns ``movies_id``, ``score`` and
        ``input_movies_id``, sorted by descending cosine similarity and
        excluding the input movie itself; or a ``str`` message when the title
        is unknown or no vector is stored for it.
    """
    global df_merge
    df_merge = spark.read.format("delta").load("s3a://lakehouse/gold/MergeData")

    # A single lookup both checks that the title exists and fetches its id.
    match = df_merge.filter(col("title") == m_title).select(col('id')).first()
    if match is None:
        return "Sorry! The movie you searched is not in our database. Please check the spelling or try another movie."

    no_vector_msg = "Sorry! No feature vector is available for this movie, so it cannot be used for recommendations."

    # get_vecs() keys its vectors by int id, so normalise the looked-up id to
    # int as well; otherwise an id stored as a string would never match.
    try:
        m_id = int(match[0])
    except (TypeError, ValueError):
        return no_vector_msg

    all_movies_vecs = get_vecs()
    input_vec = next((vec for vec_id, vec in all_movies_vecs if vec_id == m_id), None)
    if input_vec is None:
        return no_vector_msg

    # The vectors are already on the driver, so score and rank them locally
    # instead of shipping the scores to Spark just to sort and collect them.
    scored = [
        (vec_id, CosineSim(input_vec, vec))
        for vec_id, vec in all_movies_vecs
        if vec_id != m_id
    ]
    top_matches = sorted(scored, key=lambda pair: pair[1], reverse=True)[:max(sim_mov_limit, 0)]

    similar_movies = pd.DataFrame(top_matches, columns=["movies_id", "score"])
    similar_movies["input_movies_id"] = m_id
    return similar_movies

def getMovieDetails(in_mov):
    """Attach catalogue details and a weighted rating to recommended movies.

    The weighted rating is ``v / (v + m) * R + m / (m + v) * C`` where ``v`` is
    the movie's vote count, ``R`` its vote average, ``C`` the mean vote average
    of the merged table and ``m`` the approximate 70th percentile of the vote
    counts. Only movies with at least ``m`` votes are kept.

    Args:
        in_mov: Spark DataFrame with columns ``movies_id``, ``input_movies_id``
            and ``score``. A ``str`` (the error message that
            ``recommendation`` may return) is rejected.

    Returns:
        Spark DataFrame with columns ``movies_id``, ``input_movies_id``,
        ``title``, ``genres``, ``director_names``, ``cast_names``, ``score``
        and ``weighted_rating``, ordered by descending ``score``; or the
        string ``"Invalid input"`` when ``in_mov`` is a ``str``.
    """
    global df_merge
    # Reject bad input before running any Spark job.
    if isinstance(in_mov, str):
        return "Invalid input"

    # Do not depend on recommendation() having populated the shared table.
    if df_merge is None:
        df_merge = spark.read.format("delta").load("s3a://lakehouse/gold/MergeData")

    votes = col("vote_count")
    rating = col("vote_average")

    mean_rating = df_merge.filter(rating.isNotNull()).select(mean("vote_average")).first()[0]
    quantiles = df_merge.filter(votes.isNotNull()).approxQuantile("vote_count", [0.7], 0.001)
    # With no usable votes the statistics are missing; fall back to zeros so
    # the function still returns an (empty) result instead of crashing.
    C = mean_rating if mean_rating is not None else 0.0
    m = quantiles[0] if quantiles else 0.0

    # NOTE(review): vote_average is truncated to an integer before it enters
    # the formula, as in the original code - confirm the truncation is wanted.
    qualified = (
        df_merge
        .filter((votes >= m) & votes.isNotNull() & rating.isNotNull())
        .withColumn("vote_count", votes.cast("int"))
        .withColumn("vote_average", rating.cast("int"))
    )

    # Native column arithmetic replaces the former Python UDF; the cast keeps
    # the float column type the UDF used to produce.
    weighted_rating = (votes / (votes + lit(m))) * rating + (lit(m) / (lit(m) + votes)) * lit(C)
    qualified = qualified.withColumn("weighted_rating", weighted_rating.cast("float"))

    a = in_mov.alias("a")
    b = qualified.alias("b")

    # The result is ordered by similarity score, so sorting `qualified` by
    # weighted rating before the join is unnecessary.
    return (
        a.join(b, col("a.movies_id") == col("b.id"), 'inner')
        .orderBy(col("a.score").desc())
        .select(
            col("a.movies_id"),
            col("a.input_movies_id"),
            col("b.title"),
            col("b.genres"),
            col("b.director_names"),
            col("b.cast_names"),
            col("a.score"),
            col("b.weighted_rating"),
        )
    )

# if __name__ == "__main__":
#     movie_name = input("Enter a movie name: ")
#     recommendations = recommendation(movie_name, sim_mov_limit=5)
#     print("Recommended movies:")
#     print(recommendations)
if __name__ == "__main__":
    # Titles are matched exactly, so strip stray leading/trailing whitespace
    # from what the user typed.
    movie_name = input("Enter a movie name: ").strip()
    recommendations = recommendation(movie_name, sim_mov_limit=10)

    if isinstance(recommendations, str):  # a message came back instead of results
        print(recommendations)
    else:
        print("Recommended movies:")
        print(recommendations)

        # Coerce the pandas columns to the expected dtypes before handing the
        # data to Spark.
        recommendations = recommendations.astype({
            "movies_id": int,
            "input_movies_id": int,
            "score": float
        })

        # Schema of the Spark DataFrame built from the recommendations.
        schema = StructType([
            StructField("movies_id", IntegerType(), True),
            StructField("score", FloatType(), True),
            StructField("input_movies_id", IntegerType(), True)
        ])

        # Convert the pandas DataFrame to a list of dicts ...
        recommendations_list = recommendations.to_dict(orient="records")

        # ... and the list of dicts to a Spark DataFrame.
        recommendations_spark_df = spark.createDataFrame(recommendations_list, schema=schema)

        details = getMovieDetails(recommendations_spark_df)

        # Show the detailed information (converted to pandas for display).
        print("Detailed movie recommendations:")
        print(details.toPandas())




Enter a movie name:  Toy Story


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt
