In [1]:
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.types import StructType, StructField, BooleanType, StringType, IntegerType, DateType, FloatType,DoubleType,ArrayType,LongType
import logging
import sys
import traceback
import ast
import json
import os
import sys
import traceback
import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr,when,to_date ,udf, concat_ws,posexplode, from_json
from pyspark.sql.types import StructType, StructField, BooleanType, StringType, IntegerType, DateType, FloatType,DoubleType
from pyspark.sql import functions as F

In [2]:

# Build (or reuse) the Spark session used by every cell below: S3A access to
# MinIO plus the Delta Lake SQL extension and catalog.
#
# The object-store endpoint and credentials are read from the environment so
# they do not have to live in the notebook.
# NOTE(review): the literal fallbacks keep the notebook runnable as before;
# remove them once MINIO_ENDPOINT / MINIO_ACCESS_KEY / MINIO_SECRET_KEY are
# provided by the runtime.
spark = (
    SparkSession.builder
    .appName("MinIO with Delta Lake")
    .config("spark.jars", "jars/hadoop-aws-3.3.4.jar,jars/spark-sql-kafka-0-10_2.12-3.2.1.jar,jars/aws-java-sdk-bundle-1.12.262.jar,jars/delta-core_2.12-2.2.0.jar,jars/delta-storage-2.2.0.jar")
    .config("spark.hadoop.fs.s3a.endpoint", os.environ.get("MINIO_ENDPOINT", "http://minio:9000"))
    .config("spark.hadoop.fs.s3a.access.key", os.environ.get("MINIO_ACCESS_KEY", "conbo123"))
    .config("spark.hadoop.fs.s3a.secret.key", os.environ.get("MINIO_SECRET_KEY", "123conbo"))
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
    .config("delta.enable-non-concurrent-writes", "true")
    .config('spark.sql.warehouse.dir', "s3a://lakehouse/")
    .getOrCreate()
)

In [3]:
# Load the Delta table of (id, comb) rows that the feature-building cells below select from.
# NOTE(review): the variable is named "silver" but the path points at the gold layer — confirm which is intended.
silver_merge_df = spark.read.format("delta").load("s3a://lakehouse/gold/machineData")


In [4]:
# Row count of the loaded table (triggers a full scan).
silver_merge_df.count()

45435

In [5]:
# Preview the first rows; long "comb" strings are truncated by show()'s defaults.
silver_merge_df.show()

+---+--------------------+
| id|                comb|
+---+--------------------+
| 21|surfing surfboard...|
| 24|femaleyakuza gore...|
| 25|sniper golfwar pe...|
| 38|lovestory dejavu ...|
| 62|technology jupite...|
| 70|boxer boxingtrain...|
| 73|skinhead basketba...|
| 80|talking dialogue ...|
| 82|havanacuba basedo...|
| 90|weapon gunfight s...|
| 95|spacemarine savin...|
| 98|ancientworld glad...|
|113|child buddhistmon...|
|121|hobbit orcs wizar...|
|143|youngsoldier tren...|
|156|cleaninglady suic...|
|160|cinema train begi...|
|167|help mental doubt...|
|195|secretary emotion...|
|214|famoustheme shoti...|
+---+--------------------+
only showing top 20 rows



In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import IDF
from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import StopWordsRemover, VectorAssembler
from pyspark.ml.feature import RegexTokenizer, CountVectorizer

In [7]:
# Fit the text -> Word2Vec pipeline on the "comb" column and persist the fitted
# model so later cells can reload it instead of re-training.
spark_df = silver_merge_df.select('id', 'comb')

# gaps=False makes the pattern describe the tokens themselves (runs of word
# characters) rather than the separators. The pattern is a raw string so that
# "\w" is not parsed as an (invalid) Python escape sequence.
regexTokenizer = RegexTokenizer(
    gaps=False, pattern=r'\w+', inputCol='comb', outputCol='token')
stopWordsRemover = StopWordsRemover(
    inputCol='token', outputCol='nostopwrd')

# Fixed seed so repeated fits are reproducible.
word2Vec = Word2Vec(vectorSize=150, minCount=3, windowSize=10,
                    inputCol='nostopwrd', outputCol='word_vec', seed=123)

pipeline = Pipeline(stages=[regexTokenizer, stopWordsRemover, word2Vec])
pipeline_model = pipeline.fit(spark_df)
pipeline_model.write().overwrite().save("s3a://lakehouse/model/pipeline_model")

In [8]:
from pyspark.ml import PipelineModel


In [9]:
from pyspark.ml.functions import vector_to_array

# Score every movie with the saved pipeline and persist one (id, vecs) row per
# movie, where vecs is the Word2Vec document vector as array<double>.
spark_df = silver_merge_df.select('id', 'comb')
pipeline_mdl = PipelineModel.load("s3a://lakehouse/model/pipeline_model")
new_df = pipeline_mdl.transform(spark_df)

# Convert the ML vector column to a plain array on the executors. This avoids
# collecting every vector to the driver and re-uploading it through
# createDataFrame. The id is cast to string so the written table keeps the
# (id: string, vecs: array<double>) layout.
all_movies_df = new_df.select(
    col('id').cast('string').alias('id'),
    vector_to_array(col('word_vec')).alias('vecs'),
)
all_movies_df.write.format("delta").mode("overwrite").save("s3a://lakehouse/data/all_movies_delta")



In [10]:
# Module-level placeholders: two empty DataFrame slots and two boolean flags,
# all reset to their "nothing loaded yet" values.
df_merge = df_list = None
flag1 = flag2 = False

In [11]:
import numpy as np
from pyspark.sql.functions import col, mean, lit, udf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.ml.linalg import Vectors

# Module-level state shared by the helper functions below.
# df_merge / flag1: the merged-movies DataFrame and its "already loaded" flag (see get_titles).
# df_list / flag2: not read by any function in this cell.
df_merge = None
df_list = None
flag1 = False
flag2 = False


def CosineSim(vec1, vec2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Parameters
    ----------
    vec1, vec2 : sequence of numbers
        Anything ``numpy.dot`` accepts (lists, arrays, dense vectors).

    Returns
    -------
    float
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when either vector
        has zero norm. The result is always a Python ``float`` so that a column
        built from these scores has a single, consistent type.
    """
    denominator = np.sqrt(np.dot(vec1, vec1)) * np.sqrt(np.dot(vec2, vec2))
    if denominator == 0:
        # Similarity is undefined for a zero vector; report "no similarity".
        return 0.0
    return float(np.dot(vec1, vec2) / denominator)

def get_titles():
    """Return every value of the ``title`` column of the merged-movies table as a list.

    The table is read from Delta only on the first call; the module-level
    ``flag1`` records that ``df_merge`` has been populated.
    """
    global df_merge, flag1
    if not flag1:
        # Right-hand side is evaluated first, so a failed load leaves both globals untouched.
        df_merge, flag1 = (
            spark.read.format("delta").load("s3a://lakehouse/merge_data-movies/merged_data"),
            True,
        )
    titles_pdf = df_merge.select("title").toPandas()
    title_series = titles_pdf["title"]
    return list(title_series)

def get_vecs(path):
    """Read the Delta table at ``path`` and return its rows as ``(id, dense vector)`` pairs.

    Every row is collected to the driver; each row's ``vecs`` array is wrapped
    with ``Vectors.dense``.
    """
    stored_rows = spark.read.format("delta").load(path).collect()
    id_vector_pairs = []
    for stored_row in stored_rows:
        id_vector_pairs.append((stored_row.id, Vectors.dense(stored_row.vecs)))
    return id_vector_pairs

def recommendation(m_title, path, sim_mov_limit=5):
    """Return the movies whose stored vectors are most similar to ``m_title``'s.

    Parameters
    ----------
    m_title : str
        Exact movie title to look up in the merged-movies table.
    path : str
        Location of the Delta table of ``(id, vecs)`` rows to compare against.
    sim_mov_limit : int, default 5
        Maximum number of similar movies to return.

    Returns
    -------
    pandas.DataFrame or str
        A frame with columns ``movies_id``, ``score`` and ``input_movies_id``
        ordered by descending score, or a human-readable message when the title
        is unknown or has no stored vector.
    """
    global df_merge
    # Reloaded on every call; getMovieDetails reads this same global afterwards.
    df_merge = spark.read.format("delta").load("s3a://lakehouse/merge_data-movies/merged_data")

    # Single lookup: fetch the id directly instead of a count() followed by a
    # second filter + collect() over the same table.
    title_matches = df_merge.filter(col("title") == m_title).select(col("id")).take(1)
    if not title_matches:
        return "Sorry! The movie you searched is not in our database. Please check the spelling or try another movie."
    m_id = title_matches[0][0]

    all_movies_vecs = get_vecs(path)

    # The title can exist in the merged table without a row in the vector
    # table; report that instead of failing with an IndexError.
    input_vec = next((vec for vec_id, vec in all_movies_vecs if vec_id == m_id), None)
    if input_vec is None:
        return "Sorry! No feature vector is stored for this movie, so similar movies cannot be computed."

    # float(...) keeps the score column a single type even if the similarity
    # helper returns an int for degenerate vectors.
    similar_movies_rdd = spark.sparkContext.parallelize(
        [(vec_id, float(CosineSim(input_vec, vec))) for vec_id, vec in all_movies_vecs]
    )

    similar_movies_df = (
        spark.createDataFrame(similar_movies_rdd, ["movies_id", "score"])
        .filter(col("movies_id") != m_id)  # never recommend the query movie itself
        .orderBy(col("score").desc())
        .limit(sim_mov_limit)
        .withColumn("input_movies_id", lit(m_id))
    )
    return similar_movies_df.toPandas()

def getMovieDetails(in_mov):
    """Attach movie metadata and a weighted rating to a set of recommendations.

    Parameters
    ----------
    in_mov : Spark DataFrame or str
        Recommendation rows with columns ``movies_id``, ``score`` and
        ``input_movies_id``. A ``str`` (the error message produced by
        ``recommendation``) is rejected.

    Returns
    -------
    Spark DataFrame or str
        ``"Invalid input"`` when ``in_mov`` is a string. Otherwise the
        recommendations inner-joined with the qualifying rows of the global
        ``df_merge``, ordered by descending ``score``, with columns
        ``movies_id, input_movies_id, title, genres_convert, director,
        cast_names, score, weighted_rating``.

    Notes
    -----
    Relies on the module-level ``df_merge`` having been loaded (``recommendation``
    does this). Movies below the 70th percentile of ``vote_count`` do not
    qualify and are dropped by the inner join.
    """
    global df_merge

    # Validate first so a bad argument does not trigger any Spark job.
    if isinstance(in_mov, str):
        return "Invalid input"

    # C: mean vote over all rated movies; m: vote-count threshold (70th percentile).
    C = (
        df_merge.filter(col("vote_average").isNotNull())
        .select(mean("vote_average"))
        .collect()[0][0]
    )
    m = (
        df_merge.filter(col("vote_count").isNotNull())
        .select(col("vote_count"))
        .approxQuantile("vote_count", [0.7], 0.001)[0]
    )

    votes = col("vote_count")
    rating = col("vote_average")
    # Weighted rating: v/(v+m) * R + m/(m+v) * C, computed with native column
    # arithmetic (no Python UDF round-trip; a zero denominator yields null
    # instead of failing the job).
    weighted_rating = (
        (votes / (votes + lit(m)) * rating) + (lit(m) / (lit(m) + votes) * lit(C))
    ).cast("float")

    # NOTE(review): vote_average is truncated to int before weighting, exactly
    # as before — confirm that the loss of the fractional part is intended.
    qualified = (
        df_merge
        .filter((col("vote_count") >= m) & col("vote_average").isNotNull())
        .withColumn("vote_count", col("vote_count").cast("int"))
        .withColumn("vote_average", col("vote_average").cast("int"))
        .withColumn("weighted_rating", weighted_rating)
    )

    a = in_mov.alias("a")
    b = qualified.alias("b")

    return (
        a.join(b, col("a.movies_id") == col("b.id"), 'inner')
        .select(
            col("a.movies_id"),
            col("a.input_movies_id"),
            col("b.title"),
            col("b.genres_convert"),
            col("b.director"),
            col("b.cast_names"),
            col("a.score"),
            col("b.weighted_rating"),
        )
        .orderBy(col("score").desc())
    )

# if __name__ == "__main__":
#     movie_name = input("Enter a movie name: ")
#     recommendations = recommendation(movie_name, sim_mov_limit=5)
#     print("Recommended movies:")
#     print(recommendations)
def haha(path):
    """Prompt for a movie title and print its recommendations and their details.

    ``path`` is the Delta location of the (id, vecs) table handed to
    ``recommendation``. Nothing is returned; all results are printed.
    """
    movie_name = input("Enter a movie name: ")
    result = recommendation(movie_name, path, sim_mov_limit=5)

    # A string result is a message (e.g. the movie was not found), not data.
    if isinstance(result, str):
        print(result)
        return

    print("Recommended movies:")
    print(result)

    # Normalise the pandas dtypes before handing the rows to Spark.
    typed_pdf = result.astype({
        "movies_id": int,
        "input_movies_id": int,
        "score": float
    })

    # Explicit schema for the Spark DataFrame.
    spark_schema = StructType([
        StructField("movies_id", IntegerType(), True),
        StructField("score", FloatType(), True),
        StructField("input_movies_id", IntegerType(), True)
    ])

    # pandas -> list of dicts -> Spark DataFrame.
    records = typed_pdf.to_dict(orient="records")
    recs_sdf = spark.createDataFrame(records, schema=spark_schema)

    details = getMovieDetails(recs_sdf)

    # Show the enriched rows; converted to pandas for display.
    print("Detailed movie recommendations:")
    print(details.toPandas())




In [12]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, DoubleType
from pyspark.ml.linalg import Vectors
import numpy as np

from pyspark.ml.functions import vector_to_array

# Step 1: input data
spark_df = silver_merge_df.select('id', 'comb')

# Step 2: tokenizer.
# gaps=False makes the pattern match the tokens themselves. With the default
# (gaps=True) '\w+' is treated as the *delimiter*, so the "tokens" would be the
# whitespace between words rather than the words.
regexTokenizer = RegexTokenizer(inputCol='comb', outputCol='token', pattern='\\w+', gaps=False)
token_df = regexTokenizer.transform(spark_df)

# Step 3: remove stop words
remover = StopWordsRemover(inputCol='token', outputCol='nostopwrd')
nostop_df = remover.transform(token_df)

# Step 4: term counts
# NOTE(review): CountVectorizer's vocabulary size is left at its default; the
# dense arrays written in step 6 have one slot per vocabulary term, so consider
# capping vocabSize / raising minDF if the stored table becomes too large.
cv = CountVectorizer(inputCol='nostopwrd', outputCol='raw_features')
cv_model = cv.fit(nostop_df)
featurized_df = cv_model.transform(nostop_df)

# Step 5: TF-IDF
idf = IDF(inputCol='raw_features', outputCol='tfidf_vec')
idf_model = idf.fit(featurized_df)
rescaled_df = idf_model.transform(featurized_df)

# Step 6: store one (id, vecs) row per movie.
# The sparse vector is expanded to array<double> on the executors, so nothing
# is collected to the driver. The id is cast to string to keep the
# (id: string, vecs: array<double>) layout of the stored table.
all_movies_df = rescaled_df.select(
    col("id").cast("string").alias("id"),
    vector_to_array(col("tfidf_vec")).alias("vecs"),
)
all_movies_df.write.format("delta").mode("overwrite").save("s3a://lakehouse/data/all_movies_tfidf_delta")


In [15]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType

# Steps 1-3: input, tokenizer and stop-word removal.
# Select the input here instead of relying on a spark_df left over from an
# earlier cell, so this cell runs on a fresh kernel.
spark_df = silver_merge_df.select('id', 'comb')

# gaps=False makes the pattern match the tokens themselves. With the default
# (gaps=True) '\w+' is treated as the *delimiter*, so the "tokens" would be the
# whitespace between words rather than the words.
regexTokenizer = RegexTokenizer(inputCol='comb', outputCol='tokens', pattern='\\w+', gaps=False)
remover = StopWordsRemover(inputCol="tokens", outputCol="nostopwrd")

token_df = regexTokenizer.transform(spark_df)
nostop_df = remover.transform(token_df)

# Step 4: hashed term frequencies (fixed 10000-dimensional feature space)
hashingTF = HashingTF(inputCol="nostopwrd", outputCol="raw_features", numFeatures=10000)
hashed_df = hashingTF.transform(nostop_df)

# Step 5: TF-IDF
idf = IDF(inputCol="raw_features", outputCol="hashing_tfidf_vec")
idf_model = idf.fit(hashed_df)
rescaled_df = idf_model.transform(hashed_df)

# Step 6: UDF converting an ML vector into a plain list of doubles
@udf(returnType=ArrayType(DoubleType()))
def to_array(v):
    """Return the vector's values as a dense Python list of floats."""
    return v.toArray().tolist()

# Step 7: add 'vecs' holding the TF-IDF vector as a list, then write (id, vecs) to Delta
final_df = rescaled_df.withColumn("vecs", to_array("hashing_tfidf_vec")).select("id", "vecs")

final_df.write.format("delta").mode("overwrite").save("s3a://lakehouse/data/all_movies_hashingtfidf_delta")


In [26]:
# Interactive run: recommendations computed from the vectors stored in the TF-IDF table.
haha("s3a://lakehouse/data/all_movies_tfidf_delta")

Enter a movie name:  The Fury


Recommended movies:
  movies_id     score input_movies_id
0       170  0.984405           12611
1     22825  0.981441           12611
2    168098  0.979164           12611
3      5876  0.971829           12611
4     11336  0.970294           12611
Detailed movie recommendations:
   movies_id  input_movies_id          title                  genres_convert   
0        170            12611  28 Days Later  Horror Thriller ScienceFiction  \
1      22825            12611        The Box         Thriller ScienceFiction   
2     168098            12611           Cell  Horror ScienceFiction Thriller   
3       5876            12611       The Mist  ScienceFiction Horror Thriller   
4      11336            12611  The Dead Zone  Horror ScienceFiction Thriller   

          director                                  cast_names     score   
0       DannyBoyle      CillianMurphy NaomieHarris NoahHuntley  0.984405  \
1     RichardKelly      CameronDiaz JamesMarsden FrankLangella  0.981441   
2      TodW

In [27]:
# Interactive run: recommendations computed from the vectors stored in the all_movies_delta table.
haha("s3a://lakehouse/data/all_movies_delta")

Enter a movie name:  The Fury


Recommended movies:
  movies_id     score input_movies_id
0       170  0.984405           12611
1     22825  0.981441           12611
2    168098  0.979164           12611
3      5876  0.971829           12611
4     11336  0.970294           12611
Detailed movie recommendations:
   movies_id  input_movies_id          title                  genres_convert   
0        170            12611  28 Days Later  Horror Thriller ScienceFiction  \
1      22825            12611        The Box         Thriller ScienceFiction   
2     168098            12611           Cell  Horror ScienceFiction Thriller   
3       5876            12611       The Mist  ScienceFiction Horror Thriller   
4      11336            12611  The Dead Zone  Horror ScienceFiction Thriller   

          director                                  cast_names     score   
0       DannyBoyle      CillianMurphy NaomieHarris NoahHuntley  0.984405  \
1     RichardKelly      CameronDiaz JamesMarsden FrankLangella  0.981441   
2      TodW

In [24]:
# Spot check: print the full, untruncated stored vector for one movie id.
df = spark.read.format("delta").load("s3a://lakehouse/data/all_movies_delta")
df.where(df["id"] == "347155").show(truncate=False)


+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [25]:
# Spot check: print the full, untruncated stored TF-IDF vector for one movie id.
df = spark.read.format("delta").load("s3a://lakehouse/data/all_movies_tfidf_delta")
df.where(df["id"] == "347155").show(truncate=False)


+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------