In [1]:
import math
import os

from delta.tables import DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat_ws
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, VectorAssembler
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, ArrayType, DateType
from pyspark.sql.functions import (
    col, from_json, explode, to_date, date_format,
    dayofweek, dayofmonth, dayofyear, weekofyear,
    month, quarter, year, when, unix_timestamp,concat_ws
)
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, Word2Vec
from pyspark.ml import Pipeline


In [2]:
# Build (or reuse) the Spark session used by every cell below. It is configured
# for Delta Lake tables stored in an S3-compatible object store reached via s3a.
#
# The endpoint and credentials are read from environment variables so that real
# secrets do not have to live in the notebook. The fallbacks are the values that
# were previously hard-coded here, which keeps existing setups working.
# NOTE(review): remove the fallback credentials once the variables are provisioned.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "conbo123")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "123conbo")

# Extra jars handed to Spark as a single comma-separated list.
SPARK_JARS = ",".join([
    "jars/hadoop-aws-3.3.4.jar",
    "jars/spark-sql-kafka-0-10_2.12-3.2.1.jar",
    "jars/aws-java-sdk-bundle-1.12.262.jar",
    "jars/delta-core_2.12-2.2.0.jar",
    "jars/delta-storage-2.2.0.jar",
])

spark = (
    SparkSession.builder
    .appName("MinIO with Delta Lake")
    .config("spark.jars", SPARK_JARS)
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
    .config("delta.enable-non-concurrent-writes", "true")
    .config("spark.sql.warehouse.dir", "s3a://lakehouse/")
    .getOrCreate()
)

In [3]:
# Read the Delta table stored at s3a://lakehouse/gold/machineData into a DataFrame.
df = (
    spark.read
    .format("delta")
    .load("s3a://lakehouse/gold/machineData")
)


In [4]:
# Display the number of rows in the loaded DataFrame.
df.count()

7368

In [5]:
# Show the first row (as a one-element list of Row) to inspect the columns.
df.take(1)

[Row(id=11, budget=11000000, popularity=42.149697, revenue=775398007.0, vote_average=8.1, vote_count=6778.0, date_id=19770525, language="[{'iso_639_1': 'en', 'name': 'English'}]", overview='Princess Leia is captured and held hostage by the evil Imperial forces in their effort to take over the galactic Empire. Venturesome Luke Skywalker and dashing captain Han Solo team together with the loveable robot duo R2-D2 and C-3PO to rescue the beautiful princess and restore peace and justice in the Empire.', tagline='A long time ago in a galaxy far, far away...', runtime=121.0, cast_names=['David Ankrum', 'Marcus Powell', 'Jerry Walter', 'Peter Cushing', 'Shane Rimmer', 'Warwick Diamond', 'Roy Straite', 'Mahjoub', 'Tom Sylla', 'Grant McCune', 'Harold Weed', 'Colin Higgins', 'Alfie Curtis', 'Jack Klaff', 'Frank Henson', 'Peter Sturgeon', 'Mandy Morton', 'Jeremy Sinden', 'William Hootkins', 'Larry Ward', 'Gilda Cohen', 'Peter Sumner', 'Joe Johnston', 'Ted Gagliano', 'Pam Rose', 'James Earl Jones'

In [8]:
# Discard every row where either required column is null.
required_columns = ["overview", "keyword_names"]
df = df.dropna(how="any", subset=required_columns)


In [32]:
# Replace null `overview` values with an empty string.
# NOTE(review): when the notebook runs top to bottom, rows with a null overview
# have already been dropped by the preceding dropna cell, so this looks
# redundant - confirm before removing.
df = df.fillna("", subset=["overview"])

In [11]:
# Join the keyword, cast, director and genre columns into one space-separated
# string column named `comb`.
_comb_sources = [col(name) for name in ("keyword_names", "cast_names", "director_names", "genres")]
df = df.withColumn("comb", concat_ws(" ", *_comb_sources))


In [15]:
# Keep only the combined text column; every other column is dropped from `df`.
df = df.select(col("comb"))

In [16]:
# Show the first row to check the combined text.
df.take(1)

[Row(comb="hoodlum bet woman director los angeles new year's eve witch sperm episode film hotel hotel room Allison Anders Robert Rodriguez Alexandre Rockwell Quentin Tarantino Crime Comedy Patricia Vonne Salma Hayek Kimberly Blair Marc Lawrence Ione Skye Antonio Banderas Sammi Davis Amanda de Cadenet Lana McKissack Bruce Willis Jennifer Beals Kathy Griffin Lawrence Bender Lili Taylor Alicia Witt Danny Verduzco David Proval Marisa Tomei Valeria Golino Madonna Paul Calderon Tamlyn Tomita Quentin Tarantino Tim Roth")]

In [46]:
# Tokenize the `comb` text column into `comb_tokens`.
tokenizer = Tokenizer(
    inputCol="comb",
    outputCol="comb_tokens",
)
df = tokenizer.transform(df)

In [47]:
# Remove stop words from the tokens produced by the tokenizer cell.
# The input must be `comb_tokens`: at this point `df` holds only `comb` and
# `comb_tokens`, so the previous inputCol ("overview_tokens") named a column that
# does not exist when the notebook is run top to bottom.
# The output column name is unchanged because the Word2Vec cell reads it.
stopword_remover = StopWordsRemover(inputCol="comb_tokens", outputCol="filtered_overview")
df = stopword_remover.transform(df)


In [48]:
# Fit a Word2Vec model on the filtered tokens and append `overview_vector`.
word2vec = Word2Vec(
    inputCol="filtered_overview",
    outputCol="overview_vector",
    vectorSize=100,
    minCount=1,
)
model = word2vec.fit(df)
df = model.transform(df)


In [36]:
# Show the first row of the transformed DataFrame.
df.take(1)

[Row(id=12, budget=94000000, popularity=25.497794, revenue=940335536.0, vote_average=7.6, vote_count=6292.0, date_id=20030530, language="[{'iso_639_1': 'en', 'name': 'English'}]", overview="Nemo, an adventurous young clownfish, is unexpectedly taken from his Great Barrier Reef home to a dentist's office aquarium. It's up to his worrisome father Marlin and a friendly but forgetful fish Dory to bring Nemo home -- meeting vegetarian sharks, surfer dude turtles, hypnotic jellyfish, hungry seagulls, and more along the way.", tagline="There are 3.7 trillion fish in the ocean. They're looking for one.", runtime=100.0, cast_names=['Geoffrey Rush', 'Andrew Stanton', 'Bruce Spence', 'Erica Beck', 'Willem Dafoe', 'Alexander Gould', 'Albert Brooks', 'Joe Ranft', 'Bill Hunter', 'Brad Garrett', 'Barry Humphries', 'Allison Janney', 'Elizabeth Perkins', 'Vicki Lewis', 'Stephen Root', 'Erik Per Sullivan', 'John Ratzenberger', 'LuLu Ebeling', 'Nicholas Bird', 'Ellen DeGeneres', 'Austin Pendleton', 'Bob 

In [56]:
# Reload the source table and build one embedding from all textual fields.
df = spark.read.format("delta").load("s3a://lakehouse/gold/machineData")

# Join overview, tagline, genres, cast and director names into a single
# space-separated string column.
df = df.withColumn(
    "combined_content",
    concat_ws(
        " ",
        df.overview,
        df.tagline,
        df.genres,
        df.cast_names,
        df.director_names,
    ),
)

# Tokenize the combined text, then strip stop words.
tokenizer = Tokenizer(inputCol="combined_content", outputCol="combined_tokens")
df = tokenizer.transform(df)

stopword_remover = StopWordsRemover(inputCol="combined_tokens", outputCol="filtered_content")
df = stopword_remover.transform(df)

# Fit Word2Vec on the filtered tokens and append `content_vector`.
word2vec = Word2Vec(vectorSize=100, minCount=1, inputCol="filtered_content", outputCol="content_vector")
model = word2vec.fit(df)
df = model.transform(df)

# Keep the source text columns together with the learned vector. The earlier
# selection listed only the raw columns, which discarded `content_vector` and
# made the Word2Vec fit above pointless.
df_2 = df.select("overview", "tagline", "genres", "cast_names", "director_names", "content_vector")



In [57]:
# Show the first row of the selected columns.
df_2.take(1)

[Row(content_vector=DenseVector([0.0093, 0.0147, -0.0169, -0.0412, 0.0527, -0.0067, -0.0193, -0.0048, -0.0595, -0.012, -0.0076, 0.0292, -0.0386, 0.0514, 0.0319, 0.0299, -0.0013, 0.0053, -0.0111, -0.0354, -0.0458, 0.0165, 0.0418, -0.0043, -0.0163, -0.0312, 0.0081, -0.0212, -0.0749, -0.0128, -0.0145, -0.0088, 0.0333, -0.0321, 0.0248, 0.0023, -0.0076, 0.0028, -0.0119, 0.0232, -0.0223, -0.0209, 0.0052, -0.0034, 0.012, 0.0019, -0.0153, 0.0362, -0.0086, -0.0008, -0.0469, 0.0499, -0.033, -0.0346, -0.0594, 0.0183, -0.0268, 0.001, 0.0404, 0.0094, 0.0151, -0.0108, -0.0064, -0.0082, 0.018, 0.0067, -0.0039, -0.0366, -0.0264, 0.0038, -0.0012, 0.0315, -0.0087, 0.0014, 0.0139, 0.0114, 0.0815, -0.0356, 0.02, 0.0165, -0.0272, -0.0619, -0.0046, -0.0629, -0.0452, -0.0627, 0.0098, 0.0253, -0.0041, -0.004, -0.0069, 0.001, 0.0419, 0.0186, 0.0197, -0.0218, 0.0165, -0.0335, 0.0321, 0.001]))]

In [62]:
def _embed_text_column(frame, input_col, prefix, vector_size=100, min_count=1):
    """Tokenize a string column, drop stop words, and embed it with Word2Vec.

    Adds three columns to ``frame``: ``<prefix>_tokens``, ``<prefix>_filtered``
    and ``<prefix>_vector``.

    Args:
        frame: DataFrame that contains ``input_col``.
        input_col: Name of the string column to process.
        prefix: Stem used to name the three generated columns.
        vector_size: Value passed to Word2Vec as ``vectorSize``.
        min_count: Value passed to Word2Vec as ``minCount``.

    Returns:
        A tuple ``(frame, tokenizer, stopwords, word2vec, model)``: the
        transformed DataFrame followed by the stages that produced it, so
        callers can keep references to the fitted model and its stages.
    """
    tokens_col = f"{prefix}_tokens"
    filtered_col = f"{prefix}_filtered"
    vector_col = f"{prefix}_vector"

    tokenizer = Tokenizer(inputCol=input_col, outputCol=tokens_col)
    frame = tokenizer.transform(frame)

    stopwords = StopWordsRemover(inputCol=tokens_col, outputCol=filtered_col)
    frame = stopwords.transform(frame)

    word2vec = Word2Vec(vectorSize=vector_size, minCount=min_count, inputCol=filtered_col, outputCol=vector_col)
    model = word2vec.fit(frame)
    frame = model.transform(frame)

    return frame, tokenizer, stopwords, word2vec, model


# Reload the source table and drop rows without an overview.
df = spark.read.format("delta").load("s3a://lakehouse/gold/machineData")
df = df.dropna(subset=["overview"])

# Join each names/genres column into a single space-separated string so it can
# be tokenized like ordinary text.
df = df.withColumn("cast_names_str", concat_ws(" ", "cast_names"))
df = df.withColumn("director_names_str", concat_ws(" ", "director_names"))
df = df.withColumn("genre_names_str", concat_ws(" ", "genres"))

df = df.select("overview", "tagline", "genre_names_str", "cast_names_str", "director_names_str")

# Process overview
df, overview_tokenizer, overview_stopwords, word2vec_overview, model_overview = _embed_text_column(
    df, "overview", "overview"
)

# Process cast_names
df, cast_tokenizer, cast_stopwords, word2vec_cast, model_cast = _embed_text_column(
    df, "cast_names_str", "cast"
)

# Process genres
df, genre_tokenizer, genre_stopwords, word2vec_genre, model_genre = _embed_text_column(
    df, "genre_names_str", "genre"
)

# Process director
df, director_tokenizer, director_stopwords, word2vec_director, model_director = _embed_text_column(
    df, "director_names_str", "director"
)


In [66]:
# Keep only the four embedding columns and show the first row.
_vector_columns = ["overview_vector", "cast_vector", "genre_vector", "director_vector"]
df2 = df.select(*_vector_columns)
df2.take(1)

[Row(overview_vector=DenseVector([-0.0125, -0.0016, -0.0066, -0.02, -0.0058, -0.0152, 0.0071, 0.0036, -0.006, -0.0031, 0.0012, 0.0047, 0.005, 0.0077, 0.0086, 0.0051, -0.0069, 0.0318, -0.0104, -0.002, -0.0037, 0.0117, 0.0029, -0.0041, 0.0037, -0.0115, -0.0011, 0.0125, -0.0128, 0.011, 0.0067, -0.0038, 0.0173, -0.0026, 0.0246, -0.0057, -0.0113, -0.0048, -0.0199, 0.0152, -0.004, -0.0025, -0.0125, -0.0097, -0.0108, 0.0125, -0.0272, 0.0175, 0.0083, 0.003, -0.0152, 0.0115, -0.0014, 0.0213, -0.0128, -0.0128, 0.0102, 0.0043, 0.016, -0.0122, 0.0215, -0.0211, 0.0096, -0.0063, -0.0104, 0.0114, -0.0057, 0.0082, -0.0114, 0.0188, 0.0092, 0.0071, -0.0013, -0.0304, -0.0119, 0.0179, -0.0059, -0.0196, 0.0073, 0.0111, -0.0035, -0.0025, 0.009, 0.0135, -0.013, -0.0021, -0.0115, -0.0031, -0.0195, 0.0127, -0.0092, -0.0091, -0.0123, -0.0181, -0.0281, 0.0027, 0.0055, -0.0109, -0.0089, 0.0112]), cast_vector=DenseVector([0.0039, 0.0038, 0.0017, -0.0214, 0.021, -0.005, -0.0022, -0.0256, -0.0107, 0.0049, -0.0006, -