In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat_ws
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.sql import functions as F
import math
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, ArrayType, DateType
from pyspark.sql.functions import (
    col, from_json, explode, to_date, date_format,
    dayofweek, dayofmonth, dayofyear, weekofyear,
    month, quarter, year, when, unix_timestamp
)
from delta.tables import DeltaTable
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, Word2Vec
from pyspark.ml import Pipeline


In [9]:
spark = SparkSession.builder \
    .appName("MinIO with Delta Lake") \
    .config("spark.jars", "jars/hadoop-aws-3.3.4.jar,jars/spark-sql-kafka-0-10_2.12-3.2.1.jar,jars/aws-java-sdk-bundle-1.12.262.jar,jars/delta-core_2.12-2.2.0.jar,jars/delta-storage-2.2.0.jar")\
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000") \
    .config("spark.hadoop.fs.s3a.access.key", "conbo123") \
    .config("spark.hadoop.fs.s3a.secret.key", "123conbo") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore") \
    .config("delta.enable-non-concurrent-writes", "true") \
    .config('spark.sql.warehouse.dir', "s3a://lakehouse/") \
    .getOrCreate()

In [23]:
df = spark.read.format("delta").load("s3a://lakehouse/gold/machineData")


In [24]:
df.count()

7368

In [26]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- budget: integer (nullable = true)
 |-- popularity: double (nullable = true)
 |-- revenue: double (nullable = true)
 |-- vote_average: double (nullable = true)
 |-- vote_count: double (nullable = true)
 |-- date_id: long (nullable = true)
 |-- language: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- runtime: double (nullable = true)
 |-- cast_names: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- director_names: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [12]:
data = spark.read.format("delta").load("s3a://lakehouse/gold/machineData")


In [13]:
data.count()

7368

In [14]:
fact_movies = spark.read.format("delta").load("s3a://lakehouse/gold/fact_movies")
fact_movies.createOrReplaceTempView("fact_movies")
fact_movies.printSchema()

root
 |-- id: integer (nullable = true)
 |-- budget: integer (nullable = true)
 |-- popularity: double (nullable = true)
 |-- revenue: double (nullable = true)
 |-- vote_average: double (nullable = true)
 |-- vote_count: double (nullable = true)
 |-- date_id: long (nullable = true)



In [15]:
dim_movie = spark.read.format("delta").load("s3a://lakehouse/gold/dim_movie")
dim_movie.createOrReplaceTempView("dim_movie")
dim_movie.printSchema()

root
 |-- id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- language: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- runtime: double (nullable = true)
 |-- tagline: string (nullable = true)
 |-- status: string (nullable = true)
 |-- homepage: string (nullable = true)



In [16]:
dim_cast = spark.read.format("delta").load("s3a://lakehouse/gold/dim_cast")
dim_cast.createOrReplaceTempView("dim_cast")
dim_cast.printSchema()

root
 |-- name: string (nullable = true)
 |-- gender: integer (nullable = true)
 |-- profile_path: string (nullable = true)
 |-- id: integer (nullable = true)



In [17]:
movie_cast = spark.read.format("delta").load("s3a://lakehouse/gold/movie_cast")
movie_cast.createOrReplaceTempView("movie_cast")
movie_cast.printSchema()

root
 |-- cast_id: integer (nullable = true)
 |-- character: string (nullable = true)
 |-- movie_id: long (nullable = true)



In [18]:
dim_crew = spark.read.format("delta").load("s3a://lakehouse/gold/dim_crew")
dim_crew.createOrReplaceTempView("dim_crew")
dim_crew.printSchema()

root
 |-- name: string (nullable = true)
 |-- gender: integer (nullable = true)
 |-- profile_path: string (nullable = true)
 |-- id: integer (nullable = true)



In [19]:
movie_crew = spark.read.format("delta").load("s3a://lakehouse/gold/movie_crew")
movie_crew.createOrReplaceTempView("movie_crew")
movie_crew.printSchema()

root
 |-- crew_id: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- department: string (nullable = true)
 |-- movie_id: long (nullable = true)



In [20]:
dim_genre = spark.read.format("delta").load("s3a://lakehouse/gold/dim_genre")
dim_genre.createOrReplaceTempView("dim_genre")
dim_genre.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)



In [21]:
movie_genre = spark.read.format("delta").load("s3a://lakehouse/gold/movie_genre")
movie_genre.createOrReplaceTempView("movie_genre")
movie_genre.printSchema()

root
 |-- genres_id: integer (nullable = true)
 |-- id: integer (nullable = true)



In [22]:
df = spark.sql("""
               SELECT distinct fm.id
               FROM fact_movies as fm
               JOIN movie_crew as mcr on fm.id = mcr.movie_id and mcr.job = 'Director'
               
              """)

# df = spark.sql("""
#                SELECT distinct movie_id
#                FROM movie_crew
               
#               """)

# df = spark.sql("""
#                SELECT distinct id
#                FROM dim_movie
               
#               """)

# df = spark.read.format("parquet").load("s3a://lakehouse/bronze/movies.parquet")

# df = df.filter(
#                                 (col("budget") != "0") &            # loại bỏ dòng có budget là "0" (kiểu string)
#                                 (col("id").isNotNull()) &           # loại bỏ dòng có id là null
#                                 (col("revenue") != 0) &             # loại bỏ dòng có revenue bằng 0
#                                 (col("vote_average") != 0) &        # loại bỏ dòng có vote_average bằng 0
#                                 (col("vote_count") != 0) &          # loại bỏ dòng có vote_count bằng 0
#                                 (col("popularity") != "0") &        # loại bỏ dòng có popularity là "0" (kiểu string)
#                                 (col("release_date").isNotNull()) & # loại bỏ dòng có release_date là null
#                                 (col("runtime") != 0)               # loại bỏ dòng có runtime bằng 0
#                             )
#df = spark.read.format("delta").load("s3a://lakehouse/silver/movies")


In [None]:
df.count()

In [14]:
# df = spark.sql("""
#                SELECT distinct fm.*, dm.language, dm.overview, dm.tagline, dm.runtime, dca.name as cast_name, dcr.name as director_name, dg.name as genre
#                FROM fact_movies as fm
#                JOIN dim_movie as dm on fm.id = dm.id
#                JOIN movie_cast as mca on fm.id = mca.movie_id
#                JOIN dim_cast as dca on dca.id = mca.cast_id
#                JOIN movie_crew as mcr on fm.id = mcr.movie_id and mcr.job = 'Director'
#                JOIN dim_crew as dcr on dcr.id = mcr.crew_id
#                JOIN movie_genre as mg on fm.id = mg.id
#                JOIN dim_genre as dg on dg.id = mg.genres_id
#               """)

In [24]:
# df = spark.sql("""
#     SELECT fm.id, fm.budget, fm.popularity, fm.revenue, 
#            fm.vote_average, fm.vote_count, fm.date_id, 
#            dm.language, dm.overview, dm.tagline, dm.runtime,
#            COLLECT_SET(dca.name) AS cast_names,  -- Gộp diễn viên thành danh sách
#            COLLECT_SET(dcr.name) AS director_names,  -- Gộp đạo diễn thành danh sách
#            COLLECT_SET(dg.name) AS genres  -- Gộp thể loại thành danh sách
#     FROM fact_movies AS fm
#     JOIN dim_movie AS dm ON fm.id = dm.id
#     JOIN movie_cast AS mca ON fm.id = mca.movie_id
#     JOIN dim_cast AS dca ON dca.id = mca.cast_id
#     JOIN movie_crew AS mcr ON fm.id = mcr.movie_id AND mcr.job = 'Director'
#     JOIN dim_crew AS dcr ON dcr.id = mcr.crew_id
#     JOIN movie_genre AS mg ON fm.id = mg.id
#     JOIN dim_genre AS dg ON dg.id = mg.genres_id
#     GROUP BY fm.id, fm.budget, fm.popularity, fm.revenue, 
#              fm.vote_average, fm.vote_count, fm.date_id, 
#              dm.language, dm.overview, dm.tagline, dm.runtime
# """)


In [None]:
df.count()

In [111]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- budget: integer (nullable = true)
 |-- popularity: double (nullable = true)
 |-- revenue: double (nullable = true)
 |-- vote_average: double (nullable = true)
 |-- vote_count: double (nullable = true)
 |-- date_id: integer (nullable = true)
 |-- language: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- runtime: double (nullable = true)
 |-- cast_names: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- director_names: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- genres: array (nullable = false)
 |    |-- element: string (containsNull = false)



In [120]:
#df = df.fillna({"overview": ""})
df_cleaned = df.filter(
    (df.budget > 0) & 
    (df.revenue > 0) & 
    (df.vote_count > 0) & 
    (df.runtime > 0) & 
    (df.language.isNotNull()) & 
    (df.overview.isNotNull()) & 
    (df.tagline.isNotNull()) & 
    (df.cast_names.isNotNull()) & 
    (df.director_names.isNotNull()) & 
    (df.genres.isNotNull())
)


In [106]:
tokenizer = Tokenizer(inputCol="overview", outputCol="overview_tokens")
df = tokenizer.transform(df)


In [107]:
stopword_remover = StopWordsRemover(inputCol="overview_tokens", outputCol="filtered_overview")
df = stopword_remover.transform(df)


In [108]:
word2vec = Word2Vec(vectorSize=100, minCount=1, inputCol="filtered_overview", outputCol="overview_vector")
model = word2vec.fit(df)
df = model.transform(df)


Py4JJavaError: An error occurred while calling o874.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 725.0 failed 1 times, most recent failure: Lost task 0.0 in stage 725.0 (TID 11188) (07cc574957ad executor driver): org.apache.spark.SparkException: Failed to execute user defined function (Tokenizer$$Lambda$5789/0x0000000802172bd8: (string) => array<string>)
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:190)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage9.hashAgg_ScalaUDF_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage9.hashAgg_doAggregateWithKeysOutput_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage9.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:197)
	at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.lang.NullPointerException: Cannot invoke "String.toLowerCase()" because "x$1" is null
	at org.apache.spark.ml.feature.Tokenizer.$anonfun$createTransformFunc$1(Tokenizer.scala:40)
	... 22 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2238)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2259)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2278)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1021)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1020)
	at org.apache.spark.mllib.feature.Word2Vec.learnVocab(Word2Vec.scala:191)
	at org.apache.spark.mllib.feature.Word2Vec.fit(Word2Vec.scala:312)
	at org.apache.spark.ml.feature.Word2Vec.fit(Word2Vec.scala:182)
	at org.apache.spark.ml.feature.Word2Vec.fit(Word2Vec.scala:121)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function (Tokenizer$$Lambda$5789/0x0000000802172bd8: (string) => array<string>)
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:190)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage9.hashAgg_ScalaUDF_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage9.hashAgg_doAggregateWithKeysOutput_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage9.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:197)
	at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	... 1 more
Caused by: java.lang.NullPointerException
	at org.apache.spark.ml.feature.Tokenizer.$anonfun$createTransformFunc$1(Tokenizer.scala:40)
	... 22 more


In [84]:
df.head(1)

[Row(id=161495, budget=22000000, popularity=3.395867, revenue=12400000.0, vote_average=6.4, vote_count=7.0, date_id=19950301, language="[{'iso_639_1': 'en', 'name': 'English'}]", overview='', tagline=None, runtime=108.0, cast_names=['D. B. Sweeney', 'Peter Falk', 'Jan Rubes', 'Ellen Burstyn', 'Ernie Sabella', 'Julianne Moore'], director_names=['Peter Yates'], genres=['Comedy', 'Drama'], overview_tokens=[''], filtered_overview=[''], overview_vector=DenseVector([0.0028, 0.0023, 0.0007, 0.004, -0.0008, -0.0, -0.003, -0.0012, -0.0001, -0.0042, -0.0042, 0.0012, 0.0004, 0.0047, -0.0027, -0.0017, 0.0019, 0.0008, -0.0035, 0.002, 0.0017, 0.0021, -0.0017, -0.0001, 0.0, -0.0052, -0.0046, -0.0012, -0.0006, 0.002, 0.0018, -0.0049, 0.0008, -0.0021, -0.004, -0.004, -0.0057, 0.0029, 0.0004, -0.0024, 0.0007, 0.0058, -0.0041, 0.004, -0.0019, 0.0039, -0.0012, 0.0039, -0.0003, -0.0024, -0.0008, 0.0039, -0.0013, 0.0045, 0.0008, -0.003, 0.0046, -0.0029, -0.0042, -0.0015, -0.006, 0.0011, -0.0039, -0.0046, -0

In [151]:
dfs = spark.sql("""
        select distinct movie_id from movie_crew
""")
dfs.count()

4023

In [152]:
dfs = spark.sql("""
        select distinct id from fact_movies
""")
dfs.count()

5404

In [154]:
dd = spark.sql("""
    select distinct fact_movies.id from fact_movies join movie_cast on id = movie_id
""")

dd.count()

1524

In [52]:
dfs = spark.sql("""
        select * from movie_cast where movie_id = 223176
""")
dfs.head(10)

[Row(cast_id=238486, character='Alexey Grekov, nicknamed "Greek"', movie_id=223176),
 Row(cast_id=262929, character='Anna', movie_id=223176),
 Row(cast_id=99308, character='Director', movie_id=223176),
 Row(cast_id=1191678, character='Crucian', movie_id=223176),
 Row(cast_id=1208670, character='Lola', movie_id=223176),
 Row(cast_id=1208674, character='', movie_id=223176),
 Row(cast_id=30405, character='Schota', movie_id=223176),
 Row(cast_id=2669793, character='', movie_id=223176),
 Row(cast_id=2669794, character='', movie_id=223176),
 Row(cast_id=663595, character='', movie_id=223176)]