In [1]:
import os

from delta.tables import DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, ArrayType, DateType
from pyspark.sql.functions import (
    col, from_json, explode, to_date, date_format,
    dayofweek, dayofmonth, dayofyear, weekofyear,
    month, quarter, year, when, unix_timestamp
)
from pyspark.sql.types import MapType
from pyspark.sql.functions import from_json, col, explode, map_keys, map_values, expr
from pyspark.sql.types import MapType, StringType


In [2]:
# Build (or reuse) the SparkSession used by every cell below: Delta Lake
# extensions enabled, S3A pointed at a MinIO endpoint, warehouse at s3a://lakehouse/.
#
# MinIO connection settings are read from the environment when present so that
# credentials do not have to live in the notebook. The literal fallbacks are
# the previously hard-coded values, kept so existing local runs behave the same.
# NOTE(review): drop the credential fallbacks once MINIO_ACCESS_KEY /
# MINIO_SECRET_KEY are provisioned in the runtime environment.
minio_endpoint = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "conbo123")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "123conbo")

# Jars handed to Spark via spark.jars (paths relative to the working directory).
spark_jars = ",".join([
    "jars/hadoop-aws-3.3.4.jar",
    "jars/spark-sql-kafka-0-10_2.12-3.2.1.jar",
    "jars/aws-java-sdk-bundle-1.12.262.jar",
    "jars/delta-core_2.12-2.2.0.jar",
    "jars/delta-storage-2.2.0.jar",
])

spark = (
    SparkSession.builder
    .appName("MinIO with Delta Lake")
    .config("spark.jars", spark_jars)
    # S3A filesystem -> MinIO (path-style addressing instead of virtual-host buckets).
    .config("spark.hadoop.fs.s3a.endpoint", minio_endpoint)
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Delta Lake SQL extension and catalog.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
    # NOTE(review): key kept verbatim from the original; confirm it is a
    # setting that Spark/Delta actually reads.
    .config("delta.enable-non-concurrent-writes", "true")
    .config('spark.sql.warehouse.dir', "s3a://lakehouse/")
    .getOrCreate()
)

In [4]:
# Load the silver-layer credit table (Delta format) from the lakehouse bucket.
df_dim_crew = (
    spark.read
    .format("delta")
    .load("s3a://lakehouse/silver/credit")
)

# Schema describing a JSON array of crew records; every field is nullable.
_crew_field_types = [
    ("credit_id", StringType()),
    ("department", StringType()),
    ("gender", IntegerType()),
    ("id", IntegerType()),
    ("job", StringType()),
    ("name", StringType()),
    ("profile_path", StringType()),
]
crew_schema = ArrayType(
    StructType([
        StructField(field_name, field_type, True)
        for field_name, field_type in _crew_field_types
    ])
)


# Step 1: parse the JSON text in `crew` into a map<string, string>.
# The previous version of this line referenced `df_dim_cast` and `cast_schema`,
# neither of which is defined in this notebook, so the cell failed on a fresh
# kernel. It also parsed `cast`, leaving `crew` a plain string that
# `map_entries` below cannot accept.
# NOTE(review): this reproduces the schemas recorded in the printSchema outputs
# (crew: map<string,string>; crew_key / crew_value: arrays). `crew_schema`
# (array of structs) is not applied here. If one row per crew member is the
# goal, parse with `crew_schema` and `explode` instead - confirm the intent.
df_parsed = df_dim_crew.withColumn(
    "crew",
    from_json(col("crew"), MapType(StringType(), StringType())),
)

# Step 2: turn the map into an array of (key, value) structs.
crew_entries = df_parsed.withColumn("crew", expr("map_entries(crew)"))

# Step 3: pull the keys and the values out into two parallel array columns and
# drop the intermediate `crew` column. No explode happens here: the row count
# is unchanged and each row keeps all of its keys/values as arrays.
df_exploded = (
    crew_entries
    .withColumn("crew_key", col("crew.key"))
    .withColumn("crew_value", col("crew.value"))
    .drop("crew")
)

# TODO: select the required fields (this step was left unimplemented).


In [6]:
# Row count after the key/value split. The recorded result (42708) equals the
# counts recorded for df_parsed and df_dim_crew, i.e. no rows were added.
df_exploded.count()

42708

In [7]:
# Row count of the parsed frame (withColumn does not add or remove rows).
df_parsed.count()

42708

In [5]:
# Baseline row count of the silver credit table as loaded.
df_dim_crew.count()

42708

In [8]:
# Inspect the column types of the parsed frame.
df_parsed.printSchema()


root
 |-- cast: string (nullable = true)
 |-- crew: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- id: long (nullable = true)



In [9]:
# Inspect the column types after splitting `crew` into crew_key / crew_value.
df_exploded.printSchema()

root
 |-- cast: string (nullable = true)
 |-- id: long (nullable = true)
 |-- crew_key: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- crew_value: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [48]:
# Load the gold-layer dim_crew Delta table into the generic name `df`.
# NOTE(review): a later cell selects budget / popularity / revenue / vote_* /
# date_id from `df`; confirm this table really exposes those columns, or that
# `df` was meant to come from a different source when that cell ran.
df = spark.read.format("delta").load("s3a://lakehouse/gold/dim_crew")

In [10]:
# Inspect the raw silver credit schema (before any JSON parsing).
df_dim_crew.printSchema()


root
 |-- cast: string (nullable = true)
 |-- crew: string (nullable = true)
 |-- id: long (nullable = true)



In [None]:
# Peek at the first 50 rows of `df` (returned to the driver as a list of Rows).
df.head(50)

In [5]:
# Build the fact rows from `df`: the id, its numeric measures and the date key,
# reduced to a single row per `id`.
# NOTE(review): `df` is whatever an earlier cell bound it to; the only visible
# assignment loads gold/dim_crew - confirm it carries these columns.
fact_movie_df = df.select(
    col("id"),
    col("budget"),
    col("popularity"),
    col("revenue"),
    col("vote_average"),
    col("vote_count"),
    col("date_id")
).dropDuplicates(["id"])

fact_movies_path = "s3a://lakehouse/gold/fact_movies"

# Insert-only upsert into the gold fact table.
# The previous version wrapped the merge in a bare `except:` and fell back to
# an overwrite on ANY error (network failure, schema mismatch, interrupt),
# which could silently replace an existing table with just this batch.
# Now the table's existence is checked explicitly and merge errors propagate.
if DeltaTable.isDeltaTable(spark, fact_movies_path):
    fact_movie = DeltaTable.forPath(spark, fact_movies_path)

    # Rows whose id already exists in the target are left untouched; only
    # ids not yet present are inserted.
    fact_movie.alias("target").merge(
        fact_movie_df.alias("source"),
        "target.id = source.id"
    ).whenNotMatchedInsertAll().execute()
else:
    # First load: no Delta table at the path yet, so create it from this batch.
    fact_movie_df.write.format("delta").mode("overwrite").save(fact_movies_path)
