In [0]:
#Libraries management
import dlt
# from pyspark import pipelines as dp
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:
# Select the catalog and schema for this session. The identifiers are
# backtick-quoted literals; they are written out here rather than taken from
# the `catalog` / `schema` variables, which are only assigned in a later cell.
spark.sql("USE CATALOG `workspace`")
spark.sql("USE SCHEMA `imdb_data_analysis`")

In [0]:
# Unity Catalog coordinates of the volume holding the raw extract
catalog = "workspace"
schema = "imdb_data_analysis"
volume = "datastore"
file_name = "title.crew"

# Directory inside the volume that is dedicated to this dataset
path_volume = "/".join(["/Volumes", catalog, schema, volume, file_name])
# The TSV file expected inside that directory
volume_path = "/".join([path_volume, file_name + ".tsv"])

In [0]:
@dlt.table(
    name="bronze_imdb_title_crew",
    comment="Bronze layer: Raw IMDB Title director and writers data ingested from TSV files in Unity Catalog Volume"
)
def bronze_imdb_title_crew():
    """Incrementally ingest the raw title.crew TSV files with Auto Loader.

    Reads every file under ``path_volume`` as tab-delimited CSV with a header
    row and appends two audit columns.

    Returns:
        A streaming DataFrame containing the columns found in the source files
        plus ``bronze_load_timestamp`` (time of processing) and
        ``source_system`` (constant ``"IMDB_TSV"``).
    """
    # cloudFiles.schemaLocation is intentionally not set. Inside a pipeline the
    # Auto Loader schema and checkpoint directories are managed automatically;
    # a manual location (previously the ingested directory itself) writes
    # schema-tracking metadata next to the raw files and is not cleared by a
    # full refresh of the table.
    return (
        spark.readStream
            .format("cloudFiles")
            .option("cloudFiles.format", "csv")
            .option("delimiter", "\t")
            .option("header", "true")
            .load(path_volume)
            .withColumn("bronze_load_timestamp", current_timestamp())
            .withColumn("source_system", lit("IMDB_TSV"))
    )

In [0]:
@dlt.table(
    name="silver_imdb_title_crew",
    comment="Silver layer: IMDB Titles Director and Writers data - exploded to bridge table format"
)
@dlt.expect_or_drop("tconst_check", "tconst IS NOT NULL")
def silver_imdb_title_crew():
    """Turn the comma-separated crew lists into one row per (title, person, role).

    Streams from ``bronze_imdb_title_crew``. For each of the ``directors`` and
    ``writers`` columns, rows whose value is the literal ``\\N`` placeholder are
    skipped, the remaining value is split on commas and exploded into
    ``nconst``, and the row is tagged with the matching ``role``.

    Returns:
        A streaming DataFrame with columns ``tconst``, ``nconst``, ``role``,
        ``silver_load_timestamp`` and ``source_system``.
    """
    crew_stream = dlt.read_stream("bronze_imdb_title_crew")

    def _rows_for_role(id_list_column, role_label):
        # Drop the \N placeholder first so it is never exploded into an id.
        has_ids = col(id_list_column) != "\\N"
        return crew_stream.filter(has_ids).select(
            col("tconst"),
            explode(split(col(id_list_column), ",")).alias("nconst"),
            lit(role_label).alias("role"),
        )

    director_rows = _rows_for_role("directors", "director")
    writer_rows = _rows_for_role("writers", "writer")

    # Stack both role sets, then stamp lineage columns on the combined result.
    return (
        director_rows.unionByName(writer_rows)
            .withColumn("silver_load_timestamp", current_timestamp())
            .withColumn("source_system", lit("bronze_imdb_title_crew"))
    )