In [0]:
#Libraries management
import dlt
# from pyspark import pipelines as dp
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:
# Point the session at the target catalog first, then at the schema inside it.
for _use_stmt in ("USE CATALOG `workspace`", "USE SCHEMA `imdb_data_analysis`"):
    spark.sql(_use_stmt)

In [0]:
# Unity Catalog coordinates of the raw file drop
catalog = "workspace"
schema = "imdb_data_analysis"
volume = "datastore"
file_name = "title.principals"

# Derived locations:
#   path_volume -> folder inside the volume, named after the dataset
#   volume_path -> the .tsv file inside that folder
path_volume = "/".join(["/Volumes", catalog, schema, volume, file_name])
volume_path = "/".join([path_volume, file_name + ".tsv"])

In [0]:
@dlt.table(
    name="bronze_imdb_title_principals",
    comment="Bronze layer: Raw IMDB mapping Titles to Crew and Job Roles data ingested from TSV files in Unity Catalog Volume"
)
def bronze_imdb_title_principals():
    """Stream the raw title-to-principal TSV files into the bronze table.

    Auto Loader ("cloudFiles") incrementally reads tab-delimited files with a
    header row from ``path_volume`` and tags every row with load metadata.

    Returns:
        A streaming DataFrame holding the columns Auto Loader infers from the
        files plus ``bronze_load_timestamp``, ``bronze_load_date`` and
        ``source_system``.
    """
    raw_stream = (
        spark.readStream
            .format("cloudFiles")
            .option("cloudFiles.format", "csv")
            .option("delimiter", "\t")
            .option("header", "true")
            # cloudFiles.schemaLocation is intentionally not set: inside a
            # pipeline the schema-tracking directory is managed automatically.
            # Pointing it at the source folder (as before) wrote metadata next
            # to the data files and left it behind on a full refresh.
            .load(path_volume)
    )

    return (
        raw_stream
            .withColumn("bronze_load_timestamp", current_timestamp())
            .withColumn("bronze_load_date", current_date())
            .withColumn("source_system", lit("IMDB_TSV"))
    )

In [0]:
@dlt.table(
    name="silver_imdb_title_principals",
    comment="Silver layer: IMDB Titles to Crew Mappings with job roles data ingested from Bronze Layer"
)
@dlt.expect_or_drop("tconst_check", "tconst IS NOT NULL")
@dlt.expect_or_drop("nconst_check", "nconst IS NOT NULL")
@dlt.expect("ordering_check", "CAST(ordering AS INT) > 0")
def silver_imdb_title_principals():
    """Clean the bronze title-to-principal stream for the silver table.

    Transformations:
      * ``ordering`` is cast to int.
      * ``characters`` is unwrapped from its ``["..."]`` form; several roles
        (``["A","B"]``) become ``A, B``; NULL, empty or unparseable values
        become ``"Unknown"``.
      * Only ``tconst``, ``ordering``, ``nconst``, ``category`` and
        ``characters`` are kept, then ``silver_load_timestamp`` and
        ``source_system`` are appended.

    Rows with a NULL ``tconst`` or ``nconst`` are dropped by the expectations;
    a non-positive ``ordering`` is only recorded as a violation.

    Returns:
        A streaming DataFrame with the columns listed above.
    """
    bronze = dlt.read_stream("bronze_imdb_title_principals")

    # Strip the surrounding [" ... "]; the greedy group keeps everything in
    # between, so the inner "," separators are rewritten to ", " afterwards.
    roles = regexp_extract(col("characters"), '\\["(.+)"\\]', 1)
    roles = regexp_replace(roles, '","', ', ')

    # regexp_extract yields "" when nothing matches and NULL for NULL input;
    # both cases mean "no role recorded".
    characters_clean = when(roles.isNull() | (roles == ""), "Unknown").otherwise(roles)

    return (
        bronze
            .select(
                "tconst",
                col("ordering").cast("int").alias("ordering"),
                "nconst",
                "category",
                characters_clean.alias("characters"),
            )
            .withColumn("silver_load_timestamp", current_timestamp())
            .withColumn("source_system", lit("bronze_imdb_title_principals"))
    )