In [0]:
# Library imports
import dlt
# from pyspark import pipelines as dp
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:
# Select the default catalog first, then the schema inside it, so that
# unqualified table names in this notebook resolve against them.
for _use_statement in (
    "USE CATALOG `workspace`",
    "USE SCHEMA `imdb_data_analysis`",
):
    spark.sql(_use_statement)

In [0]:
# Environment setup: Unity Catalog coordinates of the source dataset.
catalog = "workspace"
schema = "imdb_data_analysis"
volume = "datastore"
file_name = "title.episode"

# Paths
# Dataset directory inside the volume: /Volumes/<catalog>/<schema>/<volume>/<file_name>
path_volume = "/".join(("", "Volumes", catalog, schema, volume, file_name))
# TSV file inside that directory, named after the dataset.
volume_path = "{}/{}.tsv".format(path_volume, file_name)

In [0]:
@dlt.table(
    name="bronze_imdb_title_episode",
    comment="Bronze layer: Raw IMDB TV Series Titles mapped to Season and Episode Information data ingested from TSV files in Unity Catalog Volume",
)
def bronze_imdb_title_episode():
    """Stream the raw title.episode files into the bronze table.

    Reads tab-delimited files with a header row from ``path_volume`` through
    the ``cloudFiles`` streaming source and appends two audit columns:

    * ``bronze_load_timestamp`` - value of ``current_timestamp()``.
    * ``source_system`` - the constant ``"IMDB_TSV"``.

    Returns:
        The streaming DataFrame that backs the bronze table.
    """
    # NOTE(review): the schema location is the same directory the data files
    # are loaded from - confirm this is intended.
    reader_options = {
        "cloudFiles.format": "csv",
        "delimiter": "\t",
        "header": "true",
        "cloudFiles.schemaLocation": path_volume,
    }

    raw_stream = (
        spark.readStream
        .format("cloudFiles")
        .options(**reader_options)
        .load(path_volume)
    )

    stamped_stream = raw_stream.withColumn("bronze_load_timestamp", current_timestamp())
    return stamped_stream.withColumn("source_system", lit("IMDB_TSV"))

In [0]:
@dlt.table(
    name="silver_imdb_title_episode",
    comment="Silver layer: IMDB Titles ID and parent ID for TV Series Titles Season and Episode data ingested from Bronze Layer",
)
@dlt.expect_or_drop("tconst_check", "tconst IS NOT NULL")
@dlt.expect_or_drop("parenttconst_check", "parentTconst IS NOT NULL")
def silver_imdb_title_episode():
    """Build the silver episode table from the bronze stream.

    Steps:

    1. Stream from ``bronze_imdb_title_episode``.
    2. In ``seasonNumber`` and ``episodeNumber``, replace the literal ``\\N``
       marker with ``-1`` and cast the column to ``int``.
    3. Keep only ``tconst``, ``parentTconst``, ``seasonNumber`` and
       ``episodeNumber``.
    4. Append ``silver_load_timestamp`` (``current_timestamp()``) and
       ``source_system`` (constant ``"bronze_imdb_title_episode"``).

    The two expectations on this table require ``tconst`` and ``parentTconst``
    to be non-null.

    Returns:
        The streaming DataFrame that backs the silver table.
    """
    # NOTE(review): the expectations test for SQL NULL only; a literal "\N"
    # in tconst / parentTconst would still pass - confirm that is acceptable.
    episodes = dlt.read_stream("bronze_imdb_title_episode")

    # Both numeric fields get the same treatment: "\N" -> -1, then cast to int.
    for numeric_field in ("seasonNumber", "episodeNumber"):
        raw_value = col(numeric_field)
        episodes = episodes.withColumn(
            numeric_field,
            when(raw_value == "\\N", -1).otherwise(raw_value).cast("int"),
        )

    return (
        episodes
        .select("tconst", "parentTconst", "seasonNumber", "episodeNumber")
        .withColumn("silver_load_timestamp", current_timestamp())
        .withColumn("source_system", lit("bronze_imdb_title_episode"))
    )

