In [0]:
#Libraries management
import dlt
# from pyspark import pipelines as dp
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:
# Select the default catalog and schema for this session so that unqualified
# table names used later resolve inside `workspace`.`imdb_data_analysis`.
for _use_stmt in ("USE CATALOG `workspace`", "USE SCHEMA `imdb_data_analysis`"):
    spark.sql(_use_stmt)

In [0]:
# Environment setup: Unity Catalog coordinates of the source data.
catalog, schema, volume = "workspace", "imdb_data_analysis", "datastore"
file_name = "title.basics"

# Derived locations.
# path_volume -> the dataset's folder inside the volume (named after file_name).
path_volume = "/".join(["/Volumes", catalog, schema, volume, file_name])
# volume_path -> the TSV file inside that folder.
volume_path = path_volume + "/" + file_name + ".tsv"

In [0]:
@dlt.table(
    name="bronze_imdb_title_basics",
    comment="Bronze layer: Raw IMDB Titles ingested from TSV files in Unity Catalog Volume"
)
def bronze_imdb_title_basics():
    """Stream the files under ``path_volume`` into the bronze table.

    The files are read with the ``cloudFiles`` streaming source as
    tab-delimited CSV with a header row. Two audit columns are appended:
    ``bronze_load_timestamp`` (``current_timestamp()``) and ``source_system``
    (the literal ``"IMDB_TSV"``).

    Returns:
        A streaming DataFrame with the source columns plus the audit columns.
    """
    reader_options = {
        "cloudFiles.format": "csv",
        "delimiter": "\t",
        "header": "true",
        "quote": "",    # disable quote parsing
        "escape": "",   # disable escape character
        "multiLine": "false",
        "mode": "PERMISSIVE",   # keep malformed rows
        # NOTE(review): intended to capture bad rows; confirm this column is
        # actually populated when the schema is inferred rather than declared.
        "columnNameOfCorruptRecord": "_corrupt_record",
        # NOTE(review): the schema-tracking folder sits inside the directory
        # being ingested -- confirm this is intended.
        "cloudFiles.schemaLocation": f"{path_volume}/_schema",
    }

    raw_titles = (
        spark.readStream
        .format("cloudFiles")
        .options(**reader_options)
        .load(path_volume)
    )

    # Audit columns describing when and from where the row was ingested.
    with_load_time = raw_titles.withColumn("bronze_load_timestamp", current_timestamp())
    return with_load_time.withColumn("source_system", lit("IMDB_TSV"))

In [0]:
@dlt.table(
    name="silver_imdb_title_basics",
    comment="Silver layer: Raw IMDB Titles data streamed from Bronze layer, clean and transform genres data"
)
@dlt.expect_or_drop("tconst_check", "tconst IS NOT NULL")
@dlt.expect_or_drop("type_check", "titleType IS NOT NULL")
def silver_imdb_title_basics():
    r"""Clean the bronze title stream and emit one row per (title, genre).

    Transformations applied to ``bronze_imdb_title_basics``:

    * ``title_category``: "TV" when ``titleType`` starts with "tv", otherwise "Movie".
    * ``isAdult``: "1" -> 1, "0" -> 0, anything else (including ``\N`` and NULL) -> -1.
    * ``startYear`` / ``endYear``: ``\N`` -> 9999, then cast to int.
    * ``runtimeMinutes``: ``\N`` -> -1, then cast to int.
    * ``genres``: ``\N`` or NULL -> "Unknown"; the comma-separated list is then
      exploded into a single ``genre`` column, so ``tconst`` repeats once per genre.
    * Audit columns ``silver_load_timestamp`` and ``source_system`` are appended.

    Rows whose ``tconst`` or ``titleType`` is NULL are dropped by the
    expectations declared on this table.

    Returns:
        A streaming DataFrame with columns tconst, title_category, titleType,
        primaryTitle, originalTitle, isAdult, startYear, endYear,
        runtimeMinutes, genre, silver_load_timestamp, source_system.
    """

    def _int_or_sentinel(column_name, sentinel):
        # Replace the "\N" missing-value marker with `sentinel`, then cast the
        # whole expression to int (same expression shape for every column).
        raw = col(column_name)
        return when(raw == "\\N", sentinel).otherwise(raw).cast("int")

    # Anything that is not exactly "0" or "1" (the "\N" marker, NULL, or any
    # other value) maps to -1.
    is_adult = (
        when(col("isAdult") == "1", lit(1))
        .when(col("isAdult") == "0", lit(0))
        .otherwise(lit(-1))
    )

    # NULL genres must be normalised together with "\N": split() of NULL is a
    # NULL array and explode() emits no row for it, which would silently drop
    # the title from the silver table.
    genres_clean = (
        when(col("genres").isNull() | (col("genres") == "\\N"), "Unknown")
        .otherwise(col("genres"))
    )

    return (
        dlt.read_stream("bronze_imdb_title_basics")
        .withColumn(
            "title_category",
            when(col("titleType").startswith("tv"), "TV").otherwise("Movie"),
        )
        .withColumn("isAdult", is_adult)
        .withColumn("startYear", _int_or_sentinel("startYear", 9999))
        .withColumn("endYear", _int_or_sentinel("endYear", 9999))
        .withColumn("runtimeMinutes", _int_or_sentinel("runtimeMinutes", -1))
        .withColumn("genres", genres_clean)
        # One output row per genre of each title.
        .withColumn("genre", explode(split(col("genres"), ",")))
        .select(
            "tconst", "title_category", "titleType", "primaryTitle", "originalTitle",
            "isAdult", "startYear", "endYear", "runtimeMinutes", "genre",
        )
        .withColumn("silver_load_timestamp", current_timestamp())
        .withColumn("source_system", lit("bronze_imdb_title_basics"))
    )

