In [0]:
#Libraries management
import dlt
# from pyspark import pipelines as dp
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:
# Make `workspace`.`imdb_data_analysis` the current catalog and schema for this
# session. These names are repeated as `catalog` / `schema` in the environment
# setup cell below, so keep the two places in sync.
spark.sql("USE CATALOG `workspace`")
spark.sql("USE SCHEMA `imdb_data_analysis`")

In [0]:
# Environment setup: Unity Catalog coordinates of the source dataset
catalog = "workspace"
schema = "imdb_data_analysis"
volume = "datastore"
file_name = "title.akas"

# Paths
# Location of the dataset inside the volume:
#   /Volumes/<catalog>/<schema>/<volume>/<file_name>
path_volume = "/Volumes/{}/{}/{}/{}".format(catalog, schema, volume, file_name)
# TSV file located underneath path_volume: <path_volume>/<file_name>.tsv
volume_path = path_volume + "/" + file_name + ".tsv"

In [0]:
@dlt.table(
    name="bronze_imdb_title_akas",
    comment="Bronze layer: Raw IMDB Titles & its region, language, channels ingested from TSV files in Unity Catalog Volume"
)
def bronze_imdb_title_akas():
    """Stream the raw title.akas files into the bronze table with Auto Loader.

    Reads tab-delimited files with a header row from ``path_volume`` and appends
    three audit columns: ``bronze_load_timestamp``, ``bronze_load_date`` and
    ``source_system`` (constant ``"IMDB_TSV"``).

    Returns:
        A streaming DataFrame with the source columns plus the audit columns.
    """
    return (
        spark.readStream
            .format("cloudFiles")
            .option("cloudFiles.format", "csv")
            .option("delimiter", "\t")
            .option("header", "true")
            # No cloudFiles.schemaLocation is configured: inside a DLT pipeline the
            # Auto Loader schema and checkpoint directories are managed by the
            # pipeline. Pointing it at the input directory would write schema
            # tracking files into the source data folder, and a manually
            # configured location is not reset by a full refresh.
            # NOTE(review): the CSV reader treats '"' as a quote character by
            # default; if titles can contain literal double quotes, consider
            # .option("quote", "") - confirm against the source files.
            .load(path_volume)
            .withColumn("bronze_load_timestamp", current_timestamp())
            .withColumn("bronze_load_date", current_date())
            .withColumn("source_system", lit("IMDB_TSV"))
    )

In [0]:
@dlt.table(
    name="silver_imdb_title_akas",
    comment="Silver layer: IMDB Titles & its region, language, channels data streamed from Bronze layer, clean and transform"
)
@dlt.expect_or_drop("titleId_check", "titleId IS NOT NULL")
@dlt.expect_or_drop("ordering_check", "CAST(ordering AS INT) > 0")
def silver_imdb_title_akas():
    """Clean the bronze title.akas stream into the silver table.

    Transformations applied to the stream read from ``bronze_imdb_title_akas``:
      * ``ordering`` is cast to integer.
      * ``region``, ``language`` and ``types``: NULL or the literal ``\\N``
        marker becomes ``"Unknown"``.
      * ``isOriginalTitle``: NULL or the ``\\N`` marker becomes ``-1``; any
        other value is cast to integer.
      * ``silver_load_timestamp`` is added and ``source_system`` is overwritten
        with ``"silver_imdb_title_akas"``.

    The expectations above drop rows whose ``titleId`` is NULL or whose
    ``ordering`` is not greater than zero.

    Returns:
        The cleaned streaming DataFrame.
    """
    missing_marker = "\\N"  # backslash + N, the marker already handled for the text columns

    def _is_missing(column_name):
        # True when the value is NULL or the literal missing marker.
        return col(column_name).isNull() | (col(column_name) == missing_marker)

    df = dlt.read_stream("bronze_imdb_title_akas")
    df = df.withColumn("ordering", col("ordering").cast(IntegerType()))

    # Same rule for each descriptive text column: missing -> "Unknown".
    for column_name in ("region", "language", "types"):
        df = df.withColumn(
            column_name,
            when(_is_missing(column_name), "Unknown").otherwise(col(column_name)),
        )

    # The marker is handled here too, so it maps to the -1 sentinel rather than
    # turning into NULL (or raising under ANSI mode) when cast to int.
    df = df.withColumn(
        "isOriginalTitle",
        when(_is_missing("isOriginalTitle"), lit(-1))
        .otherwise(col("isOriginalTitle").cast("int")),
    )

    df = df.withColumn("silver_load_timestamp", current_timestamp())
    df = df.withColumn("source_system", lit("silver_imdb_title_akas"))
    return df

