In [0]:
#Libraries management
import dlt
# from pyspark import pipelines as dp
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:
# Set the session's current catalog and schema with SQL `USE` statements so
# that table names referenced later in this notebook resolve against
# `workspace`.`imdb_data_analysis` by default.
# NOTE(review): assumes the pipeline runtime permits USE CATALOG / USE SCHEMA
# in pipeline source code — confirm for the target Databricks channel.
spark.sql("USE CATALOG `workspace`")
spark.sql("USE SCHEMA `imdb_data_analysis`")

## Load BRIDGE_TITLE_GENRE

In [0]:
# =============================================================================
# GOLD LAYER: bridge_title_genre
# =============================================================================
# Description: Bridge table linking titles to genres (many-to-many)
# Source: silver_imdb_title_basics (exploded genre) + dim_title
# Grain: One row per title-genre combination
# =============================================================================

from pyspark.sql.functions import (
    col, lit, current_timestamp, monotonically_increasing_id
)

@dlt.table(
    name="bridge_title_genre",
    comment="Gold layer: Bridge table linking titles to genres",
    table_properties={
        "quality": "gold",
        "layer": "gold",
        "domain": "imdb"
    }
)
@dlt.expect_or_drop("valid_title_key", "title_key IS NOT NULL")
@dlt.expect_or_drop("valid_genre_name", "genre_name IS NOT NULL")
def gold_bridge_title_genre():
    """Build the title-to-genre bridge table.

    Reads ``LIVE.silver_imdb_title_basics``, keeps the distinct
    (tconst, genre) pairs, discards pairs whose genre is null, "Unknown" or
    empty, and inner-joins them to ``LIVE.dim_title`` on ``tconst`` to pick
    up ``title_key``. Titles without a match in ``dim_title`` are dropped by
    the inner join.

    Returns:
        DataFrame with columns ``title_genre_key``, ``title_key``,
        ``genre_name``, ``source_system`` and ``etl_load_timestamp``.
    """
    # tconst -> title_key lookup; tconst is aliased so the join condition
    # below can reference each side unambiguously.
    title_lookup = spark.read.table("LIVE.dim_title").select(
        col("tconst").alias("title_tconst"),
        col("title_key"),
    )

    # Distinct (tconst, genre) pairs with placeholder genres removed.
    genre_pairs = (
        spark.read.table("LIVE.silver_imdb_title_basics")
        .select("tconst", "genre")
        .distinct()
        .filter(
            col("genre").isNotNull()
            & (col("genre") != "Unknown")
            & (col("genre") != "")
        )
    )

    keyed_pairs = (
        genre_pairs
        .join(
            title_lookup,
            genre_pairs["tconst"] == title_lookup["title_tconst"],
            "inner",
        )
        .drop("title_tconst", "tconst")
        .withColumnRenamed("genre", "genre_name")
        # Surrogate key: monotonically_increasing_id() yields unique,
        # increasing 64-bit ids (not consecutive); +1 shifts the lowest
        # possible value from 0 to 1.
        .withColumn("title_genre_key", monotonically_increasing_id() + 1)
        # Metadata / audit columns.
        .withColumn("source_system", lit("silver_imdb_title_basics"))
        .withColumn("etl_load_timestamp", current_timestamp())
    )

    # Project the columns in the published schema order.
    return keyed_pairs.select(
        "title_genre_key",
        "title_key",
        "genre_name",
        "source_system",
        "etl_load_timestamp",
    )

## Load BRIDGE_TITLE_CHARACTERS

In [0]:
# =============================================================================
# GOLD LAYER: bridge_title_characters
# =============================================================================
# Description: Bridge table linking titles to characters played by actors
# Source: silver_imdb_title_principals (filtered for character data only)
# Grain: One row per title-person-character combination
# =============================================================================

from pyspark.sql.functions import (
    col, lit, current_timestamp, monotonically_increasing_id
)

@dlt.table(
    name="bridge_title_characters",
    comment="Gold layer: Bridge table linking titles to characters played by actors",
    table_properties={
        "quality": "gold",
        "layer": "gold",
        "domain": "imdb"
    }
)
@dlt.expect_or_drop("valid_title_key", "title_key IS NOT NULL")
@dlt.expect_or_drop("valid_person_key", "person_key IS NOT NULL")
@dlt.expect_or_drop("valid_characters", "characters IS NOT NULL")
def gold_bridge_title_characters():
    """Build the title-to-character bridge table.

    Reads ``LIVE.silver_imdb_title_principals``, keeps only rows whose
    ``characters`` value is non-null and neither "Unknown" nor empty, then
    inner-joins to ``LIVE.dim_title`` (on ``tconst``) and ``LIVE.dim_person``
    (on ``nconst``) to resolve ``title_key`` and ``person_key``. Rows without
    a match in either dimension are dropped by the inner joins.

    Returns:
        DataFrame with columns ``title_character_key``, ``title_key``,
        ``person_key``, ``ordering``, ``category``, ``characters``,
        ``source_system`` and ``etl_load_timestamp``.
    """
    # Dimension lookups; natural keys are aliased so join conditions can
    # reference each side unambiguously.
    title_lookup = spark.read.table("LIVE.dim_title").select(
        col("tconst").alias("title_tconst"),
        col("title_key"),
    )
    person_lookup = spark.read.table("LIVE.dim_person").select(
        col("nconst").alias("person_nconst"),
        col("person_key"),
    )

    # Principals rows that actually carry character information.
    has_character = (
        col("characters").isNotNull()
        & (col("characters") != "Unknown")
        & (col("characters") != "")
    )
    principals = spark.read.table("LIVE.silver_imdb_title_principals").filter(has_character)

    # Resolve title_key, then person_key, dropping the natural keys as we go.
    with_title = principals.join(
        title_lookup,
        principals["tconst"] == title_lookup["title_tconst"],
        "inner",
    ).drop("title_tconst", "tconst")

    with_person = with_title.join(
        person_lookup,
        with_title["nconst"] == person_lookup["person_nconst"],
        "inner",
    ).drop("person_nconst", "nconst")

    enriched = (
        with_person
        # Surrogate key: monotonically_increasing_id() yields unique,
        # increasing 64-bit ids (not consecutive); +1 shifts the lowest
        # possible value from 0 to 1.
        .withColumn("title_character_key", monotonically_increasing_id() + 1)
        # Metadata / audit columns.
        .withColumn("source_system", lit("silver_imdb_title_principals"))
        .withColumn("etl_load_timestamp", current_timestamp())
    )

    # Project the columns in the published schema order.
    return enriched.select(
        "title_character_key",
        "title_key",
        "person_key",
        "ordering",
        "category",
        "characters",
        "source_system",
        "etl_load_timestamp",
    )

## Load BRIDGE_PERSON_PROFESSION

In [0]:
# =============================================================================
# GOLD LAYER: bridge_person_profession
# =============================================================================
# Description: Bridge table linking persons to their professions on titles
# Source: silver_imdb_title_principals + silver_imdb_title_crew + dim_title + dim_person
# Grain: One row per person-title-profession combination
# =============================================================================

from pyspark.sql.functions import (
    col, lit, current_timestamp, monotonically_increasing_id, row_number
)
from pyspark.sql.window import Window

@dlt.table(
    name="bridge_person_profession",
    comment="Gold layer: Bridge table linking persons to their professions on titles",
    table_properties={
        "quality": "gold",
        "layer": "gold",
        "domain": "imdb"
    }
)
@dlt.expect_or_drop("valid_person_key", "person_key IS NOT NULL")
@dlt.expect_or_drop("valid_title_key", "title_key IS NOT NULL")
@dlt.expect_or_drop("valid_profession_name", "profession_name IS NOT NULL")
def gold_bridge_person_profession():
    """Build the person-to-profession bridge table.

    Unions profession rows from ``LIVE.silver_imdb_title_principals``
    (``category`` as the profession, with its ``job``) and
    ``LIVE.silver_imdb_title_crew`` (``role`` as the profession, ``job``
    fixed to "Unknown"), keeps exactly one row per
    (tconst, nconst, profession), and inner-joins to ``LIVE.dim_title`` and
    ``LIVE.dim_person`` to resolve ``title_key`` and ``person_key``. Rows
    without a match in either dimension are dropped by the inner joins.

    The surviving row per group is chosen deterministically:
      1. principals rows win over crew rows (``source_priority``),
      2. then rows with an informative ``job`` win over null / "Unknown" /
         empty ones,
      3. then the lexicographically smallest ``job`` wins.

    Returns:
        DataFrame with columns ``person_profession_key``, ``person_key``,
        ``title_key``, ``profession_name``, ``job``, ``source_system`` and
        ``etl_load_timestamp``.
    """
    # -------------------------------------------------------------------------
    # Read dimension tables for key lookups
    # -------------------------------------------------------------------------
    df_title = spark.read.table("LIVE.dim_title").select(
        col("tconst").alias("title_tconst"),
        col("title_key")
    )

    df_person = spark.read.table("LIVE.dim_person").select(
        col("nconst").alias("person_nconst"),
        col("person_key")
    )

    # -------------------------------------------------------------------------
    # Source 1: title.principals (has job details)
    # -------------------------------------------------------------------------
    df_principals = spark.read.table("LIVE.silver_imdb_title_principals").select(
        col("tconst"),
        col("nconst"),
        col("category").alias("profession"),
        col("job"),
        lit(1).alias("source_priority")  # Higher priority (has job info)
    )

    # -------------------------------------------------------------------------
    # Source 2: title.crew (no job column; job is filled with "Unknown")
    # -------------------------------------------------------------------------
    df_crew = spark.read.table("LIVE.silver_imdb_title_crew").select(
        col("tconst"),
        col("nconst"),
        col("role").alias("profession"),
        lit("Unknown").alias("job"),
        lit(0).alias("source_priority")  # Lower priority (no job info)
    )

    # -------------------------------------------------------------------------
    # Union both sources
    # -------------------------------------------------------------------------
    df_merged = df_principals.unionByName(df_crew)

    # -------------------------------------------------------------------------
    # Deduplicate: one row per (tconst, nconst, profession)
    # -------------------------------------------------------------------------
    # Ordering by source_priority alone leaves ties whenever principals holds
    # several rows for the same person/title/profession with different jobs;
    # row_number() would then keep an arbitrary one and the result could vary
    # between refreshes. The extra sort keys make the choice deterministic and
    # favour rows that carry a real job value.
    # The expression is never NULL: isNull() is always true/false, and when it
    # is false isin() evaluates on a non-null value.
    job_is_placeholder = col("job").isNull() | col("job").isin("Unknown", "")

    window_spec = Window.partitionBy("tconst", "nconst", "profession").orderBy(
        col("source_priority").desc(),  # principals before crew
        job_is_placeholder.asc(),       # False (real job) sorts before True
        col("job").asc_nulls_last()     # stable final tie-breaker
    )

    df_dedup = df_merged.withColumn("row_num", row_number().over(window_spec))
    df_dedup = df_dedup.filter(col("row_num") == 1).drop("row_num", "source_priority")

    # -------------------------------------------------------------------------
    # Join with dim_title
    # -------------------------------------------------------------------------
    df_dedup = df_dedup.join(
        df_title,
        df_dedup["tconst"] == df_title["title_tconst"],
        "inner"
    ).drop("title_tconst", "tconst")

    # -------------------------------------------------------------------------
    # Join with dim_person
    # -------------------------------------------------------------------------
    df_dedup = df_dedup.join(
        df_person,
        df_dedup["nconst"] == df_person["person_nconst"],
        "inner"
    ).drop("person_nconst", "nconst")

    # -------------------------------------------------------------------------
    # Rename columns
    # -------------------------------------------------------------------------
    df_dedup = df_dedup.withColumnRenamed("profession", "profession_name")

    # -------------------------------------------------------------------------
    # Generate surrogate key
    # -------------------------------------------------------------------------
    # monotonically_increasing_id() yields unique, increasing 64-bit ids that
    # are not consecutive; +1 shifts the lowest possible value from 0 to 1.
    df_dedup = df_dedup.withColumn(
        "person_profession_key",
        monotonically_increasing_id() + 1
    )

    # -------------------------------------------------------------------------
    # Add metadata and audit columns
    # -------------------------------------------------------------------------
    df_dedup = df_dedup.withColumn("source_system", lit("silver_imdb_title_principals,silver_imdb_title_crew"))
    df_dedup = df_dedup.withColumn("etl_load_timestamp", current_timestamp())

    # -------------------------------------------------------------------------
    # Select final columns in schema order
    # -------------------------------------------------------------------------
    df_final = df_dedup.select(
        "person_profession_key",
        "person_key",
        "title_key",
        "profession_name",
        "job",
        "source_system",
        "etl_load_timestamp"
    )

    return df_final