In [0]:
#Libraries management
import dlt
# from pyspark import pipelines as dp
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:
# Point the session at the catalog and schema used by the rest of the
# notebook. Order matters: the catalog is selected before the schema.
for context_statement in (
    "USE CATALOG `workspace`",
    "USE SCHEMA `imdb_data_analysis`",
):
    spark.sql(context_statement)

## Load FACT_REGIONAL_RELEASE

In [0]:
# =============================================================================
# GOLD LAYER: fact_regional_release
# =============================================================================
# Description: Fact table for title releases across regions and languages
# Source: silver_imdb_title_akas + dim_title + dim_region + dim_language
# Grain: One row per title-region-language-ordering combination
# =============================================================================

from pyspark.sql.functions import (
    col, lit, current_timestamp, upper, lower, trim, coalesce, monotonically_increasing_id
)

@dlt.table(
    name="fact_regional_release",
    comment="Gold layer: Fact table for title releases across regions and languages",
    table_properties={
        "quality": "gold",
        "layer": "gold",
        "domain": "imdb"
    }
)
@dlt.expect_or_drop("valid_title_key", "title_key IS NOT NULL")
def gold_fact_regional_release():
    """Build the ``fact_regional_release`` gold table.

    Reads ``LIVE.silver_imdb_title_akas`` and resolves the dimension keys:

    * ``title_key``: inner join to ``LIVE.dim_title`` on ``titleId == tconst``,
      so rows without a matching title are removed by the join itself.
    * ``region_key`` / ``language_key``: left joins to ``LIVE.dim_region`` and
      ``LIVE.dim_language`` on the trimmed, case-normalised codes. Rows with no
      match are kept and receive ``-1`` (Unknown member).

    The surrogate key ``regional_release_key`` is a 64-bit hash of the grain
    columns (title id, ordering, normalised region, normalised language), so a
    given row gets the same key on every pipeline refresh.

    Returns:
        DataFrame with the columns ``regional_release_key``, ``title_key``,
        ``region_key``, ``language_key``, ``ordering``, ``localized_title``,
        ``release_types``, ``is_original_title``, ``source_system`` and
        ``etl_load_timestamp``.
    """
    # Imported locally so this definition does not rely on the notebook-wide
    # wildcard import for a name the cell-level import list does not include.
    from pyspark.sql.functions import xxhash64

    # -------------------------------------------------------------------------
    # Read from Silver layer
    # -------------------------------------------------------------------------
    akas = spark.read.table("LIVE.silver_imdb_title_akas")

    # -------------------------------------------------------------------------
    # Read dimension tables for key lookups
    # (lookup columns are aliased so they cannot clash with fact columns)
    # -------------------------------------------------------------------------
    title_lookup = spark.read.table("LIVE.dim_title").select(
        col("tconst").alias("title_tconst"),
        col("title_key")
    )

    region_lookup = spark.read.table("LIVE.dim_region").select(
        col("region_code").alias("region_lookup_code"),
        col("region_key")
    )

    language_lookup = spark.read.table("LIVE.dim_language").select(
        col("language_code").alias("language_lookup_code"),
        col("language_key")
    )

    # -------------------------------------------------------------------------
    # Prepare join keys (normalize case)
    # -------------------------------------------------------------------------
    akas = (
        akas
        .withColumn("region_upper", upper(trim(col("region"))))
        .withColumn("language_lower", lower(trim(col("language"))))
    )

    # -------------------------------------------------------------------------
    # Join with dim_title (inner join - a release must belong to a known title)
    # -------------------------------------------------------------------------
    with_title = akas.join(
        title_lookup,
        akas["titleId"] == title_lookup["title_tconst"],
        "inner"
    ).drop("title_tconst")

    # -------------------------------------------------------------------------
    # Join with dim_region (left join - keep records even if region not found)
    # -------------------------------------------------------------------------
    with_region = with_title.join(
        region_lookup,
        with_title["region_upper"] == region_lookup["region_lookup_code"],
        "left"
    ).drop("region_lookup_code")

    # -------------------------------------------------------------------------
    # Join with dim_language (left join - keep records even if language not found)
    # -------------------------------------------------------------------------
    with_language = with_region.join(
        language_lookup,
        with_region["language_lower"] == language_lookup["language_lookup_code"],
        "left"
    ).drop("language_lookup_code")

    # -------------------------------------------------------------------------
    # Handle NULL keys - replace with -1 (Unknown member)
    # -------------------------------------------------------------------------
    keyed = (
        with_language
        .withColumn("region_key", coalesce(col("region_key"), lit(-1)))
        .withColumn("language_key", coalesce(col("language_key"), lit(-1)))
    )

    # -------------------------------------------------------------------------
    # Generate surrogate key
    # -------------------------------------------------------------------------
    # monotonically_increasing_id() encodes the partition id and the row's
    # position inside the partition, so the same release would get a different
    # key whenever the table is recomputed. Hashing the grain columns gives a
    # key that is reproducible across refreshes. The result is a signed 64-bit
    # value (it may be negative); NULL inputs are skipped by the hash.
    keyed = keyed.withColumn(
        "regional_release_key",
        xxhash64(
            col("titleId"),
            col("ordering"),
            col("region_upper"),
            col("language_lower")
        )
    )

    # -------------------------------------------------------------------------
    # Rename columns
    # -------------------------------------------------------------------------
    renamed = (
        keyed
        .withColumnRenamed("title", "localized_title")
        .withColumnRenamed("types", "release_types")
        .withColumnRenamed("isOriginalTitle", "is_original_title")
    )

    # -------------------------------------------------------------------------
    # Add metadata and audit columns
    # -------------------------------------------------------------------------
    audited = (
        renamed
        .withColumn("source_system", lit("silver_imdb_title_akas"))
        .withColumn("etl_load_timestamp", current_timestamp())
    )

    # -------------------------------------------------------------------------
    # Select final columns in schema order
    # -------------------------------------------------------------------------
    df_final = audited.select(
        "regional_release_key",
        "title_key",
        "region_key",
        "language_key",
        "ordering",
        "localized_title",
        "release_types",
        "is_original_title",
        "source_system",
        "etl_load_timestamp"
    )

    return df_final

## Load FACT_TITLE_RATINGS

In [0]:
# =============================================================================
# GOLD LAYER: fact_title_ratings
# =============================================================================
# Description: Fact table for title ratings and vote counts
# Source: silver_imdb_title_ratings + dim_title
# Grain: One row per rated title (tconst)
# =============================================================================

from pyspark.sql.functions import (
    col, lit, current_timestamp, monotonically_increasing_id
)

@dlt.table(
    name="fact_title_ratings",
    comment="Gold layer: Fact table for title ratings and vote counts",
    table_properties={
        "quality": "gold",
        "layer": "gold",
        "domain": "imdb"
    }
)
@dlt.expect_or_drop("valid_title_key", "title_key IS NOT NULL")
@dlt.expect_or_drop("valid_rating", "rating IS NOT NULL")
@dlt.expect_or_drop("valid_vote_count", "vote_count IS NOT NULL")
def gold_fact_title_ratings():
    """Build the ``fact_title_ratings`` gold table.

    Reads ``LIVE.silver_imdb_title_ratings`` and resolves ``title_key`` with an
    inner join to ``LIVE.dim_title`` on ``tconst``, so ratings for titles that
    are absent from the dimension are removed by the join. The expectations on
    the table additionally drop rows whose ``title_key``, ``rating`` or
    ``vote_count`` is NULL.

    The surrogate key ``title_rating_key`` is a 64-bit hash of ``tconst`` (the
    grain of this table), so a given title keeps the same key on every
    pipeline refresh.

    Returns:
        DataFrame with the columns ``title_rating_key``, ``title_key``,
        ``rating``, ``vote_count``, ``source_system`` and
        ``etl_load_timestamp``.
    """
    # Imported locally so this definition does not rely on the notebook-wide
    # wildcard import for a name the cell-level import list does not include.
    from pyspark.sql.functions import xxhash64

    # -------------------------------------------------------------------------
    # Read from Silver layer
    # -------------------------------------------------------------------------
    ratings = spark.read.table("LIVE.silver_imdb_title_ratings")

    # -------------------------------------------------------------------------
    # Read dim_title for key lookup
    # (tconst is aliased so the join does not produce two tconst columns)
    # -------------------------------------------------------------------------
    title_lookup = spark.read.table("LIVE.dim_title").select(
        col("tconst").alias("title_tconst"),
        col("title_key")
    )

    # -------------------------------------------------------------------------
    # Join with dim_title
    # -------------------------------------------------------------------------
    with_title = ratings.join(
        title_lookup,
        ratings["tconst"] == title_lookup["title_tconst"],
        "inner"
    ).drop("title_tconst")

    # -------------------------------------------------------------------------
    # Generate surrogate key
    # -------------------------------------------------------------------------
    # monotonically_increasing_id() is neither consecutive nor repeatable: it
    # encodes the partition id and the row's position inside the partition, so
    # keys would change whenever the table is recomputed. Hashing the grain
    # column gives a key that is reproducible across refreshes. The result is
    # a signed 64-bit value (it may be negative).
    keyed = with_title.withColumn(
        "title_rating_key",
        xxhash64(col("tconst"))
    )

    # -------------------------------------------------------------------------
    # Add metadata and audit columns
    # -------------------------------------------------------------------------
    audited = (
        keyed
        .withColumn("source_system", lit("silver_imdb_title_ratings"))
        .withColumn("etl_load_timestamp", current_timestamp())
    )

    # -------------------------------------------------------------------------
    # Select final columns in schema order
    # -------------------------------------------------------------------------
    df_final = audited.select(
        "title_rating_key",
        "title_key",
        "rating",
        "vote_count",
        "source_system",
        "etl_load_timestamp"
    )

    return df_final