# 01_serving_build_gold_star_schema_job

Este job realiza a construção do esquema estrela da camada **Gold**, materializando as tabelas dimensionais e fato a partir do dataset `flights_aggregated.parquet`.


In [20]:
# Parameters

run_mode = "latest"
run_date = None

gold_path = "/opt/airflow/data-layer/gold"
aggregated_name = "flights_aggregated.parquet"


In [19]:
import os
from datetime import datetime
from pathlib import Path
from pyspark.sql import functions as F
from pyspark.sql.types import DateType

from transformer.utils.spark_helpers import get_spark_session
from transformer.utils.file_io import find_partition
from transformer.utils.logger import get_logger
from transformer.validation.quality_gates_gold import run_quality_gates_gold

log = get_logger("build_gold_star_schema")


spark = get_spark_session("BuildGoldStarSchema")
log.info("[Serving][Materialize] SparkSession iniciada.")


2025-11-09 03:50:40 [INFO] build_gold_star_schema | [INFO] Logger inicializado no modo standalone (INFO).
2025-11-09 03:50:40 [INFO] spark_helpers | [INFO] SparkSession criada: 'BuildGoldStarSchema' (master=local[*])
2025-11-09 03:50:40 [INFO] build_gold_star_schema | [Serving] SparkSession iniciada.


In [21]:
def materialize_gold_layer(df):
    """
    Gera as tabelas dimensionais e fato da camada gold a partir do dataset agregado.

    Args:
        df (DataFrame): DataFrame consolidado da camada gold.

    Returns:
        dict[str, DataFrame]: DataFrames correspondentes a dim_airline, dim_airport, dim_date e fato_flights.
    """
    # Feriados federais nos EUA em 2015 (UTC)
    us_holidays_2015 = [
        "2015-01-01",
        "2015-01-19",
        "2015-02-16",
        "2015-05-25",
        "2015-07-04",
        "2015-09-07",
        "2015-10-12",
        "2015-11-11",
        "2015-11-26",
        "2015-12-25",
    ]
    holidays_df = (
        spark.createDataFrame([(d,) for d in us_holidays_2015], ["holiday_date"])
            .withColumn("holiday_date", F.col("holiday_date").cast(DateType()))
    )
    
    log.info("[Serving][Materialize] Iniciando materialização da camada gold.")

    # dim_airline
    log.info("[Serving][Materialize] Materializando 'dim_airline'.")
    dim_airline = (
        df.select("airline_iata_code", "airline_name")
            .distinct()
            .withColumn("airline_id", F.monotonically_increasing_id())
            .select("airline_id", "airline_iata_code", "airline_name")
    )

    # dim_airport
    log.info("[Serving][Materialize] Materializando 'dim_airport'.")
    dim_airport = (
        df.select(
            F.col("origin_airport_iata_code").alias("airport_iata_code"),
            F.col("origin_airport_name").alias("airport_name"),
            F.col("origin_city").alias("city_name"),
            F.col("origin_state").alias("state_code"),
            F.col("origin_latitude").alias("latitude"),
            F.col("origin_longitude").alias("longitude")
        )
            .union(
                df.select(
                    F.col("dest_airport_iata_code").alias("airport_iata_code"),
                    F.col("dest_airport_name").alias("airport_name"),
                    F.col("dest_city").alias("city_name"),
                    F.col("dest_state").alias("state_code"),
                    F.col("dest_latitude").alias("latitude"),
                    F.col("dest_longitude").alias("longitude")
                )
            )
            .distinct()
            .withColumn("airport_id", F.monotonically_increasing_id())
            .select("airport_id", "airport_iata_code", "airport_name", "city_name",
                    "state_code", "latitude", "longitude")
    )

    # dim_date
    log.info("[Serving][Materialize] Materializando 'dim_date'.")
    dim_date = (
        df.select(F.col("flight_date").alias("full_date"))
            .distinct()
            .withColumn("year", F.year("full_date"))
            .withColumn("month", F.month("full_date"))
            .withColumn("day", F.dayofmonth("full_date"))
            .withColumn("day_of_week", F.dayofweek("full_date"))
            .withColumn("quarter", F.quarter("full_date"))
            # Augmentation de feriado
            .join(holidays_df, F.col("full_date") == F.col("holiday_date"), "left")
            .withColumn("is_holiday", F.when(F.col("holiday_date").isNotNull(), F.lit(True)).otherwise(F.lit(False)))
            .drop("holiday_date")
            # Fim do augmentation de feriado
            .select("full_date", "year", "month", "day", "day_of_week", "quarter", "is_holiday")
    )

    # fato_flights
    log.info("[Serving][Materialize] Materializando 'fato_flights'.")
    fato_flights = (
        df.withColumn("flight_id", F.monotonically_increasing_id())
            .withColumnRenamed("flight_date", "full_date")
            .select(
                "flight_id", "full_date", "airline_iata_code",
                "origin_airport_iata_code", "dest_airport_iata_code",
                "distance",
                "air_time", "elapsed_time", "scheduled_time",
                "taxi_out", "taxi_in",
                "departure_delay", "arrival_delay", "air_system_delay", 
                "security_delay", "airline_delay", "late_aircraft_delay", "weather_delay"
            )
    )

    # Joins para adicionar fk's
    log.info("[Serving][Materialize] Adicionando fk's na 'fato_flights'.")
    fato_flights = (
        fato_flights
            .join(dim_airline, on="airline_iata_code", how="left")
            .join(dim_airport.select(
                F.col("airport_id").alias("origin_airport_id"),
                F.col("airport_iata_code").alias("origin_airport_iata_code")
            ), on="origin_airport_iata_code", how="left")
            .join(dim_airport.select(
                F.col("airport_id").alias("dest_airport_id"),
                F.col("airport_iata_code").alias("dest_airport_iata_code")
            ), on="dest_airport_iata_code", how="left")
            .select(
                "flight_id", "full_date", "airline_id", "origin_airport_id", "dest_airport_id",
                "distance", "air_time", "elapsed_time", "scheduled_time", "taxi_out",
                "taxi_in", "departure_delay", "arrival_delay",
                "air_system_delay", "security_delay", "airline_delay",
                "late_aircraft_delay", "weather_delay"
            )
    )

    log.info("[Serving][Materialize] Materialização concluída.")
    return {
        "dim_airline": dim_airline,
        "dim_airport": dim_airport,
        "dim_date": dim_date,
        "fato_flights": fato_flights
    }


In [None]:
try:
    log.info("[Serving][Materialize] Iniciando job de materialização da gold.")

    # Localiza partição de entrada
    source_partition = find_partition(gold_path, mode=run_mode, date_str=run_date)
    source_dir = Path(gold_path) / source_partition / "PARQUET" / aggregated_name

    if not source_dir.exists():
        raise FileNotFoundError(f"[Serving][Materialize][ERROR] Arquivo agregado não encontrado: {source_dir}")

    # Lê dataset agregado
    df = spark.read.parquet(str(source_dir))
    log.info(f"[Serving][Materialize] Dataset carregado de {source_dir}")

    # Realiza a materialização
    tables = materialize_gold_layer(df)
    dim_airline = tables["dim_airline"]
    dim_airport = tables["dim_airport"]
    dim_date = tables["dim_date"]
    fato_flights = tables["fato_flights"]

    # Executa quality gate
    log.info("[Serving][Materialize] Iniciando quality gate.")
    run_quality_gates_gold(
        dim_airline=dim_airline,
        dim_airport=dim_airport,
        dim_date=dim_date,
        fato_flights=fato_flights
    )
    log.info("[Serving][Materialize] Quality gate concluído com sucesso.")

    # Define partição de saída
    processing_date = datetime.now().strftime("%Y-%m-%d")
    output_dir = Path(gold_path) / processing_date / "PARQUET"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Escrita dos arquivos na Gold
    log.info("[Serving][Materialize] Iniciando escrita das tabelas na camada gold em formato Parquet.")

    dim_airline.write.mode("overwrite").parquet(str(output_dir / "dim_airline.parquet"))
    dim_airport.write.mode("overwrite").parquet(str(output_dir / "dim_airport.parquet"))
    dim_date.write.mode("overwrite").parquet(str(output_dir / "dim_date.parquet"))
    fato_flights.write.mode("overwrite").parquet(str(output_dir / "fato_flights.parquet"))

    log.info("[Serving][Materialize] Escrita concluída com sucesso.")

except Exception as e:
    log.exception(f"[Serving][Materialize][ERROR] Falha na construção do esquema estrela: {e}.")
    raise

finally:
    log.info("[Serving][Materialize] Fim do job de materialização da gold.")
    

In [32]:
%%script false --no-raise-error # Comentar essa linha se estiver em debug ou se quiser rodar a célula.
# Visualização dos DataFrames
df_show = {
    "dim_airline": dim_airline,
    "dim_airport": dim_airport,
    "dim_date": dim_date,
    "fato_flights": fato_flights
}

for name, d in df_show.items():
    print(f"\n+---------------------------------------- {name} ------------------------------------------+")
    d.printSchema()
    d.limit(1).show(truncate=True)



+---------------------------------------- dim_airline ------------------------------------------+
root
 |-- airline_id: long (nullable = false)
 |-- airline_iata_code: string (nullable = true)
 |-- airline_name: string (nullable = true)

+----------+-----------------+---------------+
|airline_id|airline_iata_code|   airline_name|
+----------+-----------------+---------------+
|         0|               B6|JetBlue Airways|
+----------+-----------------+---------------+


+---------------------------------------- dim_airport ------------------------------------------+
root
 |-- airport_id: long (nullable = false)
 |-- airport_iata_code: string (nullable = true)
 |-- airport_name: string (nullable = true)
 |-- city_name: string (nullable = true)
 |-- state_code: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)



                                                                                

+----------+-----------------+--------------------+---------+----------+--------+----------+
|airport_id|airport_iata_code|        airport_name|city_name|state_code|latitude| longitude|
+----------+-----------------+--------------------+---------+----------+--------+----------+
|         0|              OAK|Oakland Internati...|  Oakland|        CA|37.72129|-122.22072|
+----------+-----------------+--------------------+---------+----------+--------+----------+


+---------------------------------------- dim_date ------------------------------------------+
root
 |-- full_date: date (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- quarter: integer (nullable = true)
 |-- is_holiday: boolean (nullable = false)

+----------+----+-----+---+-----------+-------+----------+
| full_date|year|month|day|day_of_week|quarter|is_holiday|
+----------+----+-----+---+---------



+-----------+----------+----------+-----------------+---------------+--------+--------+------------+--------------+--------+-------+---------------+-------------+----------------+--------------+-------------+-------------------+-------------+
|  flight_id| full_date|airline_id|origin_airport_id|dest_airport_id|distance|air_time|elapsed_time|scheduled_time|taxi_out|taxi_in|departure_delay|arrival_delay|air_system_delay|security_delay|airline_delay|late_aircraft_delay|weather_delay|
+-----------+----------+----------+-----------------+---------------+--------+--------+------------+--------------+--------+-------+---------------+-------------+----------------+--------------+-------------+-------------------+-------------+
|17179869184|2015-11-01|         3|              130|            242|  1299.0|   154.0|       177.0|         177.0|    17.0|    6.0|            2.0|          2.0|             0.0|           0.0|          0.0|                0.0|          0.0|
+-----------+----------+----

                                                                                

In [33]:
# Encerrando a sessão do Spark.
spark.stop()
log.info("[Serving][Materialize] Sessão Spark finalizada.")


2025-11-09 04:23:09 [INFO] build_gold_star_schema | [Serving] Sessão Spark finalizada.
