In [1]:
# Parameters
# Run-specific parameter values. NOTE(review): this looks like the cell that
# papermill injects at execution time — confirm. The notebook's own parameters
# cell further down declares the same names.
run_mode = "latest"
run_date = None
silver_path = "/opt/airflow/data-layer/silver"
gold_path = "/opt/airflow/data-layer/gold"
aggregated_name = "flights_aggregated.parquet"
# Connection id later passed as `db_conn_id` to the PostgreSQL read/load helpers.
postgres_conn_id = "dw"


# etl_silver_to_gold
---
Este notebook executa o processo `ETL` que transfere os dados da camada **Silver** para a **Gold**, englobando normalização, movimentação dos arquivos e carga dos dados no *PostgreSQL*, concluindo o pipeline.


In [2]:
# Parameters
#
# Default values for the notebook parameters.
#
# An earlier cell in this notebook already assigns these same names with the
# run-specific values (it appears to be the papermill-injected cell — confirm).
# Plain assignments here would silently overwrite those values, so each default
# is applied only when the name is not defined yet.
#
# NOTE(review): the structural fix is to tag this cell as `parameters` so the
# injected cell is placed *after* it. When working interactively, restart the
# kernel (or assign the variable directly) to change a value, since names that
# already exist are left untouched.
_PARAMETER_DEFAULTS = {
    "run_mode": "latest",
    "run_date": None,
    "silver_path": "/opt/airflow/data-layer/silver",
    "gold_path": "/opt/airflow/data-layer/gold",
    "aggregated_name": "flights_aggregated.parquet",
    "postgres_conn_id": "AIRFLOW_VAR_POSTGRES_CONN_ID",
}

for _param_name, _param_default in _PARAMETER_DEFAULTS.items():
    globals().setdefault(_param_name, _param_default)


In [3]:
import os
import shutil
from datetime import datetime
from pathlib import Path

from pyspark.sql import DataFrame, functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.types import DateType

from transformer.utils.file_io import find_partition
from transformer.utils.logger import get_logger
from transformer.utils.postgre_helpers import assert_table_rowcount
from transformer.utils.quality_gates_gold import run_quality_gates_gold
from transformer.utils.spark_helpers import get_spark_session, load_to_postgres, read_from_postgres


## Job 1: build_and_load_gold_star_schema

Este job constrói o esquema estrela da camada **Gold**: materializa as tabelas dimensionais e a tabela fato a partir da tabela `silver_flights`, salva os dados em formato `parquet` na camada **Gold** e os carrega no *PostgreSQL* de acordo com o DDL da camada.


In [4]:
# Notebook-level logger and Spark session; the cells below (materialization
# function and runner) read both `log` and `spark` as globals.
log = get_logger("build_and_load_gold_star_schema")

spark = get_spark_session("BuildLoadGoldStarSchema")
log.info("[BuildLoad] SparkSession iniciada.")


/usr/local/lib/python3.12/site-packages/pyspark/bin/load-spark-env.sh: line 68: ps: command not found


:: loading settings :: url = jar:file:/usr/local/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.postgresql#postgresql added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-9401ddb0-38e6-443d-8c08-8d76ec8d311a;1.0
	confs: [default]


	found org.postgresql#postgresql;42.7.3 in central
	found org.checkerframework#checker-qual;3.42.0 in central
:: resolution report :: resolve 466ms :: artifacts dl 22ms
	:: modules in use:
	org.checkerframework#checker-qual;3.42.0 from central in [default]
	org.postgresql#postgresql;42.7.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-9401ddb0-38e6-443d-8c08-8d76ec8d311a
	confs: [default]
	0 artifacts copied, 2 already retrieved (0kB/25ms)


25/11/27 03:15:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


2025-11-27 03:16:02 [INFO] spark_helpers: [INFO] SparkSession criada: 'BuildLoadGoldStarSchema' (master=local[*]).


2025-11-27 03:16:02 [INFO] build_and_load_gold_star_schema: [BuildLoad] SparkSession iniciada.


### Definindo função de materialização

In [5]:
def materialize_gold_layer(df: DataFrame) -> dict[str, DataFrame]:
    """
    Build the gold-layer star schema (three dimensions and one fact) from a flights DataFrame.

    Surrogate keys of ``dim_air`` and ``dim_apt`` are assigned with ``row_number()`` over a
    total ordering of the dimension columns (values start at 1, type ``long``). The returned
    DataFrames are lazy and are re-evaluated by every action that consumes them as well as by
    the joins that resolve the foreign keys of ``fat_flt``. Ids produced by
    ``monotonically_increasing_id()`` depend on the partition layout of each evaluation, so
    they are not guaranteed to be the same in the stored dimension and in the fact table.
    Ordering by the natural key removes that dependency.

    Args:
        df (DataFrame): Flight-level DataFrame (the runner passes the contents of
            ``silver.silver_flights``). It must expose the columns selected below:
            airline code/name, origin_*/dest_* airport attributes, ``flight_date`` and
            the distance/time/delay measures.

    Returns:
        dict[str, DataFrame]: Lazy DataFrames under the keys ``"dim_air"``, ``"dim_apt"``,
        ``"dim_dat"`` and ``"fat_flt"`` (in that order).
    """
    # US federal holidays in 2015, as plain calendar dates.
    # NOTE(review): July 4th 2015 fell on a Saturday; the observed day (July 3rd)
    # is not listed — confirm whether observed dates should count as holidays.
    us_holidays_2015 = [
        "2015-01-01",
        "2015-01-19",
        "2015-02-16",
        "2015-05-25",
        "2015-07-04",
        "2015-09-07",
        "2015-10-12",
        "2015-11-11",
        "2015-11-26",
        "2015-12-25",
    ]
    # Use the session that owns `df` rather than the notebook-global `spark`, so the
    # function does not depend on hidden notebook state.
    holidays_df = (
        df.sparkSession.createDataFrame([(d,) for d in us_holidays_2015], ["holiday_date"])
        .withColumn("holiday_date", F.col("holiday_date").cast(DateType()))
    )

    def _with_surrogate_key(dim: DataFrame, key_name: str, order_cols: list[str]) -> DataFrame:
        """Prepend a deterministic 1-based surrogate key ordered by `order_cols`."""
        # An un-partitioned window pulls all rows into one partition; acceptable here
        # because it is only applied to the (small) de-duplicated dimensions.
        ordering = Window.orderBy(*order_cols)
        return (
            dim.withColumn(key_name, F.row_number().over(ordering).cast("long"))
            .select(key_name, *order_cols)
        )

    def _airport_side(prefix: str) -> DataFrame:
        """Project the `<prefix>_*` airport columns of `df` onto the dim_apt column names."""
        return df.select(
            F.col(f"{prefix}_airport_iata_code").alias("airport_iata_code"),
            F.col(f"{prefix}_airport_name").alias("airport_name"),
            F.col(f"{prefix}_city").alias("city_name"),
            F.col(f"{prefix}_state").alias("state_code"),
            F.col(f"{prefix}_latitude").alias("latitude"),
            F.col(f"{prefix}_longitude").alias("longitude"),
        )

    log.info("[Materialize] Iniciando materialização da camada gold.")

    # dim_air: one row per distinct (code, name) pair.
    log.info("[Materialize] Materializando 'dim_air'.")
    dim_air = _with_surrogate_key(
        df.select("airline_iata_code", "airline_name").distinct(),
        "airline_id",
        ["airline_iata_code", "airline_name"],
    )

    # dim_apt: airports seen either as origin or as destination.
    # The natural key comes first in the ordering; the remaining columns only break
    # ties so the numbering stays total even if a code appears with differing attributes.
    log.info("[Materialize] Materializando 'dim_apt'.")
    dim_apt = _with_surrogate_key(
        _airport_side("origin").union(_airport_side("dest")).distinct(),
        "airport_id",
        ["airport_iata_code", "airport_name", "city_name", "state_code", "latitude", "longitude"],
    )

    # dim_dat: calendar attributes for every distinct flight date, keyed by the date itself.
    log.info("[Materialize] Materializando 'dim_dat'.")
    dim_dat = (
        df.select(F.col("flight_date").alias("full_date"))
        .distinct()
        .withColumn("year", F.year("full_date"))
        .withColumn("month", F.month("full_date"))
        .withColumn("day", F.dayofmonth("full_date"))
        .withColumn("day_of_week", F.dayofweek("full_date"))
        .withColumn("quarter", F.quarter("full_date"))
        # Holiday augmentation: a date is a holiday when the left join found a match.
        .join(holidays_df, F.col("full_date") == F.col("holiday_date"), "left")
        .withColumn("is_holiday", F.col("holiday_date").isNotNull())
        .drop("holiday_date")
        .select("full_date", "year", "month", "day", "day_of_week", "quarter", "is_holiday")
    )

    # fat_flt: one row per flight with its measures.
    # flight_id keeps monotonically_increasing_id(): a global row_number over the whole
    # fact table would force every row through a single partition.
    log.info("[Materialize] Materializando 'fat_flt'.")
    fat_flt = (
        df.withColumn("flight_id", F.monotonically_increasing_id())
        .withColumnRenamed("flight_date", "full_date")
        .select(
            "flight_id", "full_date", "airline_iata_code",
            "origin_airport_iata_code", "dest_airport_iata_code",
            "distance",
            "air_time", "elapsed_time", "scheduled_time",
            "taxi_out", "taxi_in",
            "departure_delay", "arrival_delay", "air_system_delay",
            "security_delay", "airline_delay", "late_aircraft_delay", "weather_delay",
        )
    )

    # Resolve the natural keys to the dimension surrogate keys (left joins keep every flight).
    log.info("[Materialize] Adicionando fk's na 'fat_flt'.")
    origin_keys = dim_apt.select(
        F.col("airport_id").alias("origin_airport_id"),
        F.col("airport_iata_code").alias("origin_airport_iata_code"),
    )
    dest_keys = dim_apt.select(
        F.col("airport_id").alias("dest_airport_id"),
        F.col("airport_iata_code").alias("dest_airport_iata_code"),
    )
    fat_flt = (
        fat_flt
        .join(dim_air, on="airline_iata_code", how="left")
        .join(origin_keys, on="origin_airport_iata_code", how="left")
        .join(dest_keys, on="dest_airport_iata_code", how="left")
        .select(
            "flight_id", "full_date", "airline_id", "origin_airport_id", "dest_airport_id",
            "distance", "air_time", "elapsed_time", "scheduled_time", "taxi_out",
            "taxi_in", "departure_delay", "arrival_delay",
            "air_system_delay", "security_delay", "airline_delay",
            "late_aircraft_delay", "weather_delay",
        )
    )

    log.info("[Materialize] Materialização concluída.")

    return {
        "dim_air": dim_air,
        "dim_apt": dim_apt,
        "dim_dat": dim_dat,
        "fat_flt": fat_flt,
    }


### Runner para o job `build_and_load_gold_star_schema`

In [6]:
# Runner for the gold build-and-load job:
#   1. read silver.silver_flights from PostgreSQL,
#   2. materialize the star schema and run the gold quality gates,
#   3. write each table as parquet under <gold_path>/<today>/PARQUET,
#   4. load each table into the `gold` schema and validate the row count,
#      truncating the table when the validation fails.
# Any failure is logged and re-raised so the notebook run is marked as failed.

# Kept outside the `try` so `finally` can release the cache even on early failure.
silver_flights = None

# Dimensions are loaded before the fact table that references them.
GOLD_LOAD_ORDER = ("dim_air", "dim_apt", "dim_dat", "fat_flt")

try:
    log.info("[BuildLoad] Iniciando job de materialização da gold.")

    # Every gold DataFrame is lazy and derives from this source, so each action
    # (quality gates, parquet writes, counts, JDBC loads) would otherwise re-read
    # the table over JDBC. Persisting reads it once; it is also a best-effort way
    # of keeping the partition-dependent `flight_id` stable across those actions.
    silver_flights = read_from_postgres(
        spark=spark,
        db_conn_id=postgres_conn_id,
        table_name="silver.silver_flights",
    ).persist()

    log.info("[BuildLoad] Datasets carregado a partir do PostgreSQL.")

    # Materialize the star schema; the individual names stay available to later cells.
    tables = materialize_gold_layer(silver_flights)
    dim_air = tables["dim_air"]
    dim_apt = tables["dim_apt"]
    dim_dat = tables["dim_dat"]
    fat_flt = tables["fat_flt"]

    # Quality gate: raises when a check fails, aborting before anything is written.
    log.info("[BuildLoad] Iniciando quality gate.")
    run_quality_gates_gold(
        dim_air=dim_air,
        dim_apt=dim_apt,
        dim_dat=dim_dat,
        fat_flt=fat_flt
    )
    log.info("[BuildLoad] Quality gate concluído com sucesso.")

    # Output partition is named after the processing (wall-clock) date.
    processing_date = datetime.now().strftime("%Y-%m-%d")
    output_dir = Path(gold_path) / processing_date / "PARQUET"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Parquet copies in the gold layer (kept for debugging).
    log.info("[BuildLoad] Iniciando escrita dos arquivos na camada gold.")

    for table_name in GOLD_LOAD_ORDER:
        tables[table_name].write.mode("overwrite").parquet(str(output_dir / f"{table_name}.parquet"))

    log.info("[BuildLoad] Escrita concluída com sucesso.")

    log.info("[BuildLoad] Iniciando carga da gold.")

    # Load into PostgreSQL and validate each table right after its load.
    for table_name in GOLD_LOAD_ORDER:
        table_df = tables[table_name]
        full_table_name = f"gold.{table_name}"

        log.info(f"[BuildLoad] Carregando tabela: {full_table_name}.")
        expected_count = table_df.count()

        load_to_postgres(
            df=table_df,
            db_conn_id=postgres_conn_id,
            table_name=full_table_name,
            mode="overwrite"
        )

        log.info(f"[BuildLoad] Tabela '{full_table_name}' carregada. Validando integridade.")

        # Validation; on failure the table is emptied so no partial load is left behind.
        try:
            assert_table_rowcount(
                db_conn_id=postgres_conn_id,
                table_name=full_table_name,
                expected_count=expected_count,
            )
        except Exception as e:
            log.error(f"[BuildLoad][ERROR] Validação falhou para '{full_table_name}'. Limpando tabela.")

            # The cleanup is best effort: if it fails, log it and still surface the
            # validation error below instead of masking it with the cleanup error.
            try:
                import psycopg2

                conn_pg = psycopg2.connect(
                    host=os.getenv("DB_HOST", "localhost"),
                    port=os.getenv("DB_PORT", "5432"),
                    dbname=os.getenv("DB_NAME", "postgres"),
                    user=os.getenv("DB_USER", "postgres"),
                    password=os.getenv("DB_PASSWORD", "postgres"),
                )
                try:
                    # `with conn_pg` commits (or rolls back) the transaction but does
                    # not close the connection, hence the explicit close below.
                    with conn_pg, conn_pg.cursor() as cur:
                        cur.execute(f"TRUNCATE TABLE {full_table_name} CASCADE;")
                finally:
                    conn_pg.close()
            except Exception:
                log.exception(f"[BuildLoad][ERROR] Falha ao limpar a tabela '{full_table_name}'.")

            raise ValueError(f"[BuildLoad][ERROR] Falha na validação da tabela '{full_table_name}'.") from e

        log.info(f"[BuildLoad] Validação concluída com sucesso: {full_table_name}.")

    log.info("[BuildLoad] Carga de todas as tabelas concluída com sucesso.")

except Exception as e:
    log.exception(f"[BuildLoad][ERROR] Falha na construção do esquema estrela: {e}.")
    raise

finally:
    # Release the cached source; later cells recompute from PostgreSQL if they need it.
    if silver_flights is not None:
        silver_flights.unpersist()
    log.info("[BuildLoad] Job de materialização da gold encerrado.")


2025-11-27 03:16:03 [INFO] build_and_load_gold_star_schema: [BuildLoad] Iniciando job de materialização da gold.


2025-11-27 03:16:03 [INFO] spark_helpers: [READ] Iniciando leitura de 'silver.silver_flights'.


2025-11-27 03:16:03 [WARN] spark_helpers: [WARN] Airflow indisponível, usando variáveis de ambiente para conexão PostgreSQL.


[Stage 0:>                                                          (0 + 0) / 1][Stage 0:>                                                          (0 + 1) / 1]

                                                                                2025-11-27 03:16:22 [INFO] spark_helpers: [READ] Leitura concluída: 'silver.silver_flights'. Linhas: 5208259


2025-11-27 03:16:22 [INFO] build_and_load_gold_star_schema: [BuildLoad] Datasets carregado a partir do PostgreSQL.


2025-11-27 03:16:24 [INFO] build_and_load_gold_star_schema: [Materialize] Iniciando materialização da camada gold.


2025-11-27 03:16:24 [INFO] build_and_load_gold_star_schema: [Materialize] Materializando 'dim_air'.


2025-11-27 03:16:24 [INFO] build_and_load_gold_star_schema: [Materialize] Materializando 'dim_apt'.


2025-11-27 03:16:24 [INFO] build_and_load_gold_star_schema: [Materialize] Materializando 'dim_dat'.


2025-11-27 03:16:25 [INFO] build_and_load_gold_star_schema: [Materialize] Materializando 'fat_flt'.


2025-11-27 03:16:25 [INFO] build_and_load_gold_star_schema: [Materialize] Adicionando fk's na 'fat_flt'.


2025-11-27 03:16:25 [INFO] build_and_load_gold_star_schema: [Materialize] Materialização concluída.


2025-11-27 03:16:25 [INFO] build_and_load_gold_star_schema: [BuildLoad] Iniciando quality gate.


2025-11-27 03:16:25 [INFO] quality_gates_gold: [Quality][Gold] Iniciando validações.


[Stage 3:>                                                          (0 + 1) / 1]

[Stage 3:>                                                          (0 + 1) / 1]

[Stage 3:>                                                          (0 + 1) / 1]

[Stage 3:>                                                          (0 + 1) / 1]

                                                                                2025-11-27 03:19:48 [INFO] quality_gates_gold: [Quality][Gold]      _check_unique: 'airline_iata_code' OK.


[Stage 15:>                                                         (0 + 2) / 2]

[Stage 15:>                                                         (0 + 2) / 2]

[Stage 15:>                                                         (0 + 2) / 2]

[Stage 15:>                                                         (0 + 2) / 2]

                                                                                2025-11-27 03:22:55 [INFO] quality_gates_gold: [Quality][Gold]      _check_unique: 'airport_iata_code' OK.


[Stage 27:>                 (0 + 1) / 1][Stage 28:>                 (0 + 7) / 8]



[Stage 27:>                                                         (0 + 1) / 1]

                                                                                

[Stage 27:>                                                         (0 + 1) / 1]



                                                                                2025-11-27 03:23:11 [INFO] quality_gates_gold: [Quality][Gold]      _check_unique: 'full_date' OK.


[Stage 38:>                                                         (0 + 1) / 1]

[Stage 38:>                                                         (0 + 1) / 1]

[Stage 38:>                                                         (0 + 1) / 1]



























[Stage 50:>                                                         (0 + 8) / 8]



2025-11-27 03:25:37 [INFO] quality_gates_gold: [Quality][Gold]      _check_unique: 'flight_id' OK.


2025-11-27 03:25:38 [INFO] quality_gates_gold: [Quality][Gold]      _check_no_nulls: fat_flt OK.




                                                                                2025-11-27 03:25:41 [INFO] quality_gates_gold: [Quality][Gold]           _check_fk_integrity: [fat_flt] 'airline_id' <-> 'dim_air.airline_id' OK.


2025-11-27 03:25:43 [INFO] quality_gates_gold: [Quality][Gold]           _check_fk_integrity: [fat_flt] 'origin_airport_id' <-> 'dim_apt.airport_id' OK.


2025-11-27 03:25:45 [INFO] quality_gates_gold: [Quality][Gold]           _check_fk_integrity: [fat_flt] 'dest_airport_id' <-> 'dim_apt.airport_id' OK.




                                                                                2025-11-27 03:25:47 [INFO] quality_gates_gold: [Quality][Gold]           _check_fk_integrity: [fat_flt] 'full_date' <-> 'dim_dat.full_date' OK.


2025-11-27 03:25:47 [INFO] quality_gates_gold: [Quality][Gold] Todas as validações concluídas com sucesso.


2025-11-27 03:25:47 [INFO] build_and_load_gold_star_schema: [BuildLoad] Quality gate concluído com sucesso.


2025-11-27 03:25:47 [INFO] build_and_load_gold_star_schema: [BuildLoad] Iniciando escrita dos arquivos na camada gold.




                                                                                

[Stage 100:>                                                        (0 + 8) / 8]                                                                                

[Stage 102:>                                                        (0 + 8) / 8]                                                                                













                                                                                

2025-11-27 03:26:19 [INFO] build_and_load_gold_star_schema: [BuildLoad] Escrita concluída com sucesso.


2025-11-27 03:26:19 [INFO] build_and_load_gold_star_schema: [BuildLoad] Iniciando carga da gold.


2025-11-27 03:26:19 [INFO] build_and_load_gold_star_schema: [BuildLoad] Carregando tabela: gold.dim_air.


2025-11-27 03:26:19 [WARN] spark_helpers: [WARN] Airflow indisponível, usando variáveis de ambiente para conexão PostgreSQL.


2025-11-27 03:26:19 [INFO] spark_helpers: [LOAD] Limpando tabela 'gold.dim_air' (TRUNCATE).


[Stage 111:>                                                        (0 + 8) / 8]



                                                                                2025-11-27 03:26:24 [INFO] spark_helpers: [LOAD] Carga concluída em 'gold.dim_air' (modo=append).


2025-11-27 03:26:24 [INFO] build_and_load_gold_star_schema: [BuildLoad] Tabela 'gold.dim_air' carregada. Validando integridade.


2025-11-27 03:26:24 [INFO] postgres_helpers: [AssertRowCount] Validando contagem da tabela 'gold.dim_air'.


2025-11-27 03:26:24 [INFO] postgres_helpers: [AssertRowCount] Esperado: 14 | Encontrado: 14


2025-11-27 03:26:24 [INFO] postgres_helpers: [AssertRowCount] Validação concluída com sucesso.


2025-11-27 03:26:24 [INFO] build_and_load_gold_star_schema: [BuildLoad] Validação concluída com sucesso: gold.dim_air.


2025-11-27 03:26:24 [INFO] build_and_load_gold_star_schema: [BuildLoad] Carregando tabela: gold.dim_apt.


2025-11-27 03:26:24 [WARN] spark_helpers: [WARN] Airflow indisponível, usando variáveis de ambiente para conexão PostgreSQL.


2025-11-27 03:26:24 [INFO] spark_helpers: [LOAD] Limpando tabela 'gold.dim_apt' (TRUNCATE).




2025-11-27 03:26:27 [INFO] spark_helpers: [LOAD] Carga concluída em 'gold.dim_apt' (modo=append).


2025-11-27 03:26:27 [INFO] build_and_load_gold_star_schema: [BuildLoad] Tabela 'gold.dim_apt' carregada. Validando integridade.


2025-11-27 03:26:27 [INFO] postgres_helpers: [AssertRowCount] Validando contagem da tabela 'gold.dim_apt'.


2025-11-27 03:26:27 [INFO] postgres_helpers: [AssertRowCount] Esperado: 322 | Encontrado: 322


2025-11-27 03:26:27 [INFO] postgres_helpers: [AssertRowCount] Validação concluída com sucesso.


2025-11-27 03:26:27 [INFO] build_and_load_gold_star_schema: [BuildLoad] Validação concluída com sucesso: gold.dim_apt.


2025-11-27 03:26:27 [INFO] build_and_load_gold_star_schema: [BuildLoad] Carregando tabela: gold.dim_dat.


2025-11-27 03:26:28 [WARN] spark_helpers: [WARN] Airflow indisponível, usando variáveis de ambiente para conexão PostgreSQL.


2025-11-27 03:26:28 [INFO] spark_helpers: [LOAD] Limpando tabela 'gold.dim_dat' (TRUNCATE).




2025-11-27 03:26:30 [INFO] spark_helpers: [LOAD] Carga concluída em 'gold.dim_dat' (modo=append).


2025-11-27 03:26:30 [INFO] build_and_load_gold_star_schema: [BuildLoad] Tabela 'gold.dim_dat' carregada. Validando integridade.


2025-11-27 03:26:30 [INFO] postgres_helpers: [AssertRowCount] Validando contagem da tabela 'gold.dim_dat'.


2025-11-27 03:26:30 [INFO] postgres_helpers: [AssertRowCount] Esperado: 334 | Encontrado: 334


2025-11-27 03:26:30 [INFO] postgres_helpers: [AssertRowCount] Validação concluída com sucesso.


2025-11-27 03:26:30 [INFO] build_and_load_gold_star_schema: [BuildLoad] Validação concluída com sucesso: gold.dim_dat.


2025-11-27 03:26:30 [INFO] build_and_load_gold_star_schema: [BuildLoad] Carregando tabela: gold.fat_flt.


2025-11-27 03:26:31 [WARN] spark_helpers: [WARN] Airflow indisponível, usando variáveis de ambiente para conexão PostgreSQL.


2025-11-27 03:26:31 [INFO] spark_helpers: [LOAD] Limpando tabela 'gold.fat_flt' (TRUNCATE).






























                                                                                

2025-11-27 03:35:32 [INFO] spark_helpers: [LOAD] Carga concluída em 'gold.fat_flt' (modo=append).


2025-11-27 03:35:32 [INFO] build_and_load_gold_star_schema: [BuildLoad] Tabela 'gold.fat_flt' carregada. Validando integridade.


2025-11-27 03:35:32 [INFO] postgres_helpers: [AssertRowCount] Validando contagem da tabela 'gold.fat_flt'.


2025-11-27 03:35:59 [INFO] postgres_helpers: [AssertRowCount] Esperado: 5,208,259 | Encontrado: 5,208,259


2025-11-27 03:35:59 [INFO] postgres_helpers: [AssertRowCount] Validação concluída com sucesso.


2025-11-27 03:35:59 [INFO] build_and_load_gold_star_schema: [BuildLoad] Validação concluída com sucesso: gold.fat_flt.


2025-11-27 03:35:59 [INFO] build_and_load_gold_star_schema: [BuildLoad] Carga de todas as tabelas concluída com sucesso.


2025-11-27 03:35:59 [INFO] build_and_load_gold_star_schema: [BuildLoad] Job de materialização da gold encerrado.


In [7]:
%%script false --no-raise-error # Comentar essa linha se estiver em debug ou se quiser rodar a célula.

# Debug-only cell, skipped by the cell magic on the first line.
# Comment that first line out to run it.

# --- In-memory DataFrames: print the schema and one sample row of each ---
df_show = dict(dim_air=dim_air, dim_apt=dim_apt, dim_dat=dim_dat, fat_flt=fat_flt)

for label in df_show:
    frame = df_show[label]
    print(f"\n{label}\n")
    frame.printSchema()
    frame.limit(1).show(truncate=True)

# --- Loaded PostgreSQL tables: read back over JDBC and show one row ---
# Connection settings come from the DB_* environment variables.
pg_host = os.getenv("DB_HOST", "localhost")
pg_port = os.getenv("DB_PORT", "5432")
pg_database = os.getenv("DB_NAME", "postgres")
jdbc_url = f"jdbc:postgresql://{pg_host}:{pg_port}/{pg_database}"

connection_properties = dict(
    user=os.getenv("DB_USER", "postgres"),
    password=os.getenv("DB_PASSWORD", "postgres"),
    driver="org.postgresql.Driver",
)

# Only the dimension tables are read back; fat_flt is not in this list.
tables_to_check = ["dim_air", "dim_apt", "dim_dat"]
for tbl in tables_to_check:
    print(f"\n gold.{tbl} \n")
    df_check = spark.read.jdbc(
        url=jdbc_url,
        table=f"gold.{tbl}",
        properties=connection_properties,
    )
    df_check.limit(1).show(truncate=True)


In [8]:
# Shut down the Spark session; this is the last code cell of the notebook.
spark.stop()
log.info("[BuildLoad] Sessão Spark finalizada.")


2025-11-27 03:36:01 [INFO] build_and_load_gold_star_schema: [BuildLoad] Sessão Spark finalizada.
