# 03_etl_silver_to_gold
---
Este notebook executa o processo `ETL` que transfere os dados da camada **Silver** para a **Gold**, englobando normalização, movimentação dos arquivos e carga dos dados no **PostgreSQL**, dando finalidade ao pipeline.


In [1]:
# Parameters

run_mode = "latest"
run_date = None

silver_path = "/opt/airflow/data-layer/silver"
gold_path = "/opt/airflow/data-layer/gold"

aggregated_name = "flights_aggregated.parquet"
postgres_conn_id = "AIRFLOW_VAR_POSTGRES_CONN_ID"


In [None]:
import os
import shutil
from datetime import datetime
from pathlib import Path

from transformer.utils.file_io import find_partition, delete_files
from transformer.utils.logger import get_logger
from transformer.utils.spark_helpers import get_spark_session, load_to_postgres
from transformer.utils.postgre_helpers import assert_table_rowcount
from transformer.utils.quality_gates_gold import run_quality_gates_gold

from pyspark.sql import functions as F
from pyspark.sql.types import DateType
from pyspark.sql import SparkSession


## Job 1: move_file_silver_to_gold

Este job realiza a copia o dataset agregado `flights_aggregated.parquet` da camada **Silver** para a **Gold**.


In [None]:
log = get_logger("move_file_silver_to_gold")

spark = get_spark_session("MoveFileSilverToGold")
log.info("[MoveFileGold] SparkSession iniciada.")


### Runner para o job `move_file_silver_to_gold`

In [None]:
try:
    log.info("[MoveFileGold] Iniciando job de movimentação.")

    # Resolve partição e caminhos
    source_partition = find_partition(silver_path, mode=run_mode, date_str=run_date)
    source_dir = Path(silver_path) / source_partition / "PARQUET" / aggregated_name

    if not source_dir.exists():
        raise FileNotFoundError(f"[MoveFileGold][ERROR] Arquivo agregado não encontrado: {source_dir}.")

    # Define data de processamento e caminho destino
    processing_date = datetime.now().strftime("%Y-%m-%d")
    dest_dir = Path(gold_path) / processing_date / "PARQUET" / aggregated_name
    dest_dir.parent.mkdir(parents=True, exist_ok=True)

    log.info(f"[MoveFileGold] Movendo '{source_dir}' -> '{dest_dir}'.")

    # Cópia do arquivo
    shutil.copytree(source_dir, dest_dir, dirs_exist_ok=True)
    log.info("[MoveFileGold] Cópia concluída com sucesso.")

except Exception as e:
    log.exception(f"[MoveFileGold][ERROR] Falha durante execução: {e}.")
    raise

finally:
    log.info("[MoveFileGold] Job de movimentação encerrado.")


In [6]:
%%script false --no-raise-error # Comentar essa linha se estiver em debug ou se quiser rodar a célula.

partition = find_partition(gold_path, mode=run_mode, date_str=run_date)
for item in (Path(gold_path) / partition / "PARQUET").iterdir():
    print(item)


In [None]:
# Encerrando a sessão do Spark.
spark.stop()
log.info("[MoveFileGold] Sessão Spark finalizada.")


## Job 2: build_gold_star_schema

Este job realiza a construção do esquema estrela da camada **Gold**, materializando as tabelas dimensionais e fato a partir do dataset `flights_aggregated.parquet`.


In [8]:
log = get_logger("build_gold_star_schema")

spark = get_spark_session("BuildGoldStarSchema")
log.info("[Materialize] SparkSession iniciada.")

# Ajustes de performance para o Spark
spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set("spark.sql.shuffle.partitions", "32")


2025-11-17 01:41:50 [INFO] build_gold_star_schema | [INFO] Logger inicializado no modo standalone (INFO).
2025-11-17 01:41:51 [INFO] spark_helpers | [INFO] SparkSession criada com sucesso: 'BuildGoldStarSchema' (master=local[*]).
2025-11-17 01:41:51 [INFO] build_gold_star_schema | [Materialize] SparkSession iniciada.


### Definindo função de materialização

In [9]:
def materialize_gold_layer(df):
    """
    Gera as tabelas dimensionais e fato da camada gold a partir do dataset agregado.

    Args:
        df (DataFrame): DataFrame consolidado da camada gold.

    Returns:
        dict[str, DataFrame]: DataFrames correspondentes a dim_airline, dim_airport, dim_date e fato_flights.
    """
    # Feriados federais nos EUA em 2015 (UTC)
    us_holidays_2015 = [
        "2015-01-01",
        "2015-01-19",
        "2015-02-16",
        "2015-05-25",
        "2015-07-04",
        "2015-09-07",
        "2015-10-12",
        "2015-11-11",
        "2015-11-26",
        "2015-12-25",
    ]
    holidays_df = (
        spark.createDataFrame([(d,) for d in us_holidays_2015], ["holiday_date"])
            .withColumn("holiday_date", F.col("holiday_date").cast(DateType()))
    )
    
    log.info("[Materialize] Iniciando materialização da camada gold.")

    # dim_airline
    log.info("[Materialize] Materializando 'dim_airline'.")
    dim_airline = (
        df.select("airline_iata_code", "airline_name")
            .distinct()
            .withColumn("airline_id", F.monotonically_increasing_id())
            .select("airline_id", "airline_iata_code", "airline_name")
    )

    # dim_airport
    log.info("[Materialize] Materializando 'dim_airport'.")
    dim_airport = (
        df.select(
            F.col("origin_airport_iata_code").alias("airport_iata_code"),
            F.col("origin_airport_name").alias("airport_name"),
            F.col("origin_city").alias("city_name"),
            F.col("origin_state").alias("state_code"),
            F.col("origin_latitude").alias("latitude"),
            F.col("origin_longitude").alias("longitude")
        )
            .union(
                df.select(
                    F.col("dest_airport_iata_code").alias("airport_iata_code"),
                    F.col("dest_airport_name").alias("airport_name"),
                    F.col("dest_city").alias("city_name"),
                    F.col("dest_state").alias("state_code"),
                    F.col("dest_latitude").alias("latitude"),
                    F.col("dest_longitude").alias("longitude")
                )
            )
            .distinct()
            .withColumn("airport_id", F.monotonically_increasing_id())
            .select("airport_id", "airport_iata_code", "airport_name", "city_name",
                    "state_code", "latitude", "longitude")
    )

    # dim_date
    log.info("[Materialize] Materializando 'dim_date'.")
    dim_date = (
        df.select(F.col("flight_date").alias("full_date"))
            .distinct()
            .withColumn("year", F.year("full_date"))
            .withColumn("month", F.month("full_date"))
            .withColumn("day", F.dayofmonth("full_date"))
            .withColumn("day_of_week", F.dayofweek("full_date"))
            .withColumn("quarter", F.quarter("full_date"))
            # Augmentation de feriado
            .join(holidays_df, F.col("full_date") == F.col("holiday_date"), "left")
            .withColumn("is_holiday", F.when(F.col("holiday_date").isNotNull(), F.lit(True)).otherwise(F.lit(False)))
            .drop("holiday_date")
            # Fim do augmentation de feriado
            .select("full_date", "year", "month", "day", "day_of_week", "quarter", "is_holiday")
    )

    # fato_flights
    log.info("[Materialize] Materializando 'fato_flights'.")
    fato_flights = (
        df.withColumn("flight_id", F.monotonically_increasing_id())
            .withColumnRenamed("flight_date", "full_date")
            .select(
                "flight_id", "full_date", "airline_iata_code",
                "origin_airport_iata_code", "dest_airport_iata_code",
                "distance",
                "air_time", "elapsed_time", "scheduled_time",
                "taxi_out", "taxi_in",
                "departure_delay", "arrival_delay", "air_system_delay", 
                "security_delay", "airline_delay", "late_aircraft_delay", "weather_delay"
            )
    )

    # Joins para adicionar fk's
    log.info("[Materialize] Adicionando fk's na 'fato_flights'.")
    fato_flights = (
        fato_flights
            .join(dim_airline, on="airline_iata_code", how="left")
            .join(dim_airport.select(
                F.col("airport_id").alias("origin_airport_id"),
                F.col("airport_iata_code").alias("origin_airport_iata_code")
            ), on="origin_airport_iata_code", how="left")
            .join(dim_airport.select(
                F.col("airport_id").alias("dest_airport_id"),
                F.col("airport_iata_code").alias("dest_airport_iata_code")
            ), on="dest_airport_iata_code", how="left")
            .select(
                "flight_id", "full_date", "airline_id", "origin_airport_id", "dest_airport_id",
                "distance", "air_time", "elapsed_time", "scheduled_time", "taxi_out",
                "taxi_in", "departure_delay", "arrival_delay",
                "air_system_delay", "security_delay", "airline_delay",
                "late_aircraft_delay", "weather_delay"
            )
    )

    log.info("[Materialize] Materialização concluída.")
    
    return {
        "dim_airline": dim_airline,
        "dim_airport": dim_airport,
        "dim_date": dim_date,
        "fato_flights": fato_flights
    }


### Runner para o job `build_gold_star_schema`

In [None]:
try:
    log.info("[Materialize] Iniciando job de materialização da gold.")

    # Resolve partição e caminhos
    source_partition = find_partition(gold_path, mode=run_mode, date_str=run_date)
    source_dir = Path(gold_path) / source_partition / "PARQUET" / aggregated_name

    if not source_dir.exists():
        raise FileNotFoundError(f"[Materialize][ERROR] Arquivo agregado não encontrado: {source_dir}")

    # Lê dataset agregado
    df = spark.read.parquet(str(source_dir))
    log.info(f"[Materialize] Dataset carregado de {source_dir}")

    # Realiza a materialização
    tables = materialize_gold_layer(df)
    dim_airline = tables["dim_airline"]
    dim_airport = tables["dim_airport"]
    dim_date = tables["dim_date"]
    fato_flights = tables["fato_flights"]

    # Executa quality gate
    log.info("[Materialize] Iniciando quality gate.")
    run_quality_gates_gold(
        dim_airline=dim_airline,
        dim_airport=dim_airport,
        dim_date=dim_date,
        fato_flights=fato_flights
    )
    log.info("[Materialize] Quality gate concluído com sucesso.")

    # Define partição de saída
    processing_date = datetime.now().strftime("%Y-%m-%d")
    output_dir = Path(gold_path) / processing_date / "PARQUET"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Escreve os arquivos na gold
    log.info("[Materialize] Iniciando escrita dos arquivos na camada gold.")

    dim_airline.write.mode("overwrite").parquet(str(output_dir / "dim_airline.parquet"))
    dim_airport.write.mode("overwrite").parquet(str(output_dir / "dim_airport.parquet"))
    dim_date.write.mode("overwrite").parquet(str(output_dir / "dim_date.parquet"))
    fato_flights.write.mode("overwrite").parquet(str(output_dir / "fato_flights.parquet"))

    log.info("[Materialize] Escrita concluída com sucesso.")

except Exception as e:
    log.exception(f"[Materialize][ERROR] Falha na construção do esquema estrela: {e}.")
    raise

finally:
    log.info("[Materialize] Job de materialização da gold encerrado.")
    

In [11]:
%%script false --no-raise-error # Comentar essa linha se estiver em debug ou se quiser rodar a célula.

df_show = {
    "dim_airline": dim_airline,
    "dim_airport": dim_airport,
    "dim_date": dim_date,
    "fato_flights": fato_flights
}

for name, d in df_show.items():
    print(f"\n{name}\n")
    d.printSchema()
    d.limit(1).show(truncate=True)


In [12]:
# Encerrando a sessão do Spark.
spark.stop()
log.info("[Materialize] Sessão Spark finalizada.")


2025-11-17 01:44:44 [INFO] build_gold_star_schema | [Materialize] Sessão Spark finalizada.


## Job 3: load_gold_to_postgres_job

Este job executa a carga das tabelas da camada **Gold** no PostgreSQL (schema `gold`).


In [None]:
log = get_logger("load_gold_to_postgres")

spark = get_spark_session("LoadGoldToPostgres")
log.info("[LoadGold] SparkSession iniciada.")


### Runner para o job `load_gold_to_postgres_job`

In [None]:
try:
    log.info("[LoadGold] Iniciando job de carga da gold.")
    
    # Resolve partição e caminhos
    partition = find_partition(gold_path, mode=run_mode, date_str=run_date)
    partition_dir = Path(gold_path) / partition / "PARQUET"

    if not partition_dir.exists():
        raise FileNotFoundError(f"[LoadGold][ERROR] Diretório de partição não encontrado: {partition_dir}")

    log.info(f"[LoadGold] Partição selecionada: {partition_dir}.")

    # Lê os parquets em memória
    tables = {
        "dim_airline": spark.read.parquet(str(partition_dir / "dim_airline.parquet")),
        "dim_airport": spark.read.parquet(str(partition_dir / "dim_airport.parquet")),
        "dim_date": spark.read.parquet(str(partition_dir / "dim_date.parquet")),
        "fato_flights": spark.read.parquet(str(partition_dir / "fato_flights.parquet")),
    }

    # Carga no PostgreSQL e validação
    for table_name, df in tables.items():
        full_table_name = f"gold.{table_name}"

        log.info(f"[LoadGold] Carregando tabela: {full_table_name}.")
        expected_count = df.count()

        # Carga no PostgreSQL
        load_to_postgres(
            df=df,
            db_conn_id=postgres_conn_id,
            table_name=full_table_name,
            mode="overwrite"
        )

        log.info(f"[LoadGold] Tabela '{full_table_name}' carregada. Validando integridade.")

        # Validação (fallback se falhar)
        try:
            assert_table_rowcount(
                db_conn_id=postgres_conn_id,
                table_name=full_table_name,
                expected_count=expected_count,
            )
        except Exception as e:
            log.error(f"[LoadGold][ERROR] Validação falhou para '{full_table_name}'. Limpando tabela.")

            import psycopg2

            with psycopg2.connect(
                host=os.getenv("DB_HOST", "localhost"),
                dbname=os.getenv("DB_NAME", "postgres"),
                user=os.getenv("DB_USER", "postgres"),
                password=os.getenv("DB_PASSWORD", "postgres"),
            ) as conn_pg:
                with conn_pg.cursor() as cur:
                    cur.execute(f"TRUNCATE TABLE {full_table_name} CASCADE;")
                    conn_pg.commit()

            raise ValueError(f"[LoadGold][ERROR] Falha na validação da tabela '{full_table_name}'.") from e

        log.info(f"[LoadGold] Validação concluída com sucesso: {full_table_name}.")

    log.info("[LoadGold] Carga de todas as tabelas concluída com sucesso.")

except Exception as e:
    log.exception(f"[LoadGold][ERROR] Falha durante carga no PostgreSQL: {e}")
    raise

finally:
    log.info("[LoadGold] Job de carga da gold encerrado.")


In [19]:
#%%script false --no-raise-error # Comentar essa linha se estiver em debug ou se quiser rodar a célula.
# Vareficação manual da carga das tabelas
import os

jdbc_url = f"jdbc:postgresql://{os.getenv('DB_HOST', 'localhost')}:{os.getenv('DB_PORT', '5432')}/{os.getenv('DB_NAME', 'postgres')}"
connection_properties = {
    "user": os.getenv("DB_USER", "postgres"),
    "password": os.getenv("DB_PASSWORD", "postgres"),
    "driver": "org.postgresql.Driver",
}

tables_to_check = ["dim_airline", "dim_airport", "dim_date"]
for tbl in tables_to_check:
    print(f"\n gold.{tbl} \n")
    df_check = spark.read.jdbc(url=jdbc_url, table=f"gold.{tbl}", properties=connection_properties)
    df_check.limit(1).show(truncate=True)



 gold.dim_airline 

+----------+-----------------+--------------------+
|airline_id|airline_iata_code|        airline_name|
+----------+-----------------+--------------------+
|         0|               EV|Atlantic Southeas...|
+----------+-----------------+--------------------+


 gold.dim_airport 

+----------+-----------------+--------------------+----------+----------+-----------+--------+----------+
|airport_id|airport_iata_code|        airport_name|state_code|state_name|  city_name|latitude| longitude|
+----------+-----------------+--------------------+----------+----------+-----------+--------+----------+
|         0|              LAX|Los Angeles Inter...|        CA|      NULL|Los Angeles|33.94254|-118.40807|
+----------+-----------------+--------------------+----------+----------+-----------+--------+----------+


 gold.dim_date 

+----------+----+-----+---+-----------+-------+----------+
| full_date|year|month|day|day_of_week|quarter|is_holiday|
+----------+----+-----+---+---

In [26]:
# Encerrando a sessão do Spark.
spark.stop()
log.info("[LoadGold] Sessão Spark finalizada.")


2025-11-17 02:35:04 [INFO] load_gold_to_postgres | [LoadGold] Sessão Spark finalizada.
INFO:load_gold_to_postgres:[LoadGold] Sessão Spark finalizada.


## Job 4: cleanup_gold

Este job realiza a limpeza dos arquivos temporários presentes na camada gold, removendo o dataset intermediário `flights_aggregated.parquet` gerado na transição, mantendo a organização da camada **Gold**.


In [3]:
log = get_logger("cleanup_gold")

spark = get_spark_session("GoldCleanup")
log.info("[GoldCleanup] SparkSession iniciada.")


2025-11-17 02:35:56 [INFO] cleanup_gold | [INFO] Logger inicializado no modo standalone (INFO).
/usr/local/lib/python3.12/site-packages/pyspark/bin/load-spark-env.sh: line 68: ps: command not found


:: loading settings :: url = jar:file:/usr/local/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.postgresql#postgresql added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ba35de13-648a-466e-a1a2-ec0688084796;1.0
	confs: [default]
	found org.postgresql#postgresql;42.7.3 in central
	found org.checkerframework#checker-qual;3.42.0 in central
:: resolution report :: resolve 152ms :: artifacts dl 5ms
	:: modules in use:
	org.checkerframework#checker-qual;3.42.0 from central in [default]
	org.postgresql#postgresql;42.7.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	---------------------------------------------------------

### Runner para o job `cleanup_gold`

In [5]:
try:
    log.info("[GoldCleanup] Iniciando job de limpeza da gold.")

    # Resolve partição e caminhos
    partition = find_partition(gold_path, mode=run_mode, date_str=run_date)
    partition_dir = Path(gold_path) / partition / "PARQUET"
    target_file = partition_dir / "flights_aggregated.parquet"

    # Verifica se o arquivo temporário existe e limpando
    if not target_file.exists():
        log.warning(f"[GoldCleanup][WARN] Arquivo temporário não encontrado: {target_file}.")
    else:
        log.info(f"[GoldCleanup] Removendo arquivo: {target_file}.")
        delete_files(spark, [str(target_file)])

    log.info("[GoldCleanup] Limpeza concluída com sucesso.")

except Exception as e:
    log.exception(f"[Serving][Cleanup][ERROR] Falha durante limpeza da camada gold: {e}.")
    raise
finally:
    log.info("[GoldCleanup] Job de limpeza da gold encerrado.")


2025-11-17 02:39:57 [INFO] cleanup_gold | [GoldCleanup] Iniciando job de limpeza da gold.
2025-11-17 02:39:57 [INFO] file_io | [INFO] Partição selecionada: 2025-11-17
2025-11-17 02:39:57 [INFO] cleanup_gold | [GoldCleanup] Removendo arquivo: /opt/airflow/data-layer/gold/2025-11-17/PARQUET/flights_aggregated.parquet.
2025-11-17 02:39:57 [INFO] file_io | [INFO] Deletando 1 arquivo(s).
2025-11-17 02:39:57 [INFO] file_io | [INFO] '/opt/airflow/data-layer/gold/2025-11-17/PARQUET/flights_aggregated.parquet' deletado com sucesso.
2025-11-17 02:39:57 [INFO] file_io | [INFO] Deleção concluída.
2025-11-17 02:39:57 [INFO] cleanup_gold | [GoldCleanup] Limpeza concluída com sucesso.
2025-11-17 02:39:57 [INFO] cleanup_gold | [GoldCleanup] Job de limpeza da gold encerrado.


In [6]:
%%script false --no-raise-error # Comentar essa linha se estiver em debug ou se quiser rodar a célula.

partition = find_partition(gold_path, mode=run_mode, date_str=run_date)
for item in (Path(gold_path) / partition / "PARQUET").iterdir():
    print(item)


2025-11-17 02:40:07 [INFO] file_io | [INFO] Partição selecionada: 2025-11-17


/opt/airflow/data-layer/gold/2025-11-17/PARQUET/dim_airline.parquet
/opt/airflow/data-layer/gold/2025-11-17/PARQUET/dim_airport.parquet
/opt/airflow/data-layer/gold/2025-11-17/PARQUET/dim_date.parquet
/opt/airflow/data-layer/gold/2025-11-17/PARQUET/fato_flights.parquet


In [7]:
spark.stop()
log.info("[GoldCleanup] Sessão Spark finalizada.")


2025-11-17 02:40:14 [INFO] cleanup_gold | [GoldCleanup] Sessão Spark finalizada.
