# 04_refinement_silver_aggregate_job
---
Este notebook realiza a agregação da camada **Silver**, unindo os datasets já tratados de `flights`, `airlines` e `airports` em um único dataset consolidado `flights_aggregated.parquet`, conforme o `DDL` da camada.

In [4]:
# Parameters
# Run configuration for this notebook. All three values are consumed by the
# job cell further down when it locates the input partition.

# Passed to find_partition(...) as `mode`.
run_mode = "latest"
# Passed to find_partition(...) as `date_str`; left as None here.
# NOTE(review): presumably only relevant when run_mode selects a specific date — confirm in find_partition.
run_date = None

# Root directory of the Silver layer; used as find_partition's `base_path` and
# as the parent of "<partition>/PARQUET", where inputs are read and the output is written.
silver_path = "/opt/airflow/data-layer/silver"


In [5]:
from pathlib import Path
from pyspark.sql import DataFrame, functions as F
from pyspark import StorageLevel

from transformer.utils.spark_helpers import get_spark_session
from transformer.utils.file_io import find_partition
from transformer.validation.quality_gates_silver_aggregated import run_quality_gates_silver_aggregated
from transformer.utils.logger import get_logger

# Module-wide logger for this job.
log = get_logger("refinement.silver_aggregate")

# Spark session shared by every cell below.
spark = get_spark_session("RefinementSilverAggregate")
log.info("[Refinement][Aggregate] Sessão Spark iniciada.")

# Performance tuning: enable adaptive query execution and set the number of
# shuffle partitions used by the joins.
for conf_key, conf_value in (
    ("spark.sql.adaptive.enabled", "true"),
    ("spark.sql.shuffle.partitions", "32"),
):
    spark.conf.set(conf_key, conf_value)


2025-11-13 23:08:29 [INFO] refinement.silver_aggregate | [INFO] Logger inicializado no modo standalone (INFO).
2025-11-13 23:08:29 [INFO] spark_helpers | [INFO] SparkSession criada com sucesso: 'RefinementSilverAggregate' (master=local[*]).
2025-11-13 23:08:29 [INFO] refinement.silver_aggregate | [Refinement][Aggregate] Sessão Spark iniciada.


In [6]:
def _first_present_column(df: DataFrame, candidates, error_message: str) -> str:
    """Return the first name in ``candidates`` that is a column of ``df``.

    Args:
        df (DataFrame): DataFrame whose column names are inspected.
        candidates: Column names to look for, in priority order.
        error_message (str): Message for the KeyError raised when none match.

    Raises:
        KeyError: If none of the candidate names is a column of ``df``.
    """
    available = set(df.columns)
    for name in candidates:
        if name in available:
            return name
    raise KeyError(error_message)


def _airport_lookup(airports_silver_df: DataFrame, prefix: str) -> DataFrame:
    """Project the airports dataset into ``<prefix>_``-prefixed lookup columns."""
    return airports_silver_df.select(
        F.col("airport_iata_code").alias(f"{prefix}_airport_iata_code"),
        F.col("airport_name").alias(f"{prefix}_airport_name"),
        F.col("city").alias(f"{prefix}_city"),
        F.col("state").alias(f"{prefix}_state"),
        F.col("latitude").alias(f"{prefix}_latitude"),
        F.col("longitude").alias(f"{prefix}_longitude"),
    )


def create_aggregated_flights_df(
    flights_silver_df: DataFrame,
    airlines_silver_df: DataFrame,
    airports_silver_df: DataFrame,
) -> DataFrame:
    """
    Build the Silver-layer aggregated DataFrame (flights_aggregated) by
    left-joining flights with airlines and with airports (once for the origin
    and once for the destination).

    The flights dataset may name its join keys either with the final names
    (``airline_iata_code``, ``origin_airport_iata_code``,
    ``dest_airport_iata_code``) or with the alternative names (``airline``,
    ``origin_airport``, ``destination_airport``). Whichever is found is renamed
    to a private key name before joining, so the output key columns always come
    from the airlines/airports side and are never ambiguous. As a consequence,
    a flight whose key has no match in a lookup table gets NULL in the
    corresponding ``*_iata_code`` column.

    Args:
        flights_silver_df (DataFrame): Transformed flights dataset (pre-join).
        airlines_silver_df (DataFrame): Transformed airlines dataset; must
            expose ``airline_iata_code``.
        airports_silver_df (DataFrame): Transformed airports dataset; must
            expose ``airport_iata_code``, ``airport_name``, ``city``,
            ``state``, ``latitude`` and ``longitude``.

    Returns:
        DataFrame: Consolidated dataset with a generated ``flight_id`` followed
        by the output columns, each cast to its target type.

    Raises:
        KeyError: If the flights dataset has no recognizable airline, origin
            or destination column.
    """

    log.info("[Refinement][Aggregate] Iniciando agregação dos datasets Silver.")

    # Detect which naming convention the flights dataset uses for its keys.
    airline_col = _first_present_column(
        flights_silver_df,
        ("airline_iata_code", "airline"),
        "Nenhuma coluna de companhia aérea encontrada no dataset flights.",
    )
    origin_col = _first_present_column(
        flights_silver_df,
        ("origin_airport_iata_code", "origin_airport"),
        "Coluna de aeroporto de origem não encontrada.",
    )
    dest_col = _first_present_column(
        flights_silver_df,
        ("dest_airport_iata_code", "destination_airport"),
        "Coluna de aeroporto de destino não encontrada.",
    )

    # Rename the flights-side keys to private names. Without this, a flights
    # dataset that already uses the final key names would yield two columns
    # with the same name after the join, and any later by-name reference
    # (join condition, cast, select) would be ambiguous.
    airline_key = "__flight_airline_key"
    origin_key = "__flight_origin_key"
    dest_key = "__flight_dest_key"
    flights_keyed = (
        flights_silver_df
        .withColumnRenamed(airline_col, airline_key)
        .withColumnRenamed(origin_col, origin_key)
        .withColumnRenamed(dest_col, dest_key)
    )

    # Airport lookups, one per role, with role-prefixed column names.
    df_origin = _airport_lookup(airports_silver_df, "origin")
    df_dest = _airport_lookup(airports_silver_df, "dest")

    # Left joins keep every flight even when a lookup has no match.
    df_joined = (
        flights_keyed
        .join(
            airlines_silver_df,
            flights_keyed[airline_key] == airlines_silver_df["airline_iata_code"],
            how="left",
        )
        .join(
            df_origin,
            F.col(origin_key) == F.col("origin_airport_iata_code"),
            how="left",
        )
        .join(
            df_dest,
            F.col(dest_key) == F.col("dest_airport_iata_code"),
            how="left",
        )
    )

    # Output columns (in output order) and their target Spark types.
    # `flight_id` is not listed because it is generated below.
    # NOTE(review): the previous cast map also listed `is_overnight_flight`
    # (boolean), but the final projection never emitted it; it is still left
    # out here to keep the output unchanged — confirm against the DDL.
    output_schema = {
        "flight_year": "smallint",
        "flight_month": "smallint",
        "flight_day": "smallint",
        "flight_day_of_week": "smallint",
        "flight_date": "date",

        "airline_iata_code": "string",
        "airline_name": "string",

        "flight_number": "int",
        "tail_number": "string",

        "origin_airport_iata_code": "string",
        "origin_airport_name": "string",
        "origin_city": "string",
        "origin_state": "string",
        "origin_latitude": "double",
        "origin_longitude": "double",

        "dest_airport_iata_code": "string",
        "dest_airport_name": "string",
        "dest_city": "string",
        "dest_state": "string",
        "dest_latitude": "double",
        "dest_longitude": "double",

        "scheduled_departure": "timestamp",
        "departure_time": "timestamp",
        "scheduled_arrival": "timestamp",
        "arrival_time": "timestamp",
        "wheels_off": "timestamp",
        "wheels_on": "timestamp",

        "departure_delay": "double",
        "arrival_delay": "double",
        "taxi_out": "double",
        "taxi_in": "double",
        "air_time": "double",
        "elapsed_time": "double",
        "scheduled_time": "double",
        "distance": "double",

        "air_system_delay": "double",
        "security_delay": "double",
        "airline_delay": "double",
        "late_aircraft_delay": "double",
        "weather_delay": "double",
    }

    # Cast and project in a single select instead of chaining one withColumn
    # per column, which keeps the query plan small.
    # `flight_id` is a generated surrogate key: monotonically_increasing_id()
    # yields unique 64-bit values that are not consecutive and depend on how
    # the data is partitioned.
    final_df = df_joined.select(
        F.monotonically_increasing_id().alias("flight_id"),
        *[
            F.col(col_name).cast(spark_type).alias(col_name)
            for col_name, spark_type in output_schema.items()
        ],
    )

    log.info("[Refinement][Aggregate] Agregação concluída com sucesso.")

    return final_df


In [7]:
# Handle to the persisted DataFrame, kept outside the `try` so the `finally`
# clause can release the cache even when a later step fails.
persisted_df = None

try:
    log.info("[Refinement][Aggregate] Iniciando job de agregação da camada Silver.")

    # Locate the input partition.
    source_partition: str = find_partition(
        base_path=silver_path,
        mode=run_mode,
        date_str=run_date,
    )
    base_dir: Path = Path(silver_path) / source_partition / "PARQUET"

    flights_path: Path  = base_dir / "flights_pre_join.parquet"
    airlines_path: Path = base_dir / "airlines.parquet"
    airports_path: Path = base_dir / "airports.parquet"

    # Fail fast if any required input is missing.
    for required_file in [flights_path, airlines_path, airports_path]:
        if not required_file.exists():
            raise FileNotFoundError(
                f"[Refinement][Aggregate][ERROR] Arquivo esperado não encontrado: {required_file}"
            )

    # Read the input datasets.
    log.info("[Refinement][Aggregate] Lendo datasets Silver (flights, airlines, airports).")

    df_flights  = spark.read.parquet(str(flights_path))
    df_airlines = spark.read.parquet(str(airlines_path))
    df_airports = spark.read.parquet(str(airports_path))

    # Build the aggregated DataFrame.
    aggregated_df: DataFrame = create_aggregated_flights_df(
        flights_silver_df=df_flights,
        airlines_silver_df=df_airlines,
        airports_silver_df=df_airports,
    )

    # Drop rows whose origin or destination airport code is missing.
    aggregated_df = aggregated_df.filter(
        F.col("origin_airport_iata_code").isNotNull()
        & F.col("dest_airport_iata_code").isNotNull()
    )

    # Persist before the first action. The quality gates and the write each
    # trigger a Spark action; without a cache every one of them would recompute
    # the three joins, and the generated `flight_id` values would be
    # re-evaluated each time, so the rows validated might not carry the same
    # ids as the rows written. MEMORY_AND_DISK spills to disk rather than
    # dropping blocks when memory is short.
    aggregated_df = aggregated_df.persist(StorageLevel.MEMORY_AND_DISK)
    persisted_df = aggregated_df

    # Quality gates.
    run_quality_gates_silver_aggregated(aggregated_df)

    # Write the final dataset as a single parquet part file.
    output_path: Path = base_dir / "flights_aggregated.parquet"
    aggregated_df.coalesce(1).write.mode("overwrite").parquet(str(output_path))

    log.info(f"[Refinement][Aggregate] Dataset agregado salvo em: {output_path}")

except Exception as e:
    log.exception(f"[Refinement][Aggregate][ERROR] Falha na execução do job: {e}")
    raise

finally:
    # Release the cache on success and on failure alike.
    if persisted_df is not None:
        persisted_df.unpersist()
    log.info("[Refinement][Aggregate] Job de agregação encerrado.")


2025-11-13 23:08:35 [INFO] refinement.silver_aggregate | [Refinement][Aggregate] Iniciando job de agregação da camada Silver.
2025-11-13 23:08:35 [INFO] file_io | [INFO] Partição selecionada: 2025-11-12
2025-11-13 23:08:35 [INFO] refinement.silver_aggregate | [Refinement][Aggregate] Lendo datasets Silver (flights, airlines, airports).
2025-11-13 23:08:38 [INFO] refinement.silver_aggregate | [Refinement][Aggregate] Iniciando agregação dos datasets Silver.
2025-11-13 23:08:39 [INFO] refinement.silver_aggregate | [Refinement][Aggregate] Agregação concluída com sucesso.
2025-11-13 23:08:39 [INFO] quality_gates.silver_aggregated | [Quality][Aggregate] Iniciando validações do dataset agregado.
2025-11-13 23:08:42 [INFO] quality_gates.silver_aggregated | [Quality][Aggregate]      _check_row_count_not_empty: OK
2025-11-13 23:08:51 [INFO] quality_gates.silver_aggregated | [Quality][Aggregate]     _check_unique_primary_key: OK
2025-11-13 23:08:59 [INFO] quality_gates.silver_aggregated | [Quality

In [7]:
%%script false --no-raise-error # Comentar essa linha se estiver em debug ou se quiser rodar a célula.
# Debug-only preview of the aggregated dataset.
# The `%%script false` cell magic on the first line sends this cell's body to
# the `false` command instead of the Python kernel, so nothing below runs;
# `--no-raise-error` keeps the non-zero exit status from being reported as an error.
# Comment out the first line to run the cell while debugging.

# Print the column names and types of the aggregated dataset.
aggregated_df.printSchema()

# Show a five-row sample.
aggregated_df.limit(5).show(truncate=True)


In [8]:
# Stop the Spark session created at the top of the notebook, then log the shutdown.
spark.stop()
log.info("[Refinement][Aggregate] Sessão Spark finalizada.")


2025-11-13 23:10:20 [INFO] refinement.silver_aggregate | [Refinement][Aggregate] Sessão Spark finalizada.
