# 02_landing_unify_flight_chunks_job
---
Este notebook unifica os arquivos `flights_part_*.csv` presentes na camada **Stage**, consolidando-os em um único dataset Parquet, `flights.parquet` (gravado pelo Spark como um diretório contendo os arquivos de partição).


In [1]:
# Parameters
# Stage-layer directory: the job cell looks for the CSV chunks here and writes
# `flights.parquet` under the same path.
# NOTE(review): presumably overridden by the orchestrator at run time — confirm.
stage_path = "/opt/airflow/data-layer/stage"


In [2]:
import os
from pyspark.sql import SparkSession, DataFrame
from transformer.utils.spark_helpers import get_spark_session
from transformer.utils.file_io import check_files_in_folder
from transformer.utils.logger import get_logger
from transformer.validation.quality_gates_bronze import run_quality_gates_bronze

# Logger used by every cell of this notebook.
log = get_logger("landing.unify_chunks")

# Spark session shared by the following cells; it is stopped in the last cell.
# NOTE(review): the session configuration lives inside get_spark_session, which is
# not visible here — "UnifyFlightChunks" is presumably the application name.
spark = get_spark_session("UnifyFlightChunks")
log.info("[Landing][UnifyChunks] SparkSession iniciada.")


2025-11-12 00:01:40 [INFO] spark_helpers | [INFO] Logger inicializado no modo standalone (INFO).
2025-11-12 00:01:40 [INFO] file_io | [INFO] Logger inicializado no modo standalone (INFO).
2025-11-12 00:01:40 [INFO] quality_gates_bronze | [INFO] Logger inicializado no modo standalone (INFO).
2025-11-12 00:01:40 [INFO] landing.unify_chunks | [INFO] Logger inicializado no modo standalone (INFO).
/usr/local/lib/python3.12/site-packages/pyspark/bin/load-spark-env.sh: line 68: ps: command not found


:: loading settings :: url = jar:file:/usr/local/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.postgresql#postgresql added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f5ace125-591e-4dfe-b92e-dbc99124c9b0;1.0
	confs: [default]
	found org.postgresql#postgresql;42.7.3 in central
	found org.checkerframework#checker-qual;3.42.0 in central
:: resolution report :: resolve 123ms :: artifacts dl 5ms
	:: modules in use:
	org.checkerframework#checker-qual;3.42.0 from central in [default]
	org.postgresql#postgresql;42.7.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	---------------------------------------------------------

In [3]:
def reassemble_chunks(spark: SparkSession, chunk_files: list[str], header: bool = True) -> DataFrame:
    """
    Load a set of CSV chunk files into one Spark DataFrame.

    All paths are handed to a single ``spark.read.csv`` call, with schema
    inference turned off.

    Args:
        spark (SparkSession): Active Spark session used for the read.
        chunk_files (list[str]): Full paths of the CSV chunks to combine.
        header (bool, optional): Whether the CSV files carry a header row.
            Defaults to True.

    Returns:
        DataFrame: DataFrame produced by reading every chunk together.

    Raises:
        ValueError: If ``chunk_files`` is empty.
        Exception: Any error raised by the CSV read is logged and re-raised unchanged.
    """
    if not chunk_files:
        raise ValueError("[Landing][UnifyChunks][ERROR] Nenhum arquivo de chunk fornecido para unificação.")

    total_chunks = len(chunk_files)
    log.info(f"[Landing][UnifyChunks] Lendo e concatenando {total_chunks} arquivo(s) de chunk.")

    try:
        # Configure the reader first, then read every path in one call.
        reader = spark.read.option("header", header).option("inferSchema", False)
        unified = reader.csv(chunk_files)

        log.info("[Landing][UnifyChunks] Unificação dos chunks concluída com sucesso.")
        return unified
    except Exception as read_error:
        # Record the failure here, then let the caller decide what to do with it.
        log.error(f"[Landing][UnifyChunks][ERROR] Falha ao ler os arquivos csv: {read_error}.")
        raise


In [4]:
# Job body: find the flight chunk CSVs in the stage folder, read them into one
# DataFrame, run the quality gates and write the result as Parquet.
# Any failure is logged with its traceback and re-raised so the run is marked as failed.
try:
    log.info("[Landing][UnifyChunks] Iniciando job de unificação de chunks.")

    # List the CSV files in the stage folder and keep only the flight chunks.
    # The marker is matched against the file name alone, not the full path, so a
    # parent directory whose name contains "flights_part" cannot make unrelated
    # CSV files qualify as chunks.
    csv_files = check_files_in_folder(stage_path, "*.csv")
    chunk_files = [path for path in csv_files if "flights_part" in os.path.basename(path)]

    if not chunk_files:
        raise FileNotFoundError(f"[Landing][UnifyChunks][ERROR] Nenhum arquivo de chunk encontrado em {stage_path}.")

    # Read all chunks into a single DataFrame.
    df_unified = reassemble_chunks(spark, chunk_files)

    # Columns handed to the quality gates as the required schema of the unified dataset.
    required_columns = [
        'YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER', 'TAIL_NUMBER', 
        'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 
        'DEPARTURE_DELAY', 'TAXI_OUT', 'WHEELS_OFF', 'SCHEDULED_TIME', 'ELAPSED_TIME', 
        'AIR_TIME', 'DISTANCE', 'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 
        'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON', 'AIR_SYSTEM_DELAY', 
        'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY'
    ]
    # NOTE(review): the gates presumably raise when a validation fails, which would
    # stop the job before anything is written — confirm in quality_gates_bronze.
    run_quality_gates_bronze(df_unified, "flights_bronze", required_columns)

    # Write the unified data next to the chunks, replacing any previous output.
    stage_output = f"{stage_path}/flights.parquet"
    df_unified.write.mode("overwrite").option("compression", "snappy").parquet(stage_output)
    log.info(f"[Landing][UnifyChunks] Arquivo consolidado salvo em: {stage_output}.")
    
    log.info("[Landing][UnifyChunks] Job concluído com sucesso.")

except Exception as e:
    # log.exception records the traceback; re-raise so the notebook run fails.
    log.exception(f"[Landing][UnifyChunks][ERROR] Falha durante execução: {e}")
    raise


2025-11-12 00:01:49 [INFO] landing.unify_chunks | [Landing][UnifyChunks] Iniciando job de unificação de chunks.
2025-11-12 00:01:49 [INFO] file_io | [INFO] Encontrados 12 arquivo(s).
2025-11-12 00:01:49 [INFO] landing.unify_chunks | [Landing][UnifyChunks] Lendo e concatenando 10 arquivo(s) de chunk.
2025-11-12 00:01:52 [INFO] landing.unify_chunks | [Landing][UnifyChunks] Unificação dos chunks concluída com sucesso.
2025-11-12 00:01:52 [INFO] quality_gates_bronze | [Quality][Landing] Iniciando validações do dataset 'flights_bronze'.
2025-11-12 00:01:53 [INFO] quality_gates_bronze | [Quality][Landing] _check_row_count_not_empty para 'flights_bronze': OK.
2025-11-12 00:01:53 [INFO] quality_gates_bronze | [Quality][Landing] _check_schema_columns para 'flights_bronze': OK.
2025-11-12 00:01:53 [INFO] quality_gates_bronze | [Quality][Landing] Todas as validações para 'flights_bronze' concluídas com sucesso.
25/11/12 00:01:53 WARN SparkStringUtils: Truncated the string representation of a plan

In [5]:
# Stop the Spark session created at the top of the notebook, then log the shutdown.
spark.stop()
log.info("[Landing][UnifyChunks] Sessão Spark finalizada.")


2025-11-12 00:03:30 [INFO] landing.unify_chunks | [Landing][UnifyChunks] Sessão Spark finalizada.
