# 03_refinement_flights_transform_job
---
Este notebook transforma e normaliza o dataset de voos (`flights`) da camada **Bronze** para a **Silver** (pré-join), gerando um arquivo intermediário a ser consumido na etapa de agregação.

In [1]:
# Parameters
# Run configuration consumed by the job cell of this notebook.
# NOTE(review): the "Parameters" header suggests these values are meant to be
# overridden by whatever executes the notebook — confirm with the orchestrator.

run_mode = "latest"  # forwarded to find_partition(...) as `mode`
run_date = None  # forwarded to find_partition(...) as `date_str`

bronze_path = "/opt/airflow/data-layer/bronze"  # root directory the flights input is read from
silver_path = "/opt/airflow/data-layer/silver"  # root directory the result is written to


In [2]:
from pathlib import Path
from pyspark.sql import DataFrame, functions as F
from pyspark.sql.column import Column
from pyspark.sql.types import DoubleType
from pyspark import StorageLevel
from transformer.utils.spark_helpers import get_spark_session
from transformer.utils.file_io import find_partition
from transformer.validation.quality_gates_silver_flights import run_quality_gates_silver_flights
from transformer.utils.logger import get_logger
from transformer.utils.helpers import to_date_from_ymd

# Logger used by every cell of this notebook.
log = get_logger("refinement.flights_silver")

# Spark session obtained through the project helper.
spark = get_spark_session("RefinementFlightsSilver")
log.info("[Refinement][Flights] Sessão Spark iniciada.")

# Performance settings: turn on adaptive query execution and use 32 shuffle
# partitions for this session.
spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set("spark.sql.shuffle.partitions", "32")


2025-11-13 22:53:08 [INFO] spark_helpers | [INFO] Logger inicializado no modo standalone (INFO).
2025-11-13 22:53:08 [INFO] file_io | [INFO] Logger inicializado no modo standalone (INFO).
2025-11-13 22:53:08 [INFO] quality_gates.silver_flights | [INFO] Logger inicializado no modo standalone (INFO).
2025-11-13 22:53:08 [INFO] utils.generic_helpers | [INFO] Logger inicializado no modo standalone (INFO).
2025-11-13 22:53:08 [INFO] refinement.flights_silver | [INFO] Logger inicializado no modo standalone (INFO).
/usr/local/lib/python3.12/site-packages/pyspark/bin/load-spark-env.sh: line 68: ps: command not found


:: loading settings :: url = jar:file:/usr/local/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.postgresql#postgresql added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-38c090a4-863e-4bfc-9343-dfeb603b1f7c;1.0
	confs: [default]
	found org.postgresql#postgresql;42.7.3 in central
	found org.checkerframework#checker-qual;3.42.0 in central
:: resolution report :: resolve 148ms :: artifacts dl 5ms
	:: modules in use:
	org.checkerframework#checker-qual;3.42.0 from central in [default]
	org.postgresql#postgresql;42.7.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	---------------------------------------------------------

In [3]:
def normalize_time_expr(col_name: str) -> F.Column:
    """
    Build an expression that renders a time column as a 4-character string.

    The column is cast to string, a trailing ".0" is removed, and the result
    is left-padded with "0" to a width of 4. The ``when`` has no ``otherwise``
    branch, so a null input produces a null output.
    """
    source = F.col(col_name)
    without_decimal = F.regexp_replace(source.cast("string"), r"\.0$", "")
    padded = F.lpad(without_decimal, 4, "0")
    return F.when(source.isNotNull(), padded)


def abs_min_diff(c1: str, c2: str) -> F.Column:
    """
    Build an expression for the absolute difference between two columns, divided by 60.

    Both columns are cast to ``long`` before subtracting.
    NOTE(review): presumably the inputs are timestamps, so the cast yields
    seconds and the result is in minutes — confirm against the callers.
    """
    first_as_long = F.col(c1).cast("long")
    second_as_long = F.col(c2).cast("long")
    return F.abs(first_as_long - second_as_long) / 60.0

def transform_flights(df: DataFrame) -> DataFrame:
    """
    Transform the raw flights DataFrame (pre-join) for the Silver layer.

    Steps, in order: validate required columns, drop diverted/cancelled
    flights, lower-case all column names, build ``flight_date``, convert the
    time columns to timestamps, undo swapped departure/arrival times, cast
    numeric columns to double (delay-cause columns default to 0.0), move
    overnight arrivals to the next day, filter out incomplete or inconsistent
    rows, and drop the columns that are no longer needed.

    Args:
        df (DataFrame): Raw DataFrame read from the Bronze layer. Required
            column names are checked exactly as written (upper case).

    Returns:
        DataFrame: Transformed DataFrame.

    Raises:
        KeyError: If any column that the transformation references
            unconditionally is missing from ``df``.
    """
    log.info("[Refinement][Flights] Iniciando transformações.")

    # Validate every column that is referenced unconditionally below, so a
    # missing input fails here with a clear message instead of surfacing later
    # as a Spark analysis error in the middle of the pipeline.
    required = {
        "YEAR", "MONTH", "DAY", "AIRLINE", "FLIGHT_NUMBER",
        "DIVERTED", "CANCELLED",
        "SCHEDULED_DEPARTURE", "DEPARTURE_TIME",
        "SCHEDULED_ARRIVAL", "ARRIVAL_TIME",
        "ORIGIN_AIRPORT", "DESTINATION_AIRPORT",
    }
    missing = required - set(df.columns)
    if missing:
        raise KeyError(f"[Refinement][Flights][ERROR] Colunas faltando: {missing}.")

    # Initial filter and column-name standardisation.
    # Rows whose DIVERTED or CANCELLED flag is null are dropped as well, because
    # the comparison evaluates to null and filter() keeps only true rows.
    df2 = (
        df.filter((F.col("DIVERTED") != 1) & (F.col("CANCELLED") != 1))
          .toDF(*[c.lower() for c in df.columns])
          .withColumn("flight_date", to_date_from_ymd(F.col("year"), F.col("month"), F.col("day")))
          .withColumnRenamed("year", "flight_year")
          .withColumnRenamed("month", "flight_month")
          .withColumnRenamed("day", "flight_day")
          .withColumnRenamed("day_of_week", "flight_day_of_week")
    )

    # Time normalisation: each time column is replaced in place by a timestamp
    # built from flight_date plus the 4-character time string (the temporary
    # "<col>_str" column is dropped, so no duplicate columns remain).
    # NOTE(review): a value such as "2400" does not match the "HHmm" pattern;
    # depending on the session's ANSI setting that yields null or an error —
    # confirm whether the source data contains such values.
    time_cols = [
        "scheduled_departure", "departure_time",
        "scheduled_arrival", "arrival_time",
        "wheels_off", "wheels_on",
    ]

    for col_name in time_cols:
        if col_name in df2.columns:
            tmp = f"{col_name}_str"
            df2 = (
                df2.withColumn(tmp, normalize_time_expr(col_name))
                    .withColumn(
                        col_name,
                        F.to_timestamp(
                            F.concat_ws(" ", F.col("flight_date").cast("string"), F.col(tmp)),
                            "yyyy-MM-dd HHmm"
                        )
                    )
                    .drop(tmp)
            )

    # Swapped-time detection (chained withColumn calls; withColumns is not used).
    # A row is considered swapped when the actual departure is closer to the
    # scheduled arrival than to the scheduled departure AND the actual arrival
    # is closer to the scheduled departure than to the scheduled arrival; in
    # that case departure_time and arrival_time are exchanged.
    df2 = (
        df2
        .withColumn("diff_dep_sched_arr", abs_min_diff("departure_time", "scheduled_arrival"))
        .withColumn("diff_dep_sched_dep", abs_min_diff("departure_time", "scheduled_departure"))
        .withColumn("diff_arr_sched_dep", abs_min_diff("arrival_time", "scheduled_departure"))
        .withColumn("diff_arr_sched_arr", abs_min_diff("arrival_time", "scheduled_arrival"))
        .withColumn(
            "is_swapped",
            (F.col("diff_dep_sched_arr") < F.col("diff_dep_sched_dep")) &
            (F.col("diff_arr_sched_dep") < F.col("diff_arr_sched_arr"))
        )
        .withColumn(
            "departure_time_tmp",
            F.when(F.col("is_swapped"), F.col("arrival_time")).otherwise(F.col("departure_time"))
        )
        .withColumn(
            "arrival_time_tmp",
            F.when(F.col("is_swapped"), F.col("departure_time")).otherwise(F.col("arrival_time"))
        )
        .drop("departure_time", "arrival_time")
        .withColumnRenamed("departure_time_tmp", "departure_time")
        .withColumnRenamed("arrival_time_tmp", "arrival_time")
        .drop(
            "diff_dep_sched_arr", "diff_dep_sched_dep",
            "diff_arr_sched_dep", "diff_arr_sched_arr", "is_swapped"
        )
    )

    # Numeric conversion and null replacement: every listed column that exists
    # is cast to double; the delay-cause columns additionally get nulls
    # replaced by 0.0.
    numeric_cols = [
        "departure_delay", "arrival_delay", "taxi_out", "taxi_in",
        "air_time", "elapsed_time", "scheduled_time", "distance",
        "air_system_delay", "security_delay", "airline_delay",
        "late_aircraft_delay", "weather_delay",
    ]
    delay_cols = [
        "air_system_delay", "security_delay", "airline_delay",
        "late_aircraft_delay", "weather_delay",
    ]

    for c in numeric_cols:
        if c in df2.columns:
            expr = F.col(c).cast(DoubleType())
            if c in delay_cols:
                expr = F.coalesce(expr, F.lit(0.0))
            df2 = df2.withColumn(c, expr)

    # Overnight-flight adjustment: when the arrival hour-of-day is lower than
    # the departure hour-of-day, the flight is flagged as overnight and its
    # arrival timestamp is moved forward by one day.
    df2 = (
        df2.withColumn(
            "is_overnight_flight",
            F.when(
                (F.col("arrival_time").isNotNull()) &
                (F.col("departure_time").isNotNull()) &
                (F.hour(F.col("arrival_time")) < F.hour(F.col("departure_time"))),
                F.lit(True)
            ).otherwise(F.lit(False))
        )
        .withColumn(
            "arrival_time",
            F.when(F.col("is_overnight_flight"),
                   F.col("arrival_time") + F.expr("INTERVAL 1 DAY"))
             .otherwise(F.col("arrival_time"))
        )
    )

    # Final filters: keep rows with both timestamps present, arrival strictly
    # after departure, and airport codes that are not made only of digits.
    df2 = df2.filter(
        (F.col("departure_time").isNotNull()) &
        (F.col("arrival_time").isNotNull()) &
        (F.col("arrival_time") > F.col("departure_time")) &
        (~F.col("origin_airport").rlike("^[0-9]+$")) &
        (~F.col("destination_airport").rlike("^[0-9]+$"))
    )

    # Drop columns that are no longer needed after the initial filter.
    drop_cols = [c for c in ["diverted", "cancelled", "cancellation_reason"] if c in df2.columns]
    if drop_cols:
        df2 = df2.drop(*drop_cols)

    log.info("[Refinement][Flights] Transformação concluída.")

    return df2


In [4]:
# Job cell: resolve the partition, read the bronze flights and silver airports
# datasets, transform the flights, run the quality gates and write the result
# to the silver layer. Any failure is logged with its traceback and re-raised.
try:
    log.info("[Refinement][Flights] Iniciando job de trasnformação de 'flights'.")

    # Resolve the partition and derive the input/output paths from it.
    source_partition = find_partition(bronze_path, mode=run_mode, date_str=run_date)
    src = Path(bronze_path) / source_partition / "PARQUET" / "flights.parquet"
    dst_dir = Path(silver_path) / source_partition / "PARQUET"
    dst = dst_dir / "flights_pre_join.parquet"
    # The airports dataset is read from the silver output directory of the same partition.
    airports_src = dst_dir / "airports.parquet"

    # Fail fast when either input is missing.
    if not src.exists():
        raise FileNotFoundError(f"[Refinement][Flights][ERROR] Arquivo não encontrado: {src}.")
    if not airports_src.exists():
        raise FileNotFoundError(f"[Refinement][Flights][ERROR] Airports não encontrado na silver: {airports_src}.")

    # Read and transform.
    log.info(f"[Refinement][Flights] Lendo datasets: {src} e {airports_src}.")
    
    df = spark.read.parquet(str(src))
    airports_df = spark.read.parquet(str(airports_src))
    df_tf = transform_flights(df)
    
    # Quality gate: receives the transformed flights and the airports dataset.
    run_quality_gates_silver_flights(df_tf, airports_df)

    # Write the intermediate result (coalesced to a single partition, overwriting any previous output).
    dst_dir.mkdir(parents=True, exist_ok=True)
    df_tf.coalesce(1).write.mode("overwrite").parquet(str(dst))

    log.info(f"[Refinement][Flights] Dataset salvo na silver: {dst}.")

    # Release cache.
    # NOTE(review): no persist()/cache() call on df_tf is visible in this
    # notebook, so this only has an effect if the DataFrame is cached
    # elsewhere (e.g. inside the quality gates) — confirm.
    df_tf.unpersist()

except Exception as e:
    # Log the full traceback, then propagate so the caller sees the failure.
    log.exception(f"[Refinement][Flights][ERROR] Falha na execução do job: {e}.")
    raise
finally:
    # Always emitted, on success and on failure.
    log.info("[Refinement][Flights] Fim do job de transformação de 'flights'.")


2025-11-13 22:53:39 [INFO] refinement.flights_silver | [Refinement][Flights] Iniciando job de trasnformação de 'flights'.
2025-11-13 22:53:39 [INFO] file_io | [INFO] Partição selecionada: 2025-11-12
2025-11-13 22:53:39 [INFO] refinement.flights_silver | [Refinement][Flights] Lendo datasets: /opt/airflow/data-layer/bronze/2025-11-12/PARQUET/flights.parquet e /opt/airflow/data-layer/silver/2025-11-12/PARQUET/airports.parquet.
2025-11-13 22:53:42 [INFO] refinement.flights_silver | [Refinement][Flights] Iniciando transformações.
2025-11-13 22:53:42 [INFO] utils.generic_helpers | [INFO] Construindo coluna de data a partir de YEAR, MONTH e DAY.
2025-11-13 22:53:44 [INFO] refinement.flights_silver | [Refinement][Flights] Transformação concluída.
2025-11-13 22:53:44 [INFO] quality_gates.silver_flights | [Quality] Iniciando validações da camada silver.


root
 |-- flight_year: string (nullable = true)
 |-- flight_month: string (nullable = true)
 |-- flight_day: string (nullable = true)
 |-- flight_day_of_week: string (nullable = true)
 |-- airline: string (nullable = true)
 |-- flight_number: string (nullable = true)
 |-- tail_number: string (nullable = true)
 |-- origin_airport: string (nullable = true)
 |-- destination_airport: string (nullable = true)
 |-- scheduled_departure: timestamp (nullable = true)
 |-- departure_delay: double (nullable = true)
 |-- taxi_out: double (nullable = true)
 |-- wheels_off: timestamp (nullable = true)
 |-- scheduled_time: double (nullable = true)
 |-- elapsed_time: double (nullable = true)
 |-- air_time: double (nullable = true)
 |-- distance: double (nullable = true)
 |-- wheels_on: timestamp (nullable = true)
 |-- taxi_in: double (nullable = true)
 |-- scheduled_arrival: timestamp (nullable = true)
 |-- arrival_delay: double (nullable = true)
 |-- air_system_delay: double (nullable = false)
 |-- se

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/socket.py", line 720, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt
2025-11-13 22:53:46 [INFO] refinement.flights_silver | [Refinement][Flights] Fim do job de transformação de 'flights'.
INFO:refinement.flights_silver:[Refinement][Flights] Fim do job de transformação de 'flights'.


KeyboardInterrupt: 

25/11/13 22:53:47 ERROR CodeGenerator: failed to compile: org.codehaus.commons.compiler.InternalCompilerException: Compiling "GeneratedClass" in File 'generated.java', Line 1, Column 1: File 'generated.java', Line 75, Column 16: Compiling "processNext()"
org.codehaus.commons.compiler.InternalCompilerException: Compiling "GeneratedClass" in File 'generated.java', Line 1, Column 1: File 'generated.java', Line 75, Column 16: Compiling "processNext()"
	at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:402)
	at org.codehaus.janino.UnitCompiler.access$000(UnitCompiler.java:236)
	at org.codehaus.janino.UnitCompiler$2.visitCompilationUnit(UnitCompiler.java:363)
	at org.codehaus.janino.UnitCompiler$2.visitCompilationUnit(UnitCompiler.java:361)
	at org.codehaus.janino.Java$CompilationUnit.accept(Java.java:371)
	at org.codehaus.janino.UnitCompiler.compileUnit(UnitCompiler.java:361)
	at org.codehaus.janino.SimpleCompiler.cook(SimpleCompiler.java:264)
	at org.codehaus.janino.ClassBodyE

In [4]:
%%script false --no-raise-error # Comentar essa linha se estiver em debug ou se quiser rodar a célula.

# Debug preview: read back the written silver file, print its schema and show
# the first five rows.
path_df = Path(silver_path) / "2025-11-12" / "PARQUET" / "flights_pre_join.parquet" # Hard-coded partition date: check it before running

df = spark.read.parquet(str(path_df))

df.printSchema()

df.limit(5).show(truncate=True)


In [5]:
# Stop the Spark session and log that it was closed.
spark.stop()
log.info("[Refinement][Flights] Sessão Spark finalizada.")

2025-11-12 23:38:36 [INFO] refinement.flights_silver | [Refinement][Flights] Sessão Spark finalizada.
