# 02_refinement_airports_transform_job
---
Este notebook realiza a transformação e validação do dataset de aeroportos (`airports`) da camada **Bronze** para a camada **Silver**.


In [1]:
# Parameters

# Partition selection: the job cell forwards these to find_partition as
# `mode` and `date_str` respectively.
run_mode = "latest"
run_date = None

# Root directories of the bronze (input) and silver (output) layers; the job
# cell appends "<partition>/PARQUET/airports.parquet" to each of them.
bronze_path = "/opt/airflow/data-layer/bronze"
silver_path = "/opt/airflow/data-layer/silver"


In [2]:
import os
from pathlib import Path
from pyspark.sql import DataFrame, functions as F
from pyspark.sql.types import DoubleType, StringType
from transformer.utils.spark_helpers import get_spark_session
from transformer.utils.file_io import find_partition
from transformer.validation.quality_gates_silver_base import run_quality_gates_silver_base
from transformer.utils.logger import get_logger

# Notebook-wide logger, created under the name "refinement.airports".
log = get_logger("refinement.airports")

# Spark session shared by the cells below for reading and writing parquet.
# NOTE(review): "RefinementAirports" is presumably the Spark application name —
# confirm against get_spark_session.
spark = get_spark_session("RefinementAirports")
log.info("[Refinement][Airports] Sessão Spark iniciada.")


2025-11-12 04:09:02 [INFO] spark_helpers | [INFO] Logger inicializado no modo standalone (INFO).
2025-11-12 04:09:02 [INFO] file_io | [INFO] Logger inicializado no modo standalone (INFO).
2025-11-12 04:09:02 [INFO] quality_gates_silver_base | [INFO] Logger inicializado no modo standalone (INFO).
2025-11-12 04:09:02 [INFO] refinement.airports | [INFO] Logger inicializado no modo standalone (INFO).
/usr/local/lib/python3.12/site-packages/pyspark/bin/load-spark-env.sh: line 68: ps: command not found


:: loading settings :: url = jar:file:/usr/local/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.postgresql#postgresql added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-7985301d-0705-49ab-88be-605c789204b7;1.0
	confs: [default]
	found org.postgresql#postgresql;42.7.3 in central
	found org.checkerframework#checker-qual;3.42.0 in central
:: resolution report :: resolve 92ms :: artifacts dl 7ms
	:: modules in use:
	org.checkerframework#checker-qual;3.42.0 from central in [default]
	org.postgresql#postgresql;42.7.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	----------------------------------------------------------

In [3]:
def transform_airports(df: DataFrame) -> DataFrame:
    """
    Clean and standardize the bronze airports DataFrame for the silver layer.

    Steps, in order:
      1. Fail fast if IATA_CODE, LATITUDE or LONGITUDE is absent.
      2. Rename IATA_CODE to airport_iata_code (cast to string) and cast
         LATITUDE / LONGITUDE to double.
      3. Drop the COUNTRY column when it is present.
      4. Overwrite the coordinates of the airports listed in ``corrections``.
      5. Rename the remaining known columns and lowercase every column name.

    Args:
        df (DataFrame): Raw DataFrame read from the bronze layer.

    Returns:
        DataFrame: A new DataFrame with silver-layer column names and types.

    Raises:
        KeyError: If any required column is missing from ``df``.
    """
    log.info("[Refinement][Airports] Iniciando transformações.")

    # Required input columns (checked case-sensitively against df.columns).
    required = {"IATA_CODE", "LATITUDE", "LONGITUDE"}
    missing = required - set(df.columns)
    if missing:
        raise KeyError(f"[Refinement][Airports][ERROR] Colunas faltando no dataset: {missing}.")

    # Manual coordinate overrides, keyed by IATA code.
    # The values are the published airport reference points in decimal degrees.
    # NOTE(review): the earlier literals did not match the airports' real
    # locations (e.g. PBG had a longitude of -68.04 and UST a latitude of
    # 42.07); confirm the values below against the project's authoritative
    # airport reference before release.
    corrections = {
        "ECP": {"LATITUDE": 30.3583, "LONGITUDE": -85.7956},
        "PBG": {"LATITUDE": 44.6509, "LONGITUDE": -73.4681},
        "UST": {"LATITUDE": 29.9592, "LONGITUDE": -81.3397},
    }

    # Rename the key column and enforce the types of the main columns.
    airports = (
        df.withColumnRenamed("IATA_CODE", "airport_iata_code")
          .withColumn("airport_iata_code", F.col("airport_iata_code").cast(StringType()))
          .withColumn("LATITUDE", F.col("LATITUDE").cast(DoubleType()))
          .withColumn("LONGITUDE", F.col("LONGITUDE").cast(DoubleType()))
    )

    # COUNTRY is not carried into the silver layer.
    if "COUNTRY" in airports.columns:
        airports = airports.drop("COUNTRY")

    # Build one conditional expression per coordinate instead of re-projecting
    # the DataFrame for every corrected airport. Rows whose code matches no
    # entry (or is null) keep their original value via the innermost fallback.
    iata_code = F.col("airport_iata_code")
    latitude_expr = F.col("LATITUDE")
    longitude_expr = F.col("LONGITUDE")
    for code, coords in corrections.items():
        latitude_expr = F.when(iata_code == code, F.lit(coords["LATITUDE"])).otherwise(latitude_expr)
        longitude_expr = F.when(iata_code == code, F.lit(coords["LONGITUDE"])).otherwise(longitude_expr)
    airports = (
        airports.withColumn("LATITUDE", latitude_expr)
                .withColumn("LONGITUDE", longitude_expr)
    )

    # Map source column names to their silver-layer names; columns that are
    # not present in the input are skipped.
    rename_map = {
        "AIRPORT": "airport_name",
        "CITY": "city",
        "STATE": "state",
        "LATITUDE": "latitude",
        "LONGITUDE": "longitude",
    }
    for old, new in rename_map.items():
        if old in airports.columns:
            airports = airports.withColumnRenamed(old, new)

    # Lowercase any column name that was not covered by rename_map.
    airports = airports.toDF(*[c.lower() for c in airports.columns])

    log.info("[Refinement][Airports] Transformação concluída com sucesso.")

    return airports


In [4]:
# Job driver: locate the bronze partition, transform the airports dataset,
# run the silver quality gates and write the result to the silver layer.
# Any failure is logged with its traceback and re-raised; the closing log line
# in `finally` is emitted on both success and failure.
# NOTE(review): the start/end log strings contain the misspelling
# 'trasnformação'; they are runtime text and are left untouched here.
try:
    log.info("[Refinement][Airports] Iniciando job de trasnformação de 'airports'.")

    # Resolve the input partition and derive the source and destination paths.
    # NOTE(review): find_partition presumably returns the partition directory
    # name (the captured log shows "2025-11-12") — confirm.
    source_partition = find_partition(bronze_path, mode=run_mode, date_str=run_date)
    src = Path(bronze_path) / source_partition / "PARQUET" / "airports.parquet"
    dst = Path(silver_path) / source_partition / "PARQUET" / "airports.parquet"

    # Fail early with a clear message when the bronze file is missing.
    if not src.exists():
        raise FileNotFoundError(f"[Refinement][Airports][ERROR] Arquivo não encontrado: {src}.")

    # Read the raw dataset and apply the transformations.
    log.info(f"[Refinement][Airports] Lendo dataset: {src}.")
    df = spark.read.parquet(str(src))
    df_tf = transform_airports(df)

    # Run the quality gates on the transformed DataFrame before writing.
    log.info("[Refinement][Airports] Executando quality gates.")

    # Columns the silver dataset must expose, and the column(s) passed as the
    # primary key.
    required_cols = ["airport_iata_code", "airport_name", "city", "state", "latitude", "longitude"]
    pk_cols = ["airport_iata_code"]

    # NOTE(review): presumably raises when a gate fails, which would skip the
    # write below — confirm in run_quality_gates_silver_base.
    run_quality_gates_silver_base(
        df=df_tf,
        name="airports_silver",
        required_columns=required_cols,
        pk_columns=pk_cols,
    )

    log.info("[Refinement][Airports] Quality gates concluídos com sucesso.")

    # Create the destination directory and write the result, replacing any
    # existing output at that path.
    dst.parent.mkdir(parents=True, exist_ok=True)
    df_tf.write.mode("overwrite").parquet(str(dst))

    log.info(f"[Refinement][Airports] Dataset salvo na camada silver: {dst}.")

except Exception as e:
    # Log the failure with its traceback, then propagate it to the caller.
    log.exception(f"[Refinement][Airports][ERROR] Falha na execução do job: {e}")
    raise
finally:
    log.info("[Refinement][Airports] Fim do job de trasnformação de 'airports'.")


2025-11-12 04:09:37 [INFO] refinement.airports | [Refinement][Airports] Iniciando job de trasnformação de 'airports'.
2025-11-12 04:09:37 [INFO] file_io | [INFO] Partição selecionada: 2025-11-12
2025-11-12 04:09:37 [INFO] refinement.airports | [Refinement][Airports] Lendo dataset: /opt/airflow/data-layer/bronze/2025-11-12/PARQUET/airports.parquet.
2025-11-12 04:09:39 [INFO] refinement.airports | [Refinement][Airports] Iniciando transformações.
2025-11-12 04:09:39 [INFO] refinement.airports | [Refinement][Airports] Transformação concluída com sucesso.
2025-11-12 04:09:39 [INFO] refinement.airports | [Refinement][Airports] Executando quality gates.
2025-11-12 04:09:39 [INFO] quality_gates_silver_base | [Quality][Refinement] Iniciando validações do dataset 'airports_silver'.
2025-11-12 04:09:41 [INFO] quality_gates_silver_base | [Quality][Refinement] airports_silver: dataset não vazio OK.
2025-11-12 04:09:41 [INFO] quality_gates_silver_base | [Quality][Refinement] airports_silver: schema 

In [9]:
%%script false --no-raise-error # Comentar essa linha se estiver em debug ou se quiser rodar a célula.

# Debug inspection: print the schema of the transformed DataFrame.
df_tf.printSchema()

# Show the first five rows without truncating long cell values.
df_tf.limit(5).show(truncate=False)


root
 |-- airport_iata_code: string (nullable = true)
 |-- airport_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)

+-----------------+-----------------------------------+-----------+-----+--------+----------+
|airport_iata_code|airport_name                       |city       |state|latitude|longitude |
+-----------------+-----------------------------------+-----------+-----+--------+----------+
|ABE              |Lehigh Valley International Airport|Allentown  |PA   |40.65236|-75.4404  |
|ABI              |Abilene Regional Airport           |Abilene    |TX   |32.41132|-99.6819  |
|ABQ              |Albuquerque International Sunport  |Albuquerque|NM   |35.04022|-106.60919|
|ABR              |Aberdeen Regional Airport          |Aberdeen   |SD   |45.44906|-98.42183 |
|ABY              |Southwest Georgia Regional Airport |Albany     |GA   |31.53552|-84.194

In [5]:
# Stop the Spark session now that the job has finished.
spark.stop()
# The log tag names this notebook's dataset (Airports); it previously carried
# the Airlines tag.
log.info("[Refinement][Airports] Sessão Spark finalizada.")

2025-11-12 04:09:50 [INFO] refinement.airports | [Refinement][Airlines] Sessão Spark finalizada.
