# [DEBUG MODE] ETL_LANDING_STAGE_TO_BRONZE

In [1]:
# Placeholder value for the stage path.
# NOTE(review): the configuration cell further down reassigns stage_path from
# DATA_LAYER_STAGE_PATH unconditionally, so this value is never used — confirm
# whether a value set here is meant to take precedence.
stage_path = None


In [15]:
# Project bootstrap.
# NOTE(review): the recorded output suggests init_project() makes sure the
# project root is on sys.path and returns it — confirm in
# transformer.utils.project_context.
from transformer.utils.project_context import init_project
init_project()

[INFO] Project root already in sys.path: /home/jovyan


PosixPath('/home/jovyan')

In [2]:
import os
from datetime import datetime
from pathlib import Path
from transformer.utils.spark_helpers import get_spark_session, save_df_as_parquet_file
from transformer.utils.file_io import check_files_in_folder, move_files, delete_files
from transformer.landing.reassemble_chunks import reassemble_chunks


In [3]:
# Runtime configuration: each value comes from the environment and falls back
# to the default shown here when the variable is not set.
stage_path = os.environ.get("DATA_LAYER_STAGE_PATH", "/opt/airflow/data-layer/stage")
bronze_path = os.environ.get("DATA_LAYER_BRONZE_PATH", "/opt/airflow/data-layer/bronze")
spark_cores = os.environ.get("SPARK_CORES", "*")

# Echo the resolved configuration, one setting per line.
print(
    f"Stage path: {stage_path}",
    f"Bronze path: {bronze_path}",
    f"Spark cores: {spark_cores}",
    sep="\n",
)


Stage path: /opt/airflow/data-layer/stage
Bronze path: /opt/airflow/data-layer/bronze
Spark cores: *


# Inicializando a sessão do Spark e criando arquivo/pasta de log

In [4]:
# Folder that collects the debug logs written by this notebook; it is created
# (including missing parents) when it does not exist yet.
LOG_DIR = Path("/opt/airflow/data-layer/logs/debug/landing")
LOG_DIR.mkdir(parents=True, exist_ok=True)

# One log file per execution of this cell, named after the current timestamp.
LOG_FILE = LOG_DIR.joinpath(f"landing_debug_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")

def log(msg: str, log_file=None):
    """Print a timestamped message and append the same line to a log file.

    Args:
        msg: Text to record. It is prefixed with the current time as
            ``[HH:MM:SS]``.
        log_file: Optional path of the file to append to. When omitted, the
            notebook-level ``LOG_FILE`` is used.
    """
    timestamp = datetime.now().strftime("%H:%M:%S")
    formatted = f"[{timestamp}] {msg}"
    print(formatted)
    target = LOG_FILE if log_file is None else log_file
    # Explicit UTF-8: the messages contain accented characters and the default
    # text encoding of open() depends on the platform locale.
    with open(target, "a", encoding="utf-8") as f:
        f.write(formatted + "\n")

# Extra Spark setting: use Hadoop's RawLocalFileSystem for the file:// scheme.
# NOTE(review): presumably chosen to avoid checksum side files on local
# writes — confirm.
spark_conf = {
    "spark.hadoop.fs.file.impl": "org.apache.hadoop.fs.RawLocalFileSystem",
}

# Local-mode session; the number of cores comes from the configuration cell.
spark = get_spark_session(
    "LandingDebugManual",
    master=f"local[{spark_cores}]",
    additional_configs=spark_conf,
)

log("Spark session iniciada.")


:: loading settings :: url = jar:file:/usr/local/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /root/.ivy2.5.2/cache
The jars for the packages stored in: /root/.ivy2.5.2/jars
org.postgresql#postgresql added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6bc83f26-245e-4b37-a6ab-59ea041531e0;1.0
	confs: [default]
	found org.postgresql#postgresql;42.7.3 in central
	found org.checkerframework#checker-qual;3.42.0 in central
downloading https://repo1.maven.org/maven2/org/postgresql/postgresql/42.7.3/postgresql-42.7.3.jar ...
	[SUCCESSFUL ] org.postgresql#postgresql;42.7.3!postgresql.jar (249ms)
downloading https://repo1.maven.org/maven2/org/checkerframework/checker-qual/3.42.0/checker-qual-3.42.0.jar ...
	[SUCCESSFUL ] org.checkerframework#checker-qual;3.42.0!checker-qual.jar (144ms)
:: resolution report :: resolve 4020ms :: artifacts dl 400ms
	:: modules in use:
	org.checkerf

[02:30:40] Spark session iniciada.


# Task 1 - check_stage_files

In [5]:
# Task 1: list the CSV files currently sitting in the stage folder.
try:
    log(f"[INFO] Verificando arquivos csv na stage: {stage_path}")
    files = check_files_in_folder(stage_path, "*.csv")
    log(f"[INFO] {len(files)} arquivo(s) encontrados.")
    for f in files:
        log(f"  - {f}")
except Exception as e:
    log(f"[ERRO] Falha ao listar arquivos: {e}")
    # Reset the listing so the following tasks never see an undefined name
    # (fresh kernel) or a stale list from a previous run of this cell.
    files = []


[02:30:43] [INFO] Verificando arquivos csv na stage: /opt/airflow/data-layer/stage
[02:30:43] [INFO] 12 arquivo(s) encontrados.
[02:30:43]   - /opt/airflow/data-layer/stage/airlines.csv
[02:30:43]   - /opt/airflow/data-layer/stage/airports.csv
[02:30:43]   - /opt/airflow/data-layer/stage/flights_part_01.csv
[02:30:43]   - /opt/airflow/data-layer/stage/flights_part_02.csv
[02:30:43]   - /opt/airflow/data-layer/stage/flights_part_03.csv
[02:30:43]   - /opt/airflow/data-layer/stage/flights_part_04.csv
[02:30:43]   - /opt/airflow/data-layer/stage/flights_part_05.csv
[02:30:43]   - /opt/airflow/data-layer/stage/flights_part_06.csv
[02:30:43]   - /opt/airflow/data-layer/stage/flights_part_07.csv
[02:30:43]   - /opt/airflow/data-layer/stage/flights_part_08.csv
[02:30:43]   - /opt/airflow/data-layer/stage/flights_part_09.csv
[02:30:43]   - /opt/airflow/data-layer/stage/flights_part_10.csv


# Task 2 - unify_flight_chunks

In [6]:
# Task 2: merge the flight chunk CSVs into a single parquet file in the stage.
try:
    # Chunk files are recognised by the "flights_part" marker in their path.
    chunks = [path for path in files if "flights_part" in path]
    if chunks:
        log(f"[INFO] Iniciando unificação de {len(chunks)} chunks.")
        flights_df = reassemble_chunks(spark, chunks)
        parquet_out = save_df_as_parquet_file(
            flights_df,
            stage_path,
            "flights.parquet",
            single_file=True,
        )
        log(f"[SUCCESS] Arquivo unificado salvo em: {parquet_out}")
    else:
        log("[WARN] Nenhum chunk de voo encontrado.")
        flights_df = None
except Exception as exc:
    log(f"[ERRO] Falha na unificação: {exc}.")
    # Leave an explicit "no data" marker for the cells that follow.
    flights_df = None


[02:30:49] [INFO] Iniciando unificação de 10 chunks.


25/10/31 02:30:52 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

[02:32:17] [SUCCESS] Arquivo unificado salvo em: /opt/airflow/data-layer/stage/flights.parquet


In [14]:
# flights_df is set to None when no chunk was found or when the unification
# failed, so guard the call instead of raising AttributeError.
if flights_df is not None:
    flights_df.printSchema()
else:
    log("[WARN] flights_df não disponível; nenhum schema para exibir.")

root
 |-- YEAR: string (nullable = true)
 |-- MONTH: string (nullable = true)
 |-- DAY: string (nullable = true)
 |-- DAY_OF_WEEK: string (nullable = true)
 |-- AIRLINE: string (nullable = true)
 |-- FLIGHT_NUMBER: string (nullable = true)
 |-- TAIL_NUMBER: string (nullable = true)
 |-- ORIGIN_AIRPORT: string (nullable = true)
 |-- DESTINATION_AIRPORT: string (nullable = true)
 |-- SCHEDULED_DEPARTURE: string (nullable = true)
 |-- DEPARTURE_TIME: string (nullable = true)
 |-- DEPARTURE_DELAY: string (nullable = true)
 |-- TAXI_OUT: string (nullable = true)
 |-- WHEELS_OFF: string (nullable = true)
 |-- SCHEDULED_TIME: string (nullable = true)
 |-- ELAPSED_TIME: string (nullable = true)
 |-- AIR_TIME: string (nullable = true)
 |-- DISTANCE: string (nullable = true)
 |-- WHEELS_ON: string (nullable = true)
 |-- TAXI_IN: string (nullable = true)
 |-- SCHEDULED_ARRIVAL: string (nullable = true)
 |-- ARRIVAL_TIME: string (nullable = true)
 |-- ARRIVAL_DELAY: string (nullable = true)
 |-- D

# Task 3 - convert_csv_to_parquet

In [9]:
# Task 3: convert every non-chunk CSV of the stage into a parquet file.
# Defined before the try block so the name exists even when the conversion
# fails before the first file is processed.
converted_files = []
try:
    other_csvs = [f for f in files if "flights_part" not in f]

    if not other_csvs:
        log("[WARN] Nenhum csv adicional encontrado.")
    else:
        for csv_file in other_csvs:
            log(f"[INFO] Convertendo {csv_file}.")
            df = spark.read.csv(csv_file, header=True, inferSchema=True)
            # Swap only the final extension; str.replace would also rewrite
            # any other ".csv" occurrence inside the file name.
            name = Path(csv_file).with_suffix(".parquet").name
            save_df_as_parquet_file(df, stage_path, name)
            converted_files.append(name)
            log(f"[INFO] Arquivo convertido: {name}.")
except Exception as e:
    log(f"[ERRO] Falha na conversão: {e}")


[02:34:48] [INFO] Convertendo /opt/airflow/data-layer/stage/airlines.csv.
[02:34:49] [INFO] Arquivo convertido: airlines.parquet.
[02:34:49] [INFO] Convertendo /opt/airflow/data-layer/stage/airports.csv.
[02:34:50] [INFO] Arquivo convertido: airports.parquet.


In [10]:
# Sanity check: show the schema and the first rows of one parquet in the stage.
try:
    # os.listdir returns entries in arbitrary order; sorting makes the
    # inspected file the same on every run.
    parquet_paths = sorted(
        os.path.join(stage_path, f)
        for f in os.listdir(stage_path)
        if f.endswith(".parquet")
    )
    if parquet_paths:
        log(f"[INFO] Visualizando um parquet convertido ({parquet_paths[0]}).")
        df_tmp = spark.read.parquet(parquet_paths[0])
        df_tmp.printSchema()
        df_tmp.show(5, truncate=False)
    else:
        log("[INFO] Nenhum arquivo parquet disponível para inspeção.")
except Exception as e:
    log(f"[ERRO] Falha ao ler Parquet: {e}")


[02:34:58] [INFO] Visualizando um parquet convertido (/opt/airflow/data-layer/stage/airlines.parquet).
root
 |-- IATA_CODE: string (nullable = true)
 |-- AIRLINE: string (nullable = true)

+---------+----------------------+
|IATA_CODE|AIRLINE               |
+---------+----------------------+
|UA       |United Air Lines Inc. |
|AA       |American Airlines Inc.|
|US       |US Airways Inc.       |
|F9       |Frontier Airlines Inc.|
|B6       |JetBlue Airways       |
+---------+----------------------+
only showing top 5 rows


# Task 4 - move_files_to_bronze

In [11]:
# Task 4: move every parquet found in the stage folder to the bronze layer.
try:
    parquet_files = [
        os.path.join(stage_path, entry)
        for entry in os.listdir(stage_path)
        if entry.endswith(".parquet")
    ]
    if parquet_files:
        # The current date (YYYY-MM-DD) is passed to move_files together with
        # the bronze path and is echoed in the log message.
        processing_date = datetime.now().strftime("%Y-%m-%d")
        move_files(spark, parquet_files, bronze_path, processing_date)
        log(f"[INFO] {len(parquet_files)} arquivo(s) movidos para bronze/{processing_date}.")
    else:
        log("[WARN] Nenhum arquivo parquet encontrado para mover.")
except Exception as exc:
    log(f"[ERRO] Falha ao mover arquivos: {exc}")


[02:35:09] [INFO] 3 arquivo(s) movidos para bronze/2025-10-31.


# Task 5 - cleanup_stage

In [12]:
# Task 5: remove whatever is left in the stage folder.
try:
    # Every entry directly under the stage folder is a deletion candidate.
    # NOTE(review): the earlier tasks catch and log their own errors, so this
    # cell still runs after a failure and may delete unprocessed source
    # files — confirm that is acceptable.
    stage_files = [os.path.join(stage_path, entry) for entry in os.listdir(stage_path)]
    if stage_files:
        delete_files(spark, stage_files)
        log(f"[INFO] {len(stage_files)} arquivo(s) removidos da stage.")
    else:
        log("[WARN] Nenhum arquivo para limpar.")
except Exception as exc:
    log(f"[ERRO] Falha ao limpar stage: {exc}.")


[02:35:29] [INFO] 12 arquivo(s) removidos da stage.


# Fechando sessão do spark

In [13]:
# Shut the Spark session down and report where this run's log was written.
try:
    spark.stop()
    # Both confirmations stay inside the try block, so a failure while
    # logging is reported through the same error path as a failed stop().
    log("[INFO] Sessão spark encerrada com sucesso.")
    log(f"[INFO] Logs salvos em: {LOG_FILE}.")
except Exception as exc:
    log(f"[ERRO] Falha ao encerrar Spark: {exc}.")


[02:35:37] [INFO] Sessão spark encerrada com sucesso.
[02:35:37] [INFO] Logs salvos em: /opt/airflow/data-layer/logs/debug/landing/landing_debug_20251031_023029.log.
