# 02_serving_load_gold_to_postgres_job
---
Este job executa a carga das tabelas da camada **Gold** no PostgreSQL (schema `gold`).


In [2]:
# Parameters
# Job inputs, kept as plain top-level assignments. All four values are passed to
# run_gold_load_job() by the execution cell further down.
# NOTE(review): the "# Parameters" header suggests a papermill-style injected cell — confirm.

# Partition selection: forwarded to find_partition() as `mode` and `date_str`.
run_mode = "latest"
run_date = None

# Base directory of the Gold layer; the job reads from <gold_path>/<partition>/PARQUET.
gold_path = "/opt/airflow/data-layer/gold"
# Forwarded to load_to_postgres() as `db_conn_id`.
# NOTE(review): the value looks like an environment-variable name rather than a
# connection id — confirm how load_to_postgres resolves it.
postgres_conn_id = "AIRFLOW_VAR_POSTGRES_CONN_ID"


In [1]:
import os
from pathlib import Path
from datetime import datetime
from pyspark.sql import SparkSession

from transformer.utils.logger import get_logger
from transformer.utils.spark_helpers import get_spark_session, load_to_postgres
from transformer.utils.file_io import find_partition

# Notebook-wide logger; every later cell logs through this `log` object.
log = get_logger("serving_load_gold_to_postgres")


# Shared SparkSession: passed to run_gold_load_job() and stopped in the last cell.
# NOTE(review): "LoadGoldToPostgres" is presumably the Spark application name — confirm in get_spark_session.
spark = get_spark_session("LoadGoldToPostgres")
log.info("[Serving][Load] SparkSession iniciada.")


2025-11-09 20:35:01 [INFO] spark_helpers | [INFO] Logger inicializado no modo standalone (INFO).
2025-11-09 20:35:01 [INFO] file_io | [INFO] Logger inicializado no modo standalone (INFO).
2025-11-09 20:35:01 [INFO] serving_load_gold_to_postgres | [INFO] Logger inicializado no modo standalone (INFO).
/usr/local/lib/python3.12/site-packages/pyspark/bin/load-spark-env.sh: line 68: ps: command not found


:: loading settings :: url = jar:file:/usr/local/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.postgresql#postgresql added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-0bb5a320-11d8-4468-86a6-635f7c244735;1.0
	confs: [default]
	found org.postgresql#postgresql;42.7.3 in central
	found org.checkerframework#checker-qual;3.42.0 in central
:: resolution report :: resolve 105ms :: artifacts dl 3ms
	:: modules in use:
	org.checkerframework#checker-qual;3.42.0 from central in [default]
	org.postgresql#postgresql;42.7.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	---------------------------------------------------------

# Define função principal do job

In [3]:
def run_gold_load_job(
        spark: SparkSession,
        gold_path: str,
        postgres_conn_id: str,
        run_mode: str = "latest",
        run_date: str | None = None
) -> None:
    """
    Load the Gold-layer parquet tables into the PostgreSQL 'gold' schema.

    The partition is resolved through ``find_partition``. All four parquet
    datasets are opened first, and only then is each one written to
    ``gold.<table>`` through ``load_to_postgres`` with ``mode="overwrite"``.

    Args:
        spark (SparkSession): Active Spark session used to read the parquet data.
        gold_path (str): Base path of the Gold layer.
        postgres_conn_id (str): PostgreSQL connection id, forwarded to
            ``load_to_postgres`` as ``db_conn_id``.
        run_mode (str): 'latest' or 'date'; forwarded to ``find_partition`` as ``mode``.
        run_date (str | None): Specific date, when applicable; forwarded to
            ``find_partition`` as ``date_str``.

    Raises:
        FileNotFoundError: If ``<gold_path>/<partition>/PARQUET`` does not exist.
    """
    # Resolve the partition (latest or a specific date) and its PARQUET folder.
    partition_name = find_partition(gold_path, mode=run_mode, date_str=run_date)
    partition_dir = Path(gold_path).joinpath(partition_name, "PARQUET")

    if not partition_dir.exists():
        raise FileNotFoundError(f"[Serving][Load][ERROR] Diretório de partição não encontrado: {partition_dir}")

    log.info(f"[Serving][Load] Partição selecionada: {partition_dir}.")

    # Target table -> parquet dataset inside the partition folder.
    # Insertion order is also the load order.
    parquet_by_table = {
        "dim_airline": "dim_airline.parquet",
        "dim_airport": "dim_airport.parquet",
        "dim_date": "dim_date.parquet",
        "fato_flights": "fato_flights.parquet",
    }

    # Open every dataset before the first write, so all reads are attempted
    # ahead of any load.
    frames = {
        table_name: spark.read.parquet(str(partition_dir / parquet_name))
        for table_name, parquet_name in parquet_by_table.items()
    }

    # Write each DataFrame to its table in the 'gold' schema.
    for table_name, frame in frames.items():
        full_table_name = f"gold.{table_name}"
        log.info(f"[Serving][Load] Carregando tabela: {full_table_name}.")
        load_to_postgres(
            frame,
            db_conn_id=postgres_conn_id,
            table_name=full_table_name,
            mode="overwrite",
        )
        log.info(f"[Serving][Load] Tabela '{full_table_name}' carregada com sucesso.")

    log.info("[Serving][Load] Carga de todas as tabelas concluída com sucesso.")


In [4]:
# Run the load job with the notebook parameters. Any failure is logged with its
# traceback and re-raised so the cell (and whatever executes the notebook) fails.
try:
    log.info("[Serving][Load] Iniciando job de carga da gold.")
    run_gold_load_job(spark, gold_path, postgres_conn_id, run_mode, run_date)
except Exception as e:
    # log.exception records the traceback along with the message.
    # NOTE(review): this tag is spelled "[LOAD]" while the other messages use "[Load]" —
    # confirm whether any log filter depends on the casing before normalising it.
    log.exception(f"[Serving][LOAD][ERROR] Falha durante carga no PostgreSQL: {e}")
    raise
finally:
    # Emitted on both success and failure.
    log.info("[Serving][Load] Job de carga da gold encerrado.")
    

2025-11-09 20:35:09 [INFO] serving_load_gold_to_postgres | [Serving][Load] Iniciando carga para o PostgreSQL.
2025-11-09 20:35:09 [INFO] serving_load_gold_to_postgres | [Serving][Load] Partição selecionada: /opt/airflow/data-layer/gold/2025-11-09/PARQUET.
2025-11-09 20:35:11 [INFO] serving_load_gold_to_postgres | [Serving][Load] Carregando tabela: gold.dim_airline.
2025-11-09 20:35:11 [WARN] spark_helpers | [WARN] Airflow indisponível, usando variáveis de ambiente para conexão PostgreSQL.
2025-11-09 20:35:12 [INFO] spark_helpers | [INFO] Limpando tabela gold.dim_airline com TRUNCATE CASCADE...
ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_de

KeyboardInterrupt: 

                                                                                

In [7]:
%%script false --no-raise-error # Comentar essa linha se estiver em debug ou se quiser rodar a célula.
# Manual verification of the loaded tables.
# Reads each listed table back over JDBC and prints one row from it.
import os

# Connection settings come from the DB_* environment variables, each with a literal fallback.
jdbc_url = f"jdbc:postgresql://{os.getenv('DB_HOST', 'localhost')}:{os.getenv('DB_PORT', '5432')}/{os.getenv('DB_NAME', 'postgres')}"
connection_properties = {
    "user": os.getenv("DB_USER", "postgres"),
    # NOTE(review): hard-coded fallback password — confirm this is only ever used for local debugging.
    "password": os.getenv("DB_PASSWORD", "postgres"),
    "driver": "org.postgresql.Driver",
}

# NOTE(review): fato_flights is loaded by run_gold_load_job but is not checked here —
# confirm the omission is intentional.
tables_to_check = ["dim_airline", "dim_airport", "dim_date"]
for tbl in tables_to_check:
    print(f"\n+-------------------------------------------------------------- gold.{tbl} --------------------------------------------------------------+")
    df_check = spark.read.jdbc(url=jdbc_url, table=f"gold.{tbl}", properties=connection_properties)
    # Display a single row per table.
    df_check.limit(1).show(truncate=True)
    

In [None]:
# Stop the Spark session created in the setup cell, then log the shutdown.
spark.stop()
log.info("[Serving] Sessão Spark finalizada.")



Exception: java.lang.OutOfMemoryError thrown from the UncaughtExceptionHandler in thread "SparkUI-47"
