# Orquestrador de Cargas


In [0]:
# Databricks notebook source
import json
import uuid
import traceback
from datetime import datetime, date
from pyspark.sql import Row, SparkSession
from pyspark.dbutils import DBUtils
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, LongType

# Initialise the Spark session and the DBUtils handle used throughout the notebook
spark = SparkSession.builder.getOrCreate()
dbutils = DBUtils(spark)

# Path of the JSON configuration file that describes the pipeline and its jobs
json_path = "/Volumes/vitivinicultura/default/landing_zone/orquestrador_teste.json"

# Fully qualified name of the execution-log table in Unity Catalog
tabela_logs = "vitivinicultura.logs.pipeline_logs"

# --------------------------------------------------------
# Fixed schema of the execution-log rows
# --------------------------------------------------------
# Declared explicitly (instead of letting Spark infer it) to avoid
# schema-inference errors. Every column is nullable. The column order matters:
# log rows are built as positional tuples and bound to this schema by position.
schema_log = StructType([
    StructField(nome_coluna, tipo_coluna(), True)
    for nome_coluna, tipo_coluna in (
        ("log_id", StringType),
        ("execution_id", StringType),
        ("pipeline_name", StringType),
        ("job_name", StringType),
        ("status", StringType),
        ("start_time", TimestampType),
        ("end_time", TimestampType),
        ("duration_sec", LongType),
        ("data_execucao", StringType),
        ("user", StringType),
        ("environment", StringType),
        ("error_message", StringType),
        ("initial_row_count", LongType),
        ("final_row_count", LongType),
        ("rows_loaded", LongType),
    )
])

# --------------------------------------------------------
# Helper: append one execution-log row to the Unity Catalog table
# --------------------------------------------------------
def salvar_log(execution_id, pipeline_name, job_name, status, start_time, end_time, data_execucao, user, environment,
               rows_before=None, rows_after=None, rows_inserted=None, error_message=None):
    """Append a single execution-log row to the Delta table ``tabela_logs``.

    ``start_time`` and ``end_time`` are datetimes; their difference, truncated
    to whole seconds, is stored as the duration. A fresh UUID is generated as
    the row's ``log_id``.

    NOTE(review): the row is a positional tuple bound to ``schema_log`` by
    position, so the four optional parameters are written to the columns
    ``error_message``, ``initial_row_count``, ``final_row_count`` and
    ``rows_loaded`` in that order, regardless of the parameter names:

        rows_before   -> error_message
        rows_after    -> initial_row_count
        rows_inserted -> final_row_count
        error_message -> rows_loaded

    Callers that pass ``(error_message, initial_row_count, final_row_count)``
    positionally therefore fill the intended columns. Passing these
    arguments by keyword would not.
    """
    elapsed_seconds = int((end_time - start_time).total_seconds())

    # Field order below must stay aligned with schema_log.
    record = (
        str(uuid.uuid4()),
        execution_id,
        pipeline_name,
        job_name,
        status,
        start_time,
        end_time,
        elapsed_seconds,
        data_execucao,
        user,
        environment,
        rows_before,
        rows_after,
        rows_inserted,
        error_message,
    )

    log_frame = spark.createDataFrame([record], schema=schema_log)
    writer = log_frame.write.format("delta").mode("append")
    writer.saveAsTable(tabela_logs)

# --------------------------------------------------------
# Load the JSON configuration
# --------------------------------------------------------
# JSON text is UTF-8 by specification; say so explicitly instead of relying on
# the platform's default encoding.
with open(json_path, "r", encoding="utf-8") as f:
    config = json.load(f)

pipeline_name = config["pipeline_name"]
# When true (the default), the first failing job aborts the whole pipeline.
fail_fast = config.get("fail_fast", True)
data_hoje = date.today().strftime("%Y-%m-%d")

# Replace the dynamic {data_atual} placeholder in every string parameter with
# today's date (YYYY-MM-DD). A missing or null "params" entry is normalised to
# an empty dict so that job["params"] can always be indexed afterwards.
for job in config["jobs"]:
    params = job.get("params") or {}
    for k, v in params.items():
        if isinstance(v, str) and "{data_atual}" in v:
            params[k] = v.replace("{data_atual}", data_hoje)
    job["params"] = params

# Names of the jobs that have already been run during this execution.
executados = set()

# --------------------------------------------------------
# Dynamic execution metadata
# --------------------------------------------------------
# Random UUID identifying this pipeline run; it is passed to every salvar_log call.
execution_id = str(uuid.uuid4())
def get_user_safe():
    """Return the user name from the notebook context, or ``"unknown_user"``.

    The name is read through ``dbutils`` from the notebook context. The lookup
    is best-effort: if it fails for any ordinary reason the placeholder string
    is returned instead of raising.
    """
    try:
        return dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get()
    except Exception:
        # Deliberately broad, but not a bare ``except:``: KeyboardInterrupt and
        # SystemExit must still propagate.
        return "unknown_user"

# User recorded in every log row ("unknown_user" when the lookup fails)
user = get_user_safe()


environment = "dev"  # Change to "prod" or "test" to match the cluster

# Start-of-run banner with the metadata that will accompany every log row
print(f"üö¶ Iniciando pipeline: {pipeline_name}")
print(f"üÜî Execution ID: {execution_id}")
print(f"üë§ Usu√°rio: {user}")
print(f"üåé Ambiente: {environment}")

# --------------------------------------------------------
# Run a single job (after its pending dependencies)
# --------------------------------------------------------
def run_job(job_name):
    """Run the job named ``job_name``, running its pending dependencies first.

    The job entry is looked up by name in ``config["jobs"]``. Each name listed
    in its ``depends_on`` that is not yet in ``executados`` is run recursively
    before the job itself. The job's notebook (``job["path"]``) is then run
    with ``job["params"]`` and exactly one log row is written through
    ``salvar_log``, whether the job succeeded or failed.

    When the job declares a ``target_table``, the table's row count is
    captured before the run and again after a successful run.

    The job name is added to ``executados`` even when the job fails, so it is
    not retried within the same execution.

    Raises:
        ValueError: if no entry in ``config["jobs"]`` has the given name.
        Exception: the error raised while running the job, re-raised only
            when ``fail_fast`` is true; otherwise the failure is logged and
            swallowed.
    """
    job = next((j for j in config["jobs"] if j["name"] == job_name), None)
    if job is None:
        # A bare StopIteration from next() would be an obscure way to report
        # a misspelled job or dependency name.
        raise ValueError(f"Job '{job_name}' inexistente em config['jobs']")

    for dep in job.get("depends_on", []):
        if dep not in executados:
            run_job(dep)

    print(f"üöÄ Executando {job['name']} com params {job['params']}")
    start_time = datetime.now()
    status = "OK"
    error_message = None
    initial_row_count = None
    final_row_count = None

    # Optional destination table whose row count is tracked around the run.
    tabela_destino = job.get("target_table", None)

    if tabela_destino:
        try:
            initial_row_count = spark.table(tabela_destino).count()
        except Exception:
            # Presumably the table does not exist yet; treat it as empty.
            initial_row_count = 0

    try:
        # Second argument is the timeout in seconds (0 here).
        dbutils.notebook.run(job["path"], 0, job["params"])

        # Count again only after a successful run.
        if tabela_destino:
            final_row_count = spark.table(tabela_destino).count()

    except Exception as e:
        status = "ERROR"
        error_message = str(e)[:1000]  # keep the logged message bounded
        print(f"‚ùå Falha em {job['name']}: {error_message}")
        if fail_fast:
            # The log row is written once, by the ``finally`` clause below;
            # logging here as well would duplicate the failure record.
            raise
    finally:
        end_time = datetime.now()
        # The trailing three values are passed positionally on purpose:
        # salvar_log writes its optional arguments to the error_message,
        # initial_row_count and final_row_count columns in that order.
        # ``.get`` keeps a missing "data_execucao" parameter from raising
        # here and masking the job's own error.
        salvar_log(execution_id, pipeline_name, job["name"], status, start_time, end_time,
                   job["params"].get("data_execucao"), user, environment, error_message,
                   initial_row_count, final_row_count)
        executados.add(job_name)
        print(f"‚úÖ Finalizado {job['name']} com status {status}")

# --------------------------------------------------------
# Main pipeline execution
# --------------------------------------------------------
# Walk the jobs in configuration order. A job that was already run as a
# dependency of an earlier job is in ``executados`` and is skipped here.
for job in config["jobs"]:
    if job["name"] not in executados:
        run_job(job["name"])

# NOTE(review): this message is printed whenever the loop completes, including
# runs where fail_fast is false and one or more jobs were logged with status
# ERROR (run_job swallows the exception in that mode).
print(f"üèÅ Pipeline {pipeline_name} finalizado com sucesso.")


In [0]:
%sql
-- Show every row of the execution-log table.
SELECT * FROM vitivinicultura.logs.pipeline_logs
