In [0]:
# MAGIC %md
# MAGIC # NYC Taxi Pipeline - Databricks Workflows Orchestration
# MAGIC ### Stack Tecnologias - Desafio Técnico
# MAGIC 
# MAGIC **Objetivo**: Criar pipeline orquestrado usando Databricks Workflows
# MAGIC 
# MAGIC **Componentes:**
# MAGIC 1. Job de ingestão Bronze
# MAGIC 2. Job de transformação Silver
# MAGIC 3. Job de agregação Gold
# MAGIC 4. Monitoramento e alertas
# MAGIC 5. Configuração de schedules

In [0]:
# COMMAND ----------
import requests
import json
from datetime import datetime

# Job-level workflow settings, consumed later when the job definition is built.
workflow_config = dict(
    name="NYC_Taxi_Pipeline_Production",
    description="Pipeline completo Bronze → Silver → Gold para dados NYC Taxi",
    max_concurrent_runs=1,
    timeout_seconds=2 * 60 * 60,  # two hours, expressed in seconds
    email_notifications=dict(
        on_failure=["lucaslovatotech@gmail.com"],
        on_success=["lucaslovatotech@gmail.com"],
    ),
)

print("🔧 Configurações do Workflow definidas")

🔧 Configurações do Workflow definidas


In [0]:
# COMMAND ----------
# Define the pipeline tasks (Bronze → Silver → Gold).
#
# "{{ ds }}" is an Airflow macro; Databricks does not know it, so the notebooks
# would receive the literal text "{{ ds }}" as their date. The run date is
# passed with the Databricks dynamic value reference for the job start date.
RUN_DATE_REFERENCE = "{{job.start_time.iso_date}}"

# NOTE(review): these tasks pass the date as "date" while the job definition
# built later in this notebook uses "execution_date" — confirm which widget
# name the notebooks actually read.
pipeline_tasks = {
    "task_1_bronze_ingestion": {
        "task_key": "bronze_ingestion",
        "description": "Ingestão de dados raw para camada Bronze",
        "notebook_task": {
            "notebook_path": "/Workspace/Users/lucaslovatorocha1@gmail.com/stack-tecnico/notebooks/01_bronze_ingestion",
            "base_parameters": {
                "environment": "production",
                "date": RUN_DATE_REFERENCE
            }
        },
        "new_cluster": {
            "spark_version": "13.3.x-scala2.12",
            "node_type_id": "i3.xlarge",
            "num_workers": 2,
            "spark_conf": {
                "spark.databricks.delta.preview.enabled": "true",
                "spark.sql.adaptive.enabled": "true",
                "spark.sql.adaptive.coalescePartitions.enabled": "true"
            }
        },
        "timeout_seconds": 3600
    },

    "task_2_silver_transformation": {
        "task_key": "silver_transformation",
        "description": "Transformação Bronze → Silver com limpeza",
        # Runs only after the Bronze ingestion task.
        "depends_on": [{"task_key": "bronze_ingestion"}],
        "notebook_task": {
            "notebook_path": "/Workspace/Users/lucaslovatorocha1@gmail.com/stack-tecnico/notebooks/02_bronze_to_silver_etl",
            "base_parameters": {
                "environment": "production",
                "date": RUN_DATE_REFERENCE
            }
        },
        "new_cluster": {
            "spark_version": "13.3.x-scala2.12",
            "node_type_id": "i3.xlarge",
            "num_workers": 4,
            "spark_conf": {
                "spark.databricks.delta.preview.enabled": "true",
                "spark.sql.adaptive.enabled": "true"
            }
        },
        "timeout_seconds": 3600
    },

    "task_3_gold_aggregation": {
        "task_key": "gold_aggregation",
        "description": "Agregações Silver → Gold para análises",
        # Runs only after the Silver transformation task.
        "depends_on": [{"task_key": "silver_transformation"}],
        "notebook_task": {
            "notebook_path": "/Workspace/Users/lucaslovatorocha1@gmail.com/stack-tecnico/notebooks/03_silver_to_gold_aggregation",
            "base_parameters": {
                "environment": "production",
                "date": RUN_DATE_REFERENCE
            }
        },
        "new_cluster": {
            "spark_version": "13.3.x-scala2.12",
            "node_type_id": "i3.xlarge",
            "num_workers": 3,
            "spark_conf": {
                "spark.databricks.delta.preview.enabled": "true"
            }
        },
        "timeout_seconds": 2400
    }
}

# List every task key with its description.
print("📋 Tasks do pipeline definidas:")
for task_name, task_config in pipeline_tasks.items():
    print(f"  ✅ {task_config['task_key']}: {task_config['description']}")

📋 Tasks do pipeline definidas:
  ✅ bronze_ingestion: Ingestão de dados raw para camada Bronze
  ✅ silver_transformation: Transformação Bronze → Silver com limpeza
  ✅ gold_aggregation: Agregações Silver → Gold para análises


In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC ## Configuração de Monitoramento e Alertas
# MAGIC 
# MAGIC Implementando monitoramento robusto para o pipeline:

In [0]:
# COMMAND ----------
# Alerting and monitoring configuration.
#
# Each health check holds a SQL query, a numeric threshold and a textual alert
# condition that refers to a column produced by the query.
monitoring_config = {
    "health_checks": {
        "bronze_data_freshness": {
            "description": "Verificar se dados Bronze são atualizados nas últimas 24h",
            "query": """
            SELECT 
                MAX(processed_timestamp) as last_update,
                DATEDIFF(HOUR, MAX(processed_timestamp), CURRENT_TIMESTAMP()) as hours_since_update
            FROM nyc_taxi_catalog.bronze.raw_data
            """,
            "threshold": 24,
            "alert_condition": "hours_since_update > threshold"
        },

        "silver_quality_check": {
            "description": "Verificar taxa de retenção Silver > 95%",
            # Both sides of the ratio are restricted to rows processed today.
            # Comparing today's Silver rows against the whole Bronze history
            # would make the rate shrink as Bronze accumulates days.
            # NULLIF yields a NULL rate instead of a division by zero when no
            # Bronze rows were processed today.
            "query": """
            SELECT 
                COUNT(*) as total_silver,
                (SELECT COUNT(*) FROM nyc_taxi_catalog.bronze.raw_data
                 WHERE DATE(processed_timestamp) = CURRENT_DATE()) as total_bronze,
                (COUNT(*) * 100.0 / NULLIF(
                    (SELECT COUNT(*) FROM nyc_taxi_catalog.bronze.raw_data
                     WHERE DATE(processed_timestamp) = CURRENT_DATE()), 0)) as retention_rate
            FROM nyc_taxi_catalog.silver.nyc_taxi_trips
            WHERE DATE(processed_timestamp) = CURRENT_DATE()
            """,
            "threshold": 95.0,
            "alert_condition": "retention_rate < threshold"
        },

        "gold_completeness": {
            "description": "Verificar se agregações Gold foram atualizadas",
            "query": """
            SELECT COUNT(*) as daily_records
            FROM nyc_taxi_catalog.gold.daily_revenue_metrics
            WHERE report_date = CURRENT_DATE()
            """,
            "threshold": 1,
            "alert_condition": "daily_records < threshold"
        }
    },

    # Descriptive catalogue of the performance metrics to follow (labels only).
    "performance_metrics": {
        "processing_time": "Tempo total de execução do pipeline",
        "data_volume": "Volume de dados processados por camada",
        "error_rate": "Taxa de erro por job",
        "resource_utilization": "Utilização de clusters"
    }
}

# List the configured health checks.
print("📊 Configurações de monitoramento definidas:")
for check_name, check_config in monitoring_config["health_checks"].items():
    print(f"  🔍 {check_name}: {check_config['description']}")

📊 Configurações de monitoramento definidas:
  🔍 bronze_data_freshness: Verificar se dados Bronze são atualizados nas últimas 24h
  🔍 silver_quality_check: Verificar taxa de retenção Silver > 95%
  🔍 gold_completeness: Verificar se agregações Gold foram atualizadas


In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC ## Criação do Workflow JSON Completo
# MAGIC 
# MAGIC Gerando configuração completa para Databricks Workflows:

# COMMAND ----------
# Build the complete workflow (job) definition
def create_workflow_json(config=None):
    """
    Build the full Databricks Workflows job definition as a plain dict.

    Args:
        config: Optional job-level settings. Must provide "name",
            "description", "max_concurrent_runs", "timeout_seconds" and
            "email_notifications" (with "on_success" and "on_failure"
            recipient lists, plus an optional "on_start" list). When omitted,
            the notebook-level ``workflow_config`` is used.

    Returns:
        dict: job settings, e-mail/webhook notifications, a daily schedule,
        four chained notebook tasks (bronze_ingestion → silver_transformation
        → gold_aggregation → data_quality_validation), the Git source and the
        run-as identity.

    Raises:
        KeyError: if a required key is missing from the settings.
    """
    if config is None:
        # Fall back to the settings defined in an earlier notebook cell.
        config = workflow_config
    recipients = config["email_notifications"]

    notebooks_dir = "/Workspace/Users/lucaslovatorocha1@gmail.com/stack-tecnico/notebooks"
    spark_version = "13.3.x-scala2.12"

    def notebook_task(notebook_name, **extra_parameters):
        """Return the notebook_task section for one notebook of the pipeline."""
        return {
            "notebook_path": f"{notebooks_dir}/{notebook_name}",
            # The job also declares git_source. Without an explicit source the
            # Jobs API resolves notebook tasks from Git, where an absolute
            # workspace path is not valid.
            "source": "WORKSPACE",
            "base_parameters": {
                "environment": "production",
                # Databricks dynamic value references. The former "{{ ds }}"
                # (an Airflow macro) and "{{ run_id }}" would have been passed
                # to the notebooks as literal text.
                "execution_date": "{{job.start_time.iso_date}}",
                "workflow_run_id": "{{job.run_id}}",
                **extra_parameters,
            },
        }

    def after(task_key):
        """Return a depends_on list holding a single upstream task."""
        return [{"task_key": task_key}]

    bronze_task = {
        "task_key": "bronze_ingestion",
        "description": "Ingestão de dados raw para camada Bronze",
        "notebook_task": notebook_task("01_bronze_ingestion"),
        "new_cluster": {
            "spark_version": spark_version,
            "node_type_id": "i3.xlarge",
            "num_workers": 2,
            "spark_conf": {
                "spark.databricks.delta.preview.enabled": "true",
                "spark.sql.adaptive.enabled": "true",
                "spark.sql.adaptive.coalescePartitions.enabled": "true",
                "spark.sql.adaptive.skewJoin.enabled": "true"
            },
            "aws_attributes": {
                "zone_id": "us-west-2a",
                "instance_profile_arn": None,
                "first_on_demand": 1,
                "availability": "SPOT_WITH_FALLBACK"
            },
            "enable_elastic_disk": True,
            # NOTE(review): disk_spec looks like an instance-pool setting, not
            # a job-cluster field — confirm the Jobs API accepts it here.
            "disk_spec": {
                "disk_type": {
                    "ebs_volume_type": "GENERAL_PURPOSE_SSD"
                },
                "disk_size": 100
            }
        },
        "timeout_seconds": 3600,
        "max_retries": 2,
        "min_retry_interval_millis": 60000,
        "retry_on_timeout": True
    }

    silver_task = {
        "task_key": "silver_transformation",
        "description": "Transformação Bronze → Silver com limpeza e validação",
        "depends_on": after("bronze_ingestion"),
        "notebook_task": notebook_task(
            "02_bronze_to_silver_etl",
            source_table="nyc_taxi_catalog.bronze.raw_data",
        ),
        "new_cluster": {
            "spark_version": spark_version,
            "node_type_id": "i3.xlarge",
            "num_workers": 4,
            "spark_conf": {
                "spark.databricks.delta.preview.enabled": "true",
                "spark.sql.adaptive.enabled": "true",
                "spark.sql.adaptive.coalescePartitions.enabled": "true",
                "spark.databricks.delta.autoCompact.enabled": "true",
                "spark.databricks.delta.optimizeWrite.enabled": "true"
            },
            "aws_attributes": {
                "zone_id": "us-west-2a",
                "availability": "SPOT_WITH_FALLBACK"
            },
            "enable_elastic_disk": True
        },
        "timeout_seconds": 3600,
        "max_retries": 2,
        "min_retry_interval_millis": 120000
    }

    gold_task = {
        "task_key": "gold_aggregation",
        "description": "Agregações Silver → Gold para dashboards analíticos",
        "depends_on": after("silver_transformation"),
        "notebook_task": notebook_task(
            "03_silver_to_gold_aggregation",
            source_table="nyc_taxi_catalog.silver.nyc_taxi_trips",
        ),
        "new_cluster": {
            "spark_version": spark_version,
            "node_type_id": "i3.xlarge",
            "num_workers": 3,
            "spark_conf": {
                "spark.databricks.delta.preview.enabled": "true",
                "spark.sql.adaptive.enabled": "true",
                "spark.databricks.delta.autoCompact.enabled": "true"
            },
            "aws_attributes": {
                "zone_id": "us-west-2a",
                "availability": "SPOT_WITH_FALLBACK"
            }
        },
        "timeout_seconds": 2400,
        "max_retries": 1
    }

    quality_task = {
        "task_key": "data_quality_validation",
        "description": "Validação de qualidade e geração de relatórios",
        "depends_on": after("gold_aggregation"),
        "notebook_task": notebook_task("06_data_quality_validation"),
        "new_cluster": {
            "spark_version": spark_version,
            "node_type_id": "i3.large",
            "num_workers": 1,
            "spark_conf": {
                "spark.sql.adaptive.enabled": "true"
            }
        },
        "timeout_seconds": 1200,
        "max_retries": 1
    }

    workflow_json = {
        "name": config["name"],
        "description": config["description"],
        "max_concurrent_runs": config["max_concurrent_runs"],
        "timeout_seconds": config["timeout_seconds"],

        "email_notifications": {
            # Start notifications go to the success recipients unless the
            # settings provide a dedicated "on_start" list.
            "on_start": recipients.get("on_start", recipients["on_success"]),
            "on_success": recipients["on_success"],
            "on_failure": recipients["on_failure"],
            "no_alert_for_skipped_runs": False
        },

        "webhook_notifications": {
            "on_start": [],
            "on_success": [],
            "on_failure": []
        },

        "schedule": {
            "quartz_cron_expression": "0 0 2 * * ?",  # daily at 02:00
            "timezone_id": "America/Sao_Paulo",
            "pause_status": "UNPAUSED"
        },

        "tasks": [bronze_task, silver_task, gold_task, quality_task],

        "git_source": {
            "git_url": "https://github.com/lucaslovatorocha/nyc-taxi-pipeline",
            "git_provider": "github",
            "git_branch": "main"
        },

        # NOTE(review): the Jobs API documents service_principal_name as the
        # service principal's application ID — confirm this value is one.
        "run_as": {
            "service_principal_name": "nyc-taxi-pipeline-sp"
        }
    }

    return workflow_json

# Build the definition and print a short summary of it
workflow_definition = create_workflow_json()
print(
    "🚀 Workflow JSON gerado com sucesso!",
    f"📝 Nome: {workflow_definition['name']}",
    f"📅 Schedule: {workflow_definition['schedule']['quartz_cron_expression']}",
    f"🔧 Tasks: {len(workflow_definition['tasks'])}",
    sep="\n",
)

# COMMAND ----------
# Serialize the definition to a pretty-printed JSON string.
# NOTE(review): this cell does not write a file; the JSON only lives in
# workflow_json_str.
import json

workflow_json_str = json.dumps(workflow_definition, indent=2)

# COMMAND ----------
# MAGIC %md
# MAGIC ## Script de Criação via API (Opcional)
# MAGIC 
# MAGIC Para automatizar a criação do workflow:

# COMMAND ----------
# Create the workflow through the Databricks REST API
def create_workflow_via_api(workflow_config, databricks_host, access_token, timeout=30):
    """
    Create a job by POSTing its definition to the Jobs API 2.1 create endpoint.

    Args:
        workflow_config: Job definition, sent as the JSON request body.
        databricks_host: Workspace base URL; a trailing slash is tolerated.
        access_token: Token sent in the Authorization header as a Bearer token.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The "job_id" from the response on HTTP 200. None when the request
        cannot be sent, the status is not 200, or the response body carries
        no job_id; a message describing the problem is printed in each case.
    """
    import requests

    headers = {
        'Authorization': f'Bearer {access_token}',
        'Content-Type': 'application/json'
    }

    # Strip a trailing slash so the URL never contains "//api".
    url = f"{databricks_host.rstrip('/')}/api/2.1/jobs/create"

    try:
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.post(url, headers=headers, json=workflow_config, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures are reported like HTTP errors: print and
        # return None instead of raising.
        print(f"❌ Erro ao criar workflow: {exc}")
        return None

    if response.status_code != 200:
        print(f"❌ Erro ao criar workflow: {response.status_code}")
        print(f"📄 Resposta: {response.text}")
        return None

    try:
        job_id = response.json()['job_id']
    except (ValueError, KeyError, TypeError):
        # HTTP 200 whose body is not JSON or has no job_id.
        print(f"❌ Erro ao criar workflow: {response.status_code}")
        print(f"📄 Resposta: {response.text}")
        return None

    print(f"✅ Workflow criado com sucesso! Job ID: {job_id}")
    return job_id

# Usage example (uncomment and configure to use).
# Kept as real comments so that "uncomment" applies literally. The token is
# read from the environment so it is never pasted into the notebook.
#
# import os
#
# DATABRICKS_HOST = "https://your-workspace.cloud.databricks.com"
# ACCESS_TOKEN = os.environ["DATABRICKS_TOKEN"]
#
# job_id = create_workflow_via_api(
#     workflow_definition,
#     DATABRICKS_HOST,
#     ACCESS_TOKEN
# )

print("🔧 Script de API disponível para automação")

# COMMAND ----------
# MAGIC %md
# MAGIC ## Monitoramento e Alertas Avançados

# COMMAND ----------
# Monitoring queries for the pipeline
def setup_monitoring_queries():
    """
    Return the pipeline monitoring SQL statements keyed by metric name.

    The returned dict holds three queries, in this order:
    "pipeline_health" (today's row counts per layer mapped to a
    HEALTHY/PARTIAL/UNHEALTHY status), "data_quality_metrics" (today's Silver
    rows grouped by quality_flag with their share of the total) and
    "processing_performance" (today's Silver averages, trip count and last
    processed timestamp).
    """
    pipeline_health_sql = """
        SELECT 
            'pipeline_health' as metric_name,
            CASE 
                WHEN bronze_count > 0 AND silver_count > 0 AND gold_count > 0 THEN 'HEALTHY'
                WHEN bronze_count > 0 AND silver_count > 0 THEN 'PARTIAL'
                ELSE 'UNHEALTHY'
            END as status,
            bronze_count,
            silver_count, 
            gold_count,
            CURRENT_TIMESTAMP() as check_timestamp
        FROM (
            SELECT 
                (SELECT COUNT(*) FROM nyc_taxi_catalog.bronze.raw_data 
                 WHERE DATE(processed_timestamp) = CURRENT_DATE()) as bronze_count,
                (SELECT COUNT(*) FROM nyc_taxi_catalog.silver.nyc_taxi_trips 
                 WHERE DATE(processed_timestamp) = CURRENT_DATE()) as silver_count,
                (SELECT COUNT(*) FROM nyc_taxi_catalog.gold.daily_revenue_metrics 
                 WHERE report_date = CURRENT_DATE()) as gold_count
        )
        """

    data_quality_sql = """
        SELECT 
            'data_quality' as metric_name,
            quality_flag,
            COUNT(*) as record_count,
            COUNT(*) * 100.0 / SUM(COUNT(*)) OVER() as percentage,
            CURRENT_TIMESTAMP() as check_timestamp
        FROM nyc_taxi_catalog.silver.nyc_taxi_trips
        WHERE DATE(processed_timestamp) = CURRENT_DATE()
        GROUP BY quality_flag
        """

    processing_performance_sql = """
        SELECT 
            'processing_performance' as metric_name,
            AVG(trip_duration_minutes) as avg_trip_duration,
            AVG(total_amount) as avg_revenue,
            COUNT(*) as total_trips,
            MAX(processed_timestamp) as last_processed,
            CURRENT_TIMESTAMP() as check_timestamp
        FROM nyc_taxi_catalog.silver.nyc_taxi_trips
        WHERE DATE(processed_timestamp) = CURRENT_DATE()
        """

    return {
        "pipeline_health": pipeline_health_sql,
        "data_quality_metrics": data_quality_sql,
        "processing_performance": processing_performance_sql,
    }

# Materialize the monitoring queries and list their names.
monitoring_queries = setup_monitoring_queries()

print("📊 Queries de monitoramento configuradas:")
for query_name in monitoring_queries:
    print(f"  🔍 {query_name}")

# COMMAND ----------
# Smoke-test the monitoring queries against the current Spark session.
print("🧪 Testando queries de monitoramento...")

# Names of the queries that raised, so the closing message can be honest.
failed_queries = []

for query_name, query_sql in monitoring_queries.items():
    try:
        print(f"\n📋 Executando: {query_name}")
        result = spark.sql(query_sql)
        result.show(5)
        print(f"✅ {query_name}: OK")
    except Exception as e:
        # Best effort: one broken query must not stop the remaining checks.
        failed_queries.append(query_name)
        # Only the first line of the error is printed; Spark appends the whole
        # unresolved query plan after it, which floods the cell output.
        error_summary = str(e).split("\n", 1)[0]
        print(f"❌ {query_name}: ERRO - {error_summary}")

# The success message is printed only when every query actually ran.
if failed_queries:
    print(
        f"\n⚠️ {len(failed_queries)} de {len(monitoring_queries)} queries de monitoramento falharam: "
        f"{', '.join(failed_queries)}"
    )
else:
    print("\n🎉 Configuração de Workflows e Monitoramento Completa!")

🚀 Workflow JSON gerado com sucesso!
📝 Nome: NYC_Taxi_Pipeline_Production
📅 Schedule: 0 0 2 * * ?
🔧 Tasks: 4
🔧 Script de API disponível para automação
📊 Queries de monitoramento configuradas:
  🔍 pipeline_health
  🔍 data_quality_metrics
  🔍 processing_performance
🧪 Testando queries de monitoramento...

📋 Executando: pipeline_health
❌ pipeline_health: ERRO - [TABLE_OR_VIEW_NOT_FOUND] The table or view `nyc_taxi_catalog`.`bronze`.`raw_data` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS. SQLSTATE: 42P01; line 15 pos 38;
'Project [pipeline_health AS metric_name#9950, CASE WHEN ((('bronze_count > 0) AND ('silver_count > 0)) AND ('gold_count > 0)) THEN HEALTHY WHEN (('bronze_count > 0) AND ('silver_count > 0)) THEN PARTIAL ELSE UNHEALTHY END AS 