# AWS Glue e Athena

Este notebook cobre:
- AWS Glue Catalog
- AWS Glue Crawler
- AWS Glue ETL Jobs
- Athena com Python
- Integração Spark + Glue Catalog
- Queries SQL úteis no Athena

## 1. AWS Glue Catalog com Boto3

In [None]:
import boto3

# Glue client bound to us-east-1; reused by the following cells
glue_client = boto3.client('glue', region_name='us-east-1')

# List every database in the catalog. GetDatabases is a paginated API, so a
# single call can return only the first page; the paginator follows the
# continuation token until all pages have been read.
for page in glue_client.get_paginator('get_databases').paginate():
    for db in page['DatabaseList']:
        print(f"Database: {db['Name']}")

In [None]:
# Create the study database in the Glue Data Catalog
database_input = {
    'Name': 'meu_database',
    'Description': 'Database para estudos',
}
glue_client.create_database(DatabaseInput=database_input)

In [None]:
# Register the external Parquet table "vendas" in the Glue Data Catalog.

# Data columns of the table
vendas_columns = [
    {'Name': 'id', 'Type': 'bigint'},
    {'Name': 'produto', 'Type': 'string'},
    {'Name': 'valor', 'Type': 'double'},
    {'Name': 'quantidade', 'Type': 'int'},
]

# Partition columns, declared separately from the data columns
vendas_partition_keys = [
    {'Name': 'ano', 'Type': 'int'},
    {'Name': 'mes', 'Type': 'int'},
]

# Where the data lives and which Hive classes read/write it
vendas_storage = {
    'Columns': vendas_columns,
    'Location': 's3://my-bucket/data/vendas/',
    'InputFormat': 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat',
    'OutputFormat': 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat',
    'SerdeInfo': {
        'SerializationLibrary': 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe',
    },
    'Compressed': True,
}

vendas_table_input = {
    'Name': 'vendas',
    'Description': 'Tabela de vendas',
    'StorageDescriptor': vendas_storage,
    'PartitionKeys': vendas_partition_keys,
    'TableType': 'EXTERNAL_TABLE',
    'Parameters': {
        'classification': 'parquet',
        'parquet.compression': 'SNAPPY',
    },
}

glue_client.create_table(
    DatabaseName='meu_database',
    TableInput=vendas_table_input,
)

In [None]:
# List every table of the database together with its storage location.
# GetTables is paginated, so iterate over all pages instead of relying on
# the first response only.
tables_paginator = glue_client.get_paginator('get_tables')
for page in tables_paginator.paginate(DatabaseName='meu_database'):
    for table in page['TableList']:
        print(f"Tabela: {table['Name']}")
        print(f"  Location: {table['StorageDescriptor']['Location']}")

In [None]:
# Register one partition of "vendas" explicitly in the Glue Data Catalog.

# The partition repeats the table's column list and Parquet format settings
partition_columns = [
    {'Name': 'id', 'Type': 'bigint'},
    {'Name': 'produto', 'Type': 'string'},
    {'Name': 'valor', 'Type': 'double'},
    {'Name': 'quantidade', 'Type': 'int'},
]

partition_storage = {
    'Columns': partition_columns,
    'Location': 's3://my-bucket/data/vendas/ano=2024/mes=01/',
    'InputFormat': 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat',
    'OutputFormat': 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat',
    'SerdeInfo': {
        'SerializationLibrary': 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe',
    },
}

# Values presumably follow the order of the table's partition keys (ano, mes)
partition_input = {
    'Values': ['2024', '01'],
    'StorageDescriptor': partition_storage,
}

glue_client.create_partition(
    DatabaseName='meu_database',
    TableName='vendas',
    PartitionInput=partition_input,
)

In [None]:
# MSCK REPAIR - Descobrir partições automaticamente
# Isso é feito via Athena ou Spark SQL, não diretamente pela API do Glue
# Athena / Spark SQL: MSCK REPAIR TABLE vendas
# Alternativa no Spark SQL: ALTER TABLE vendas RECOVER PARTITIONS

## 2. AWS Glue Crawler

In [None]:
# Define a crawler that scans the "vendas" prefix and writes into meu_database.

# S3 prefix to crawl, ignoring temporary files and _SUCCESS markers
crawler_targets = {
    'S3Targets': [
        {
            'Path': 's3://my-bucket/data/vendas/',
            'Exclusions': ['*.tmp', '_SUCCESS'],
        },
    ],
}

# Update changed schemas in the catalog; only log objects that disappeared
crawler_schema_policy = {
    'UpdateBehavior': 'UPDATE_IN_DATABASE',
    'DeleteBehavior': 'LOG',
}

# Crawler configuration is passed as a JSON document in a string
crawler_configuration = '''{
        "Version": 1.0,
        "CrawlerOutput": {
            "Partitions": { "AddOrUpdateBehavior": "InheritFromTable" }
        }
    }'''

glue_client.create_crawler(
    Name='vendas-crawler',
    Role='arn:aws:iam::123456789:role/GlueServiceRole',
    DatabaseName='meu_database',
    Targets=crawler_targets,
    SchemaChangePolicy=crawler_schema_policy,
    Configuration=crawler_configuration,
)

In [None]:
crawler_name = 'vendas-crawler'

# Start the crawler
glue_client.start_crawler(Name=crawler_name)

# Read the crawler back and report its current state
response = glue_client.get_crawler(Name=crawler_name)
crawler_state = response['Crawler']['State']
print(f"Status: {crawler_state}")

## 3. AWS Glue ETL Job (Script)

In [None]:
# Example Glue ETL script, kept as text so it can be printed here.
# Save it as a .py file and upload it to S3.
#
# The literal is a raw string on purpose: in a regular triple-quoted string
# every backslash-newline is removed, which would collapse the
# "df \ .filter(...) \ ..." chain into one long line in the emitted script.
#
# NOTE(review): the script derives ano/mes from a "data_venda" column, but the
# "vendas" table created earlier in this notebook declares no such column —
# confirm the real source schema before running the job.

glue_etl_script = r'''
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame

# Argumentos
args = getResolvedOptions(sys.argv, ['JOB_NAME', 'input_path', 'output_path'])

# Contexto
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# ============================================================================
# LEITURA - Glue Catalog
# ============================================================================
datasource = glueContext.create_dynamic_frame.from_catalog(
    database="meu_database",
    table_name="vendas",
    transformation_ctx="datasource"
)

# Ou de S3 diretamente
# datasource = glueContext.create_dynamic_frame.from_options(
#     connection_type="s3",
#     connection_options={"paths": [args['input_path']]},
#     format="parquet"
# )

# ============================================================================
# TRANSFORMAÇÕES
# ============================================================================

# Converter para DataFrame Spark
df = datasource.toDF()

# Transformações PySpark normais
from pyspark.sql.functions import col, year, month

df_transformed = df \
    .filter(col("valor") > 0) \
    .withColumn("valor_total", col("valor") * col("quantidade")) \
    .withColumn("ano", year("data_venda")) \
    .withColumn("mes", month("data_venda"))

# Voltar para DynamicFrame
dynamic_frame = DynamicFrame.fromDF(df_transformed, glueContext, "dynamic_frame")

# ============================================================================
# TRANSFORMAÇÕES GLUE ESPECÍFICAS
# ============================================================================

# Apply Mapping - renomear e mudar tipos
mapped = ApplyMapping.apply(
    frame=dynamic_frame,
    mappings=[
        ("id", "long", "id", "long"),
        ("produto", "string", "nome_produto", "string"),
        ("valor_total", "double", "valor_total", "double"),
        ("ano", "int", "ano", "int"),
        ("mes", "int", "mes", "int")
    ]
)

# Drop Null Fields
cleaned = DropNullFields.apply(frame=mapped)

# ============================================================================
# ESCRITA
# ============================================================================

# Escrever no Glue Catalog
glueContext.write_dynamic_frame.from_catalog(
    frame=cleaned,
    database="meu_database",
    table_name="vendas_processed",
    transformation_ctx="write"
)

# Ou escrever em S3
glueContext.write_dynamic_frame.from_options(
    frame=cleaned,
    connection_type="s3",
    connection_options={
        "path": args['output_path'],
        "partitionKeys": ["ano", "mes"]
    },
    format="parquet",
    format_options={"compression": "snappy"}
)

job.commit()
'''

print(glue_etl_script)

In [None]:
# Register the Glue ETL job that runs the script uploaded to S3.

# Spark ETL job ("glueetl") running the Python 3 script at ScriptLocation
job_command = {
    'Name': 'glueetl',
    'ScriptLocation': 's3://my-bucket/scripts/vendas_etl.py',
    'PythonVersion': '3',
}

# Defaults applied to every run of this job
job_default_arguments = {
    '--job-language': 'python',
    '--job-bookmark-option': 'job-bookmark-enable',
    '--enable-metrics': '',
    '--enable-continuous-cloudwatch-log': 'true',
    '--input_path': 's3://my-bucket/raw/',
    '--output_path': 's3://my-bucket/processed/',
}

glue_client.create_job(
    Name='vendas-etl-job',
    Role='arn:aws:iam::123456789:role/GlueServiceRole',
    Command=job_command,
    DefaultArguments=job_default_arguments,
    MaxRetries=1,
    GlueVersion='4.0',
    NumberOfWorkers=2,
    WorkerType='G.1X',
)

In [None]:
# Start a run of the job, passing 2024-specific input/output paths
run_arguments = {
    '--input_path': 's3://my-bucket/raw/2024/',
    '--output_path': 's3://my-bucket/processed/2024/',
}
response = glue_client.start_job_run(
    JobName='vendas-etl-job',
    Arguments=run_arguments,
)
job_run_id = response['JobRunId']
print(f"Job Run ID: {job_run_id}")

## 4. AWS Athena com Python

In [None]:
import boto3
import time

# Athena client bound to us-east-1; used by run_athena_query below
athena_client = boto3.client(service_name='athena', region_name='us-east-1')

def run_athena_query(query, database, output_location,
                     poll_interval=1.0, timeout=None):
    """Run a query on Athena, wait for it to finish and return its results.

    Args:
        query: SQL text to execute.
        database: Database used as the query execution context.
        output_location: S3 URI where Athena writes the result files.
        poll_interval: Seconds to sleep between two status checks.
        timeout: Maximum number of seconds to wait for the query to reach a
            terminal state. ``None`` (the default) waits indefinitely.

    Returns:
        The ``get_query_results`` response dict of the first result page,
        with ``['ResultSet']['Rows']`` extended by the rows of every
        following page, so large results are not cut at the first page.

    Raises:
        RuntimeError: If the query ends in the FAILED or CANCELLED state.
        TimeoutError: If ``timeout`` elapses first. The query itself is not
            cancelled in that case.
    """
    # Submit the query
    response = athena_client.start_query_execution(
        QueryString=query,
        QueryExecutionContext={'Database': database},
        ResultConfiguration={'OutputLocation': output_location}
    )

    query_execution_id = response['QueryExecutionId']
    print(f"Query ID: {query_execution_id}")

    # Poll until the query reaches a terminal state (or the deadline passes)
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        execution = athena_client.get_query_execution(
            QueryExecutionId=query_execution_id
        )
        status = execution['QueryExecution']['Status']
        state = status['State']

        if state == 'SUCCEEDED':
            print("Query concluída!")
            break
        if state in ('FAILED', 'CANCELLED'):
            reason = status.get('StateChangeReason', 'Unknown')
            raise RuntimeError(f"Query {state}: {reason}")
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Query {query_execution_id} still {state} after {timeout}s"
            )

        time.sleep(poll_interval)

    # GetQueryResults is paginated: merge the rows of all pages into the
    # first page's response so callers keep the same dict shape.
    results = None
    paginator = athena_client.get_paginator('get_query_results')
    for page in paginator.paginate(QueryExecutionId=query_execution_id):
        if results is None:
            results = page
        else:
            results['ResultSet']['Rows'].extend(page['ResultSet']['Rows'])

    if results is not None:
        # Every page has been consumed, so a leftover token would mislead
        results.pop('NextToken', None)
    return results

In [None]:
# Top 10 products of 2024 by total sales value
query = """
    SELECT 
        produto,
        SUM(valor * quantidade) as total_vendas,
        COUNT(*) as qtd_vendas
    FROM vendas
    WHERE ano = 2024
    GROUP BY produto
    ORDER BY total_vendas DESC
    LIMIT 10
"""

results = run_athena_query(
    query,
    'meu_database',
    's3://my-bucket/athena-results/',
)

# The first returned row is treated as the header and skipped
data_rows = results['ResultSet']['Rows'][1:]
for row in data_rows:
    # Cells without a 'VarCharValue' key are printed as empty strings
    values = [cell.get('VarCharValue', '') for cell in row['Data']]
    print(values)

In [None]:
# Usando PyAthena (mais pythonic)
# pip install pyathena

from pyathena import connect
import pandas as pd

# Open a PyAthena connection; query results are staged under this S3 prefix
conn = connect(
    region_name='us-east-1',
    s3_staging_dir='s3://my-bucket/athena-results/',
)

# Load a 100-row sample of the 2024 sales straight into a pandas DataFrame
sample_sql = """
    SELECT * FROM meu_database.vendas
    WHERE ano = 2024
    LIMIT 100
"""
df = pd.read_sql(sample_sql, conn)

print(df.head())

## 5. Spark + Glue Catalog

In [None]:
from pyspark.sql import SparkSession

# Spark session with Hive support, pointing the Hive metastore client at the
# Glue Data Catalog.
# NOTE(review): the client factory class below does not appear to come from
# the two packages listed here — confirm it is on the classpath of the
# runtime in use.
spark_packages = ",".join([
    "org.apache.hadoop:hadoop-aws:3.3.4",
    "org.apache.spark:spark-hive_2.12:3.5.0",
])

spark = (
    SparkSession.builder
    .appName("SparkGlueCatalog")
    .config("spark.jars.packages", spark_packages)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config(
        "spark.hadoop.hive.metastore.client.factory.class",
        "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory",
    )
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
# Show the databases visible through the catalog
databases_df = spark.sql("SHOW DATABASES")
databases_df.show()

# Make meu_database the current database for unqualified table names
spark.sql("USE meu_database")

# Show the tables of the current database
tables_df = spark.sql("SHOW TABLES")
tables_df.show()

In [None]:
# Read the whole catalog table as a DataFrame
df = spark.table("meu_database.vendas")
df.show()

# Same table through SQL, restricted to the January 2024 partition
january_2024_sql = """
    SELECT * FROM meu_database.vendas
    WHERE ano = 2024 AND mes = 1
"""
df = spark.sql(january_2024_sql)
df.show()

In [None]:
# Keep only sales above 100 and save them as a partitioned Parquet table,
# replacing any existing content of meu_database.vendas_filtered
df_processed = df.filter(df["valor"] > 100)

(
    df_processed.write
    .mode("overwrite")
    .format("parquet")
    .partitionBy("ano", "mes")
    .saveAsTable("meu_database.vendas_filtered")
)

In [None]:
# Refresh the table's partition metadata (MSCK REPAIR)
repair_statement = "MSCK REPAIR TABLE meu_database.vendas"
spark.sql(repair_statement)

## 6. Queries SQL Úteis no Athena

In [None]:
# Reference collection of Athena SQL statements, kept as plain text and only
# printed below — nothing in this cell is sent to Athena.
# Sections: external table DDL, partition discovery, CTAS, INSERT INTO,
# Iceberg maintenance (commented out) and three analysis queries.
queries_athena = '''
-- ============================================================================
-- CRIAR TABELA EXTERNA
-- ============================================================================
CREATE EXTERNAL TABLE IF NOT EXISTS vendas (
    id BIGINT,
    produto STRING,
    valor DOUBLE,
    quantidade INT
)
PARTITIONED BY (ano INT, mes INT)
STORED AS PARQUET
LOCATION 's3://my-bucket/data/vendas/'
TBLPROPERTIES ('parquet.compression'='SNAPPY');

-- ============================================================================
-- DESCOBRIR PARTIÇÕES
-- ============================================================================
MSCK REPAIR TABLE vendas;

-- Ou adicionar manualmente
ALTER TABLE vendas ADD PARTITION (ano=2024, mes=1)
LOCATION 's3://my-bucket/data/vendas/ano=2024/mes=1/';

-- ============================================================================
-- CTAS - Create Table As Select
-- ============================================================================
CREATE TABLE vendas_2024
WITH (
    format = 'PARQUET',
    parquet_compression = 'SNAPPY',
    external_location = 's3://my-bucket/processed/vendas_2024/'
) AS
SELECT * FROM vendas WHERE ano = 2024;

-- ============================================================================
-- INSERT INTO
-- ============================================================================
INSERT INTO vendas_processed
SELECT 
    id,
    produto,
    valor * quantidade as total,
    ano,
    mes
FROM vendas
WHERE ano = 2024;

-- ============================================================================
-- OPTIMIZE (Iceberg)
-- ============================================================================
-- OPTIMIZE vendas REWRITE DATA USING BIN_PACK;
-- VACUUM vendas;

-- ============================================================================
-- QUERIES DE ANÁLISE
-- ============================================================================
-- Top produtos
SELECT 
    produto,
    SUM(valor * quantidade) as total_vendas
FROM vendas
GROUP BY produto
ORDER BY total_vendas DESC
LIMIT 10;

-- Vendas por mês
SELECT 
    ano,
    mes,
    SUM(valor * quantidade) as total
FROM vendas
GROUP BY ano, mes
ORDER BY ano, mes;

-- Window function
SELECT 
    produto,
    mes,
    SUM(valor) as total_mes,
    SUM(SUM(valor)) OVER (PARTITION BY produto ORDER BY mes) as acumulado
FROM vendas
GROUP BY produto, mes;
'''

# Display the statements for copy/paste into the Athena console
print(queries_athena)