# ABInBev Case - Interactive Queries

Use this notebook to query the data processed by the pipeline. Tables are loaded as temporary views.

In [None]:
import os
from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession with Delta Lake enabled: the Delta package
# is added to the session jars, and the Delta SQL extension and catalog are
# registered so that format("delta") reads work.
_delta_settings = {
    "spark.jars.packages": "io.delta:delta-spark_2.12:3.2.0",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}
_builder = SparkSession.builder.appName("ABInBev_Interactive_Query")
for _option, _setting in _delta_settings.items():
    _builder = _builder.config(_option, _setting)
spark = _builder.getOrCreate()

# Base data path, resolved against the current working directory: prefer the
# sibling "../data" folder when it exists, otherwise fall back to "data"
# (i.e. the notebook is being run from the project root).
if os.path.exists("../data"):
    DATA_DIR = "../data"
else:
    DATA_DIR = "data"

print(f"Spark Version: {spark.version}")
print(f"Data Directory: {DATA_DIR}")

## 1. Load Tables
Load the Delta tables from the Silver, Gold, and Consumption layers and register each one as a temporary view.

In [None]:
# Temp-view name -> Delta table location relative to DATA_DIR.
# Insertion order is the load order: Silver, Gold, then the Consumption
# dimensions and fact table.
_delta_tables = {
    # Silver
    "silver_sales": "silver/silver_sales_enriched",
    "silver_channels": "silver/silver_channel_features",
    # Gold
    "gold_sales": "gold/gold_sales_enriched",
    # Consumption (Dimensions & Facts)
    "dim_date": "consumption/dim_date",
    "dim_product": "consumption/dim_product",
    "dim_region": "consumption/dim_region",
    "dim_channel": "consumption/dim_channel",
    "fact_sales": "consumption/fact_sales",
}

# Read each Delta table and expose it to spark.sql() under its view name,
# replacing any view of the same name left over from a previous run.
for _view_name, _relative_path in _delta_tables.items():
    _table_df = spark.read.format("delta").load(f"{DATA_DIR}/{_relative_path}")
    _table_df.createOrReplaceTempView(_view_name)

print("Tables loaded and registered as Temp Views!")

## 2. Business Questions
### 2.1 Top 3 Trade Groups by Region

In [None]:
# Top 3 trade groups per region by total dollar volume.
#
# Step 1 (volume_by_region_group): aggregate fact_sales dollar volume per
#   region / trade group through the region and channel dimensions.
# Step 2 (ranked): number the trade groups inside each region from highest to
#   lowest volume; trade_group_desc is a tie-breaker so the ranking is
#   deterministic when two groups have the same total.
# Step 3: keep only ranks 1-3 for every region. The output columns are the
#   same three columns as before (region, trade group, total volume).
query_1 = """
WITH volume_by_region_group AS (
    SELECT
        r.region_name,
        c.trade_group_desc,
        SUM(f.dollar_volume) AS total_dollar_volume
    FROM fact_sales f
    JOIN dim_region r ON f.region_key = r.region_key
    JOIN dim_channel c ON f.channel_key = c.channel_key
    GROUP BY r.region_name, c.trade_group_desc
),
ranked AS (
    SELECT
        region_name,
        trade_group_desc,
        total_dollar_volume,
        ROW_NUMBER() OVER (
            PARTITION BY region_name
            ORDER BY total_dollar_volume DESC, trade_group_desc
        ) AS volume_rank
    FROM volume_by_region_group
)
SELECT
    region_name,
    trade_group_desc,
    total_dollar_volume
FROM ranked
WHERE volume_rank <= 3
ORDER BY region_name, volume_rank
"""
spark.sql(query_1).show()

### 2.2 Sales by Brand per Month

In [None]:
# Total dollar volume per brand for each year/month combination.
# fact_sales is joined to dim_product (for the brand name) and to dim_date
# (for year and month); rows are sorted by brand, then by year and month.
query_2 = """
SELECT 
    p.brand_nm,
    d.year,
    d.month,
    SUM(f.dollar_volume) as total_volume
FROM fact_sales f
JOIN dim_product p ON f.product_key = p.product_key
JOIN dim_date d ON f.date_key = d.date_key
GROUP BY p.brand_nm, d.year, d.month
ORDER BY p.brand_nm, d.year, d.month
"""
# Run the query and print the result to the cell output.
# NOTE(review): show() with no arguments prints only the first rows of the
# result (20 by default per the PySpark docs) — pass a larger n to see more.
spark.sql(query_2).show()