In [0]:
from pyspark.sql import functions as f
from pyspark.sql.types import *
from pyspark.sql import Window as Window
import re
import time

In [0]:
%run ./../../ingestion/structs/nb_schemas

In [0]:
# Input parameters: Silver-layer source table
src_catalog_name    = 'silver'
src_schema_name     = 'refined'
src_table_name      = 'global_superstore'

# Gold-layer target parameters
gold_catalog_name   = 'gold'
gold_schema_name    = 'analytics'
gold_table_prefix   = 'global_superstore'

# Root folder for the Parquet exports. It is derived from the catalog/schema
# settings above (instead of being hard-coded) so that changing them cannot
# leave this path pointing at a different catalog or schema.
gold_volume_name    = 'parquets_files'
gold_parquet_tables = f'/Volumes/{gold_catalog_name}/{gold_schema_name}/{gold_volume_name}'

In [0]:

# Fully qualified name of the Silver source table; the table itself is read in
# the next cell, so it is not loaded a second time here.
table_path = f"{src_catalog_name}.{src_schema_name}.{src_table_name}"

# Make sure the Gold catalog, schema and Parquet export volume exist before
# anything is written to them. Each statement is a no-op when the object exists.
spark.sql(f"CREATE CATALOG IF NOT EXISTS {gold_catalog_name}")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {gold_catalog_name}.{gold_schema_name}")
spark.sql(f"CREATE VOLUME IF NOT EXISTS {gold_catalog_name}.{gold_schema_name}.parquets_files")

In [0]:
# Load the Silver source table; the aggregations in the following cells all start from `df`.
df = spark.read.table(table_path)

#### Evolução anual das vendas, quantidades vendidas e preço médio por transação. 

In [0]:
# Calculated column: total value of each order, i.e. the sum of `sales` over
# all rows that share the same order_id.
total_order_value = (
    df
    .groupBy('order_id')
    .agg(
        # Cast after summing so the stored column has a fixed decimal(12,2) type.
        f.sum(f.col('sales')).cast('decimal(12,2)').alias('total_order_value')
    )
)

# Publish the result as a Delta table. overwriteSchema allows the overwrite to
# succeed even if a previous version of the table was written with a different
# column type; the other Gold table writes in this notebook set it as well.
(
    total_order_value
    .write
    .format('delta')
    .mode("overwrite")
    .option('overwriteSchema', True)
    .saveAsTable(f"{gold_catalog_name}.{gold_schema_name}.{gold_table_prefix}_total_order_value")
)

# NOTE(review): short pause between the two writes; its purpose is not evident
# from this notebook — confirm whether it is still needed.
time.sleep(0.1)

# Export the same result as Parquet under the Gold volume path. coalesce(1)
# collapses the data into a single partition before it is written.
(
    total_order_value
    .coalesce(1)
    .write
    .format('parquet')
    .mode("overwrite")
    .save(f"{gold_parquet_tables}/total_order_value")
)


#### Consulta: Métricas Anuais por produto

In [0]:
# 1. Query: annual metrics per product
# Tag every row with the calendar year of its order date.
sales_with_year = df.withColumn("year", f.year(f.col("order_date")))

# Aggregates computed for each (year, product_id) pair.
annual_aggregates = [
    f.round(f.sum("sales"), 2).alias("total_sales"),
    f.sum("quantity").alias("total_quantity"),
    f.round(f.avg("sales"), 2).alias("avg_sales"),
]

df_annual = (
    sales_with_year
    .groupBy("year", 'product_id')
    .agg(*annual_aggregates)
    .orderBy("year")
)

# Publish as a Gold table, replacing both data and schema of any previous version.
annual_table_name = f"{gold_catalog_name}.{gold_schema_name}.{gold_table_prefix}_annual_metrics"
annual_table_writer = df_annual.write.mode("overwrite").option('overwriteSchema', True)
annual_table_writer.saveAsTable(annual_table_name)

# Short pause between the two writes, kept exactly as in the original flow.
time.sleep(0.1)

# Parquet export under the Gold volume path: data is collapsed into one
# partition, then written with one sub-folder per year.
annual_parquet_path = f"{gold_parquet_tables}/annual_metrics"
annual_parquet_writer = (
    df_annual
    .coalesce(1)
    .write
    .format('parquet')
    .partitionBy('year')
    .mode("overwrite")
)
annual_parquet_writer.save(annual_parquet_path)

In [0]:
# Aggregate the data into sales statistics: total sales per category and month.

df_category = (
    df
    .groupBy(
        'category',
        # Format the order date as 'yyyy-MM' explicitly rather than slicing the
        # first 7 characters of its implicit string form.
        # NOTE(review): assumes order_date is a date/timestamp (or a string Spark
        # can cast to one), as the annual-metrics cell's f.year() already does.
        f.date_format(f.col('order_date'), 'yyyy-MM').alias('month'),
    )
    .agg(
        # Rounded to 2 decimals, like total_sales in the annual metrics table.
        f.round(f.sum("sales"), 2).alias("total_sales")
    )
)

# Publish as a Gold table, replacing both data and schema of any previous version.
(
    df_category
    .write
    .mode("overwrite")
    .option('overwriteSchema', True)
    .saveAsTable(f"{gold_catalog_name}.{gold_schema_name}.{gold_table_prefix}_category")
)


# NOTE(review): short pause between the two writes; its purpose is not evident
# from this notebook — confirm whether it is still needed.
time.sleep(0.1)

# Parquet export under the Gold volume path: data is collapsed into one
# partition, then written with one sub-folder per month.
(
    df_category
    .coalesce(1)
    .write
    .format('parquet')
    .partitionBy('month')
    .mode("overwrite")
    .save(f"{gold_parquet_tables}/category")
)