In [1]:
# Libs import
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pyspark
from pyspark.sql.utils import AnalysisException
import os
import sys
from pathlib import Path
# Add /opt/workspace/ to the module search path so modules stored there can be imported
sys.path.append('/opt/workspace/')

In [2]:
# Run from the workspace root so that custom libraries resolve from there.
os.chdir('/opt/workspace/')

# Spark session configuration:
#   * master                                   -> the Spark master container
#   * spark.sql.streaming.schemaInference      -> let Spark infer the data schema
#   * spark.sql.warehouse.dir                  -> warehouse lives in /opt/workspace/Warehouse
#   * spark.sql.sources.partitionOverwriteMode -> 'dynamic', so an overwrite only replaces the
#                                                 partitions that receive new data
spark = (
    SparkSession.builder
    .appName('[CURATED] Orders')
    .master("spark://spark-master:7077")
    .config("spark.sql.streaming.schemaInference", True)
    .config("spark.sql.warehouse.dir", '/opt/workspace/Warehouse')
    .enableHiveSupport()
    .config("spark.sql.sources.partitionOverwriteMode", 'dynamic')
    .getOrCreate()
)


In [None]:
# Original query, written for the case where id_pedido is a primary key: the items of an order
# are packed into an array column (one map per item holding id_produto, qtd and vr_unitario)
# so that the order attributes are not repeated once per item.
#
# NOTE(review): the GROUP BY also contains id_categoria and id_subcategoria, so an order whose
#   items span several subcategories yields several rows -- confirm this matches the
#   "one row per id_pedido" intent.
# NOTE(review): round() is called without a scale, so order_partner_value is rounded to a
#   whole number -- confirm that dropping the decimals is intended.
orders_query = """
    select 
        o.order_unique_id,
        o.id_pedido,
        o.id_parceiro,
        o.id_cliente,
        o.id_filial,
        f.id_cidade,
        e.id_estado,
        c.id_categoria as categoria,
        sc.id_subcategoria as subcategoria,
        o.partition_date,
        collect_list(map('id_produto', o.id_produto,
                        'qtd', o.quantidade,
                        'vr_unitario', o.vr_unitario)) as order_partner_items,
        round(o.vr_total_pago) as order_partner_value
        from curated.normalized_orders o
            join raw.produto p on p.id_produto = o.id_produto
            join raw.subcategoria sc on sc.id_subcategoria = p.id_subcategoria
            join raw.categoria c on c.id_categoria = sc.id_categoria
            join raw.filial f on f.id_filial = o.id_filial
            join raw.cidade ci on ci.id_cidade = f.id_cidade
            join raw.estado e on e.id_estado = ci.id_estado
        group by o.order_unique_id, o.id_pedido, o.id_parceiro, o.id_cliente, o.id_filial, f.id_cidade, e.id_estado, o.partition_date,
        c.id_categoria, sc.id_subcategoria, vr_total_pago
"""

df = spark.sql(orders_query)

# Persist the result as the parquet table curated.orders, partitioned by partition_date.
# NOTE(review): overwrite mode combined with saveAsTable may replace the whole table rather
#   than individual partitions -- confirm against the Spark version in use.
df.write.saveAsTable(
    'curated.orders',
    format='parquet',
    mode='overwrite',
    partitionBy='partition_date',
)

In [None]:
# Sanity check: display the first 20 rows of the table that was just written.
orders_preview = spark.sql('select * from curated.orders limit 20')
orders_preview.show()

In [12]:
# Stop the Spark session; this is the last step of the job
spark.stop()