In [1]:
# Libs import
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pyspark
from pyspark.sql.utils import AnalysisException
import os
import sys
from pathlib import Path
sys.path.append('/opt/workspace/')

In [2]:
# Changing working directory to root to use custom libs
os.chdir('/opt/workspace/')

#Spark Configurations
    # Sets Session to use spark master container
    # Sets Session to use warehouse directory in /opt/workspace/Warehouse and to infer data Schema
    # Sets overide mode to dynamic, so we can append data and overwrite old data based on partition
spark = SparkSession.builder.appName('[CURATED] Commission By Order') \
        .master("spark://spark-master:7077") \
        .config("spark.sql.streaming.schemaInference", True) \
        .config("spark.sql.warehouse.dir", '/opt/workspace/Warehouse') \
        .enableHiveSupport() \
        .config("spark.sql.sources.partitionOverwriteMode", 'dynamic') \
        .getOrCreate()


In [5]:
df = spark.sql(
    """
    select 
        o.id_pedido,
        o.order_partner_value * 6 * 0.01 as order_commission,
        o.id_parceiro,
        o.partition
    from curated.normalized_orders o
        join raw.categoria c on c.id_categoria = o.categoria
""")

df.write.partitionBy('partition').format('parquet').mode('overwrite').saveAsTable('curated.orders_commission')

In [6]:
spark.sql('select * from curated.orders_commission').show()

+---------+------------------+-----------+----------+
|id_pedido|   order_comission|id_parceiro| partition|
+---------+------------------+-----------+----------+
|481550820|               3.9|         16|2021-07-06|
|482017030|             13.98|         13|2021-07-06|
|481664475|               3.9|         16|2021-07-06|
|481603427|              9.36|         16|2021-07-06|
|482079090|               3.9|         16|2021-07-06|
|481701312|              78.0|          6|2021-07-06|
|481587995|             16.38|         13|2021-07-06|
|481825035|               4.2|         16|2021-07-06|
|481757630|               6.9|          6|2021-07-06|
|481921600|               4.2|         16|2021-07-06|
|481666762|350.28000000000003|         13|2021-07-06|
|481998565|               8.1|         16|2021-07-06|
|482092212|              5.82|         16|2021-07-06|
|482049545|              9.66|         16|2021-07-06|
|482085090|               4.2|         16|2021-07-06|
|481517390|              4.3

In [7]:
# Stops spark client and finishes the job
spark.stop()