In [1]:
from pyspark.sql import SparkSession
import os
from utils import encrypt

In [2]:
# Create (or reuse) the local Spark session used by every cell below.
# Hive support is enabled so that `raw.*` / `trusted.*` can be addressed as tables.
spark = (
    SparkSession.builder
    .appName('trusted_order')
    .master("local[*]")
    .config("spark.sql.streaming.schemaInference", True)
    .config("spark.sql.warehouse.dir", '/opt/workspace/')
    .config("spark.sql.sources.partitionOverwriteMode", 'dynamic')
    .enableHiveSupport()
    .getOrCreate()
)

In [3]:
# Expose the Python `encrypt` helper to Spark SQL under the name "encrypt",
# so the query below can call encrypt(<column>) directly.
spark.udf.register(name="encrypt", f=encrypt)

<function utils.encrypt(string)>

In [79]:
def validate(df):
    """Check that every ``order_id`` in ``df`` occurs exactly once.

    Compares the number of rows with the number of distinct ``order_id``
    values. Each ``count()`` is an action, so calling this on a lazy
    DataFrame evaluates it.

    Args:
        df: DataFrame-like object exposing ``select`` with an ``order_id`` column.

    Raises:
        ValueError: if at least one ``order_id`` appears more than once.
    """
    order_ids = df.select('order_id')
    n_rows = order_ids.count()
    n_unique = order_ids.drop_duplicates().count()

    if n_rows == n_unique:
        print('Valid df')
        return

    raise ValueError('Some order IDs are not unique!')

In [82]:
# Build the trusted order dataset: deduplicated orders enriched with hashed
# customer data, merchant attributes and the latest known order status.
# The DataFrame is lazy; nothing is computed until an action runs on it.
#
# Fix vs. the previous version: `local_merchant_date_partition` used to apply
# from_utc_timestamp(..., merchant_timezone) to `local_order_timestamp`, which
# is already expressed in the merchant's timezone. The offset was therefore
# applied twice and orders close to local midnight could be assigned to the
# wrong date partition. The partition is now the date of the local timestamp.
trusted_order_query = spark.sql("""

-- Most recent status event timestamp per order.
with last_order_status_event_timestamp as (
    select
        max(to_timestamp(created_at)) as last_event_date
        , order_id
    from raw.status
    group by 2
),

-- Status value(s) recorded at that most recent timestamp.
-- NOTE(review): two different values sharing the same latest timestamp would
-- yield two rows for one order; validate() is what would catch that.
last_order_status as (
    select 
        min(s.created_at) as event_date
        , s.order_id
        , s.value
    from raw.status s 
        join last_order_status_event_timestamp loset on loset.last_event_date = s.created_at
            and loset.order_id = s.order_id
    group by 2,3),

-- Customer attributes with name and phone number passed through encrypt().
hashed_customer_data as (
    select 
        encrypt(c.customer_name) as hashed_customer_name
        , encrypt(c.customer_phone_number) as hashed_customer_phone_number
        , c.customer_phone_area
        , c.active as customer_current_state
        , c.created_at as customer_created_at
        , c.customer_id
        , c.language as customer_language
    from raw.consumer c
),

-- One encrypted CPF per customer, taken from the order records.
hashed_customer_cpf as (
    select
        encrypt(first(cpf)) as hashed_customer_cpf
        , customer_id
    from raw.order
    group by 2
),

-- Collapse repeated order rows. The GROUP BY uses ordinals 4-21, so the
-- position of the columns in this select list must not change.
deduplicated_raw_order as (
    select
        first(o.cpf) as cpf
        , min(o.order_created_at) as order_created_at
        -- order_created_at is treated as UTC and shifted to the merchant's timezone
        , from_utc_timestamp(to_timestamp(min(o.order_created_at)), o.merchant_timezone) as local_order_timestamp
        , o.delivery_address_city
        , o.delivery_address_country
        , o.delivery_address_district
        , o.delivery_address_external_id
        , o.delivery_address_latitude
        , o.delivery_address_longitude
        , o.delivery_address_state
        , o.delivery_address_zip_code
        , o.merchant_latitude
        , o.merchant_longitude
        , o.merchant_timezone
        , o.order_id
        , o.order_scheduled
        , o.order_scheduled_date
        , o.order_total_amount
        , o.origin_platform
        , o.customer_id
        , o.merchant_id
    from raw.order o 
    group by 4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
)
select

    dro.*
    , hcpf.hashed_customer_cpf
    , hc.hashed_customer_phone_number
    , hc.customer_phone_area
    , hc.customer_current_state
    , hc.customer_created_at
    , hc.customer_language
    , hc.hashed_customer_name
    , r.created_at as merchant_created_at
    , r.enabled as merchant_current_state
    , r.price_range as merchant_price_range
    , r.average_ticket as merchant_average_ticket
    , r.takeout_time as merchant_takeout_time
    , r.delivery_time as merchant_delivery_time
    , r.minimum_order_value as merchant_minimum_order_value
    , r.merchant_zip_code
    , r.merchant_city
    , r.merchant_state
    , r.merchant_country
    -- NOTE(review): 'unknwon' looks like a typo for 'unknown'; left untouched
    -- because downstream consumers may match on the stored value.
    , coalesce(los.value, 'unknwon') as order_last_state
    -- local_order_timestamp is already merchant-local: do not shift it again
    , to_date(dro.local_order_timestamp) as local_merchant_date_partition
from deduplicated_raw_order dro
    join hashed_customer_data hc on hc.customer_id = dro.customer_id
    join hashed_customer_cpf hcpf on hcpf.customer_id = dro.customer_id
    join raw.restaurant r on r.id = dro.merchant_id
    left join last_order_status los on los.order_id = dro.order_id
""")

# Abort before persisting anything if an order_id shows up more than once.
validate(trusted_order_query)

# Persist the result as a parquet table, partitioned by merchant-local order date.
(
    trusted_order_query.write
    .partitionBy('local_merchant_date_partition')
    .format('parquet')
    .mode('overwrite')
    .saveAsTable('trusted.orders')
)

2435488
2435488
Valid df


In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()