<img src="https://github.com/mousastech/medallion/blob/fd1da67c7e3e3829e0ea84fc51c6c79a02e408da/imgs/Medallion.png?raw=true">

# Medallion Architecture
With Unity Catalog

<img src="https://github.com/mousastech/medallion/blob/92d8750f657288477d48ba7e07ac8c8340d49cf3/imgs/architecture.png?raw=true">

[Reference](https://www.databricks.com/glossary/medallion-architecture)

In [0]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType

In [0]:
# Unity Catalog namespace (catalog + schema) that the Silver tables built in
# this notebook are written to. Per the original author's note, access to
# these objects is governed by Unity Catalog permissions.

catalog, schema = "ecommerce", "silver"

# Loading the Silver layer
Filtered, cleaned, and augmented data from Bronze


In [0]:
from pyspark.sql.functions import current_timestamp

# Fully qualified name of the Silver table this cell (re)writes.
table_name = f"{catalog}.{schema}.customers_silver"

# Load the Bronze customers table and stamp each row with the load time.
df_customersSilver = (
    spark.read.table("ecommerce.bbronze.customers_bronze")
    .withColumn("date_load", current_timestamp())
)

# Overwrite the Silver Delta table, with the mergeSchema option set.
(
    df_customersSilver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(table_name)
)

In [0]:
from pyspark.sql.functions import current_timestamp

# Fully qualified name of the Silver table this cell (re)writes.
table_name = f"{catalog}.{schema}.geolocation_silver"

# Load the Bronze geolocation table and stamp each row with the load time.
df_geolocationSilver = (
    spark.read.table("ecommerce.bbronze.geolocation_bronze")
    .withColumn("date_load", current_timestamp())
)

# Overwrite the Silver Delta table, with the mergeSchema option set.
(
    df_geolocationSilver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(table_name)
)

In [0]:
# Avoid duplicates in Silver: keep only the rows that belong to the most
# recent Bronze load. DENSE_RANK over date_load DESC assigns rank 1 to every
# row sharing the latest date_load, so older loads are filtered out.
# NOTE(review): this assumes customers_bronze exposes a date_load column —
# confirm against the Bronze ingestion notebook.
query = '''
   SELECT
        customer_id,
        customer_unique_id,
        customer_zip_code_prefix,
        customer_city,
        customer_state,
        CAST(date_load AS TIMESTAMP) AS date_load
   FROM
      (
        SELECT 
            DENSE_RANK() OVER(ORDER BY date_load DESC) AS rank, * 
        FROM ecommerce.bbronze.customers_bronze
      ) AS C
   WHERE
        C.rank = 1
'''

# Execute the SQL query and create a DataFrame holding the latest load only.
customersSilver = spark.sql(query)

table_name = f"{catalog}.{schema}.customers_silver"

# Write the query result itself. The previous version wrote
# `df_customersSilver` (the unfiltered frame from an earlier cell), so the
# de-duplication above never reached the Silver table.
customersSilver.write.format("delta") \
    .mode("overwrite") \
    .option("mergeSchema", "true") \
    .saveAsTable(table_name)

In [0]:
from pyspark.sql.functions import current_timestamp

# Fully qualified name of the Silver table this cell (re)writes.
table_name = f"{catalog}.{schema}.order_items_silver"

# Load the Bronze order items table and stamp each row with the load time.
df_order_itemsSilver = (
    spark.read.table("ecommerce.bbronze.order_items_bronze")
    .withColumn("date_load", current_timestamp())
)

# Overwrite the Silver Delta table, with the mergeSchema option set.
(
    df_order_itemsSilver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(table_name)
)

In [0]:
from pyspark.sql.functions import current_timestamp

# Fully qualified name of the Silver table this cell (re)writes.
table_name = f"{catalog}.{schema}.order_payments_silver"

# Load the Bronze order payments table and stamp each row with the load time.
df_order_paymentsSilver = (
    spark.read.table("ecommerce.bbronze.order_payments_bronze")
    .withColumn("date_load", current_timestamp())
)

# Overwrite the Silver Delta table, with the mergeSchema option set.
(
    df_order_paymentsSilver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(table_name)
)


In [0]:
from pyspark.sql.functions import current_timestamp

# Fully qualified name of the Silver table this cell (re)writes.
table_name = f"{catalog}.{schema}.order_reviews_silver"

# Load the Bronze order reviews table and stamp each row with the load time.
df_order_reviewsSilver = (
    spark.read.table("ecommerce.bbronze.order_reviews_bronze")
    .withColumn("date_load", current_timestamp())
)

# Overwrite the Silver Delta table, with the mergeSchema option set.
(
    df_order_reviewsSilver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(table_name)
)

In [0]:
from pyspark.sql.functions import current_timestamp

# Fully qualified name of the Silver table this cell (re)writes.
table_name = f"{catalog}.{schema}.orders_silver"

# Load the Bronze orders table and stamp each row with the load time.
df_ordersSilver = (
    spark.read.table("ecommerce.bbronze.orders_bronze")
    .withColumn("date_load", current_timestamp())
)

# Overwrite the Silver Delta table, with the mergeSchema option set.
(
    df_ordersSilver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(table_name)
)

In [0]:
from pyspark.sql.functions import current_timestamp

# Fully qualified name of the Silver table this cell (re)writes.
table_name = f"{catalog}.{schema}.products_silver"

# Load the Bronze products table and stamp each row with the load time.
df_productsSilver = (
    spark.read.table("ecommerce.bbronze.products_bronze")
    .withColumn("date_load", current_timestamp())
)

# Overwrite the Silver Delta table, with the mergeSchema option set.
(
    df_productsSilver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(table_name)
)

In [0]:
from pyspark.sql.functions import current_timestamp

# Fully qualified name of the Silver table this cell (re)writes.
table_name = f"{catalog}.{schema}.sellers_silver"

# Load the Bronze sellers table and stamp each row with the load time.
df_sellersSilver = (
    spark.read.table("ecommerce.bbronze.sellers_bronze")
    .withColumn("date_load", current_timestamp())
)

# Overwrite the Silver Delta table, with the mergeSchema option set.
(
    df_sellersSilver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(table_name)
)

In [0]:
%sql

-- Build the Gold layer: one sales table that joins orders with their
-- payments, reviews and customers, partitioned by customer state.
-- NOTE(review): payments and reviews are both joined on orderId, so an order
-- with several payments and several reviews yields one row per combination.

CREATE OR REPLACE TABLE tutorial.original.sales
USING DELTA
PARTITIONED BY (estadoCliente)
AS
SELECT
  -- Order status translated to Portuguese; any other status yields NULL.
  CASE orders.orderStatus
    WHEN 'shipped'     THEN 'enviado'
    WHEN 'canceled'    THEN 'cancelado'
    WHEN 'invoiced'    THEN 'faturado'
    WHEN 'created'     THEN 'criado'
    WHEN 'delivered'   THEN 'entregue'
    WHEN 'unavailable' THEN 'indisponível'
    WHEN 'processing'  THEN 'em processamento'
    WHEN 'approved'    THEN 'aprovado'
  END AS statusDoPedido,
  orders.orderPurchaseTimestamp     AS horaCompraPedido,
  orders.orderApprovedAt            AS horaPedidoAprovado,
  orders.orderEstimatedDeliveryDate AS dataEstimadaEntrega,
  -- Days between approval and the estimated delivery date.
  DATEDIFF(orders.orderEstimatedDeliveryDate, orders.orderApprovedAt) AS dataEntregaEmDias,
  order_reviews.reviewScore           AS notaProduto,
  order_reviews.reviewAnswerTimestamp AS dataComentarioSobreProduto,
  -- Payment type translated to Portuguese; any other type yields NULL.
  CASE order_payments.paymentType
    WHEN 'credit_card' THEN 'cartao_de_credito'
    WHEN 'boleto'      THEN 'boleto'
    WHEN 'not_defined' THEN 'não_definido'
    WHEN 'voucher'     THEN 'voucher'
    WHEN 'debit_card'  THEN 'cartao_de_debito'
  END AS meioDePagamento,
  order_payments.paymentInstallments AS parcelamento,
  order_payments.paymentValue        AS valorPago,
  customers.customerCity             AS cidadeCliente,
  customers.customerState            AS estadoCliente
FROM tutorial.original.orders AS orders
LEFT JOIN tutorial.original.order_payments AS order_payments
  ON order_payments.orderId = orders.orderId
LEFT JOIN tutorial.original.order_reviews AS order_reviews
  ON order_reviews.orderId = orders.orderId
LEFT JOIN tutorial.original.customers AS customers
  ON customers.customerId = orders.customerId

In [0]:
%sql
-- Preview the first ten rows of the sales table.
SELECT *
FROM tutorial.original.sales
LIMIT 10

# Visualizações

In [0]:
%sql

-- Number of 'entregue' (delivered) sales rows per customer state and payment
-- method. The count is exposed under the existing `percentual` alias.
SELECT
  estadoCliente   AS estados,
  meioDePagamento AS `meio de pagamento`,
  COUNT(*)        AS `percentual`
FROM tutorial.original.sales
WHERE statusDoPedido = "entregue"
  AND meioDePagamento IS NOT NULL
  AND YEAR(horaPedidoAprovado) IS NOT NULL
GROUP BY estadoCliente, meioDePagamento

Databricks visualization. Run in Databricks to view.

In [0]:
%sql

-- Average of dataEntregaEmDias per customer state, rounded to whole days,
-- over rows that have a payment method and an approval year and whose status
-- is not 'cancelado'.
WITH T AS (
  SELECT
    estadoCliente,
    ROUND(AVG(dataEntregaEmDias), 0) AS DiasEntrega
  FROM tutorial.original.sales
  WHERE meioDePagamento IS NOT NULL
    AND YEAR(horaPedidoAprovado) IS NOT NULL
    AND statusDoPedido <> "cancelado"
  GROUP BY estadoCliente
)
SELECT
  T.estadoCliente AS estados,
  T.diasEntrega   AS `média de dias para entrega de produto`
FROM T

Databricks visualization. Run in Databricks to view.

In [0]:
%sql
-- Number of 'entregue' (delivered) sales rows per customer state whose
-- approval timestamp falls in 2018.
SELECT
  estadoCliente            AS `Estado`,
  YEAR(horaPedidoAprovado) AS `Ano`,
  COUNT(*)                 AS `Numero de Vendas`
FROM tutorial.original.sales
WHERE YEAR(horaPedidoAprovado) = "2018"
  AND statusDoPedido = "entregue"
GROUP BY Estado, Ano


Databricks visualization. Run in Databricks to view.

In [0]:
%sql

-- Average ticket per approval month for 'entregue' (delivered) rows in 2017:
-- total paid value divided by the number of rows, rounded to two decimals.
SELECT
  ROUND(SUM(valorPago) / COUNT(*), 2) AS `Ticket Médio`,
  MONTH(horaPedidoAprovado)
FROM tutorial.original.sales
WHERE Year(horaPedidoAprovado) = "2017"
  AND statusDoPedido = "entregue"
GROUP BY MONTH(horaPedidoAprovado)

Databricks visualization. Run in Databricks to view.

In [0]:
%sql
-- Customers from the most recent raw load only: DENSE_RANK over DataRawLoad
-- DESC gives rank 1 to every row sharing the latest DataRawLoad value.
WITH C AS (
  SELECT
    DENSE_RANK() OVER (ORDER BY DataRawLoad DESC) AS rank,
    *
  FROM tutorial.original.customers
)
SELECT
  customerId,
  customerUniqueId,
  customerZipCodePrefix,
  customerCity,
  customerState,
  CAST(DataRawLoad AS TIMESTAMP) AS DataRawLoad
FROM C
WHERE C.rank = 1

In [0]:
%sql
-- Order items from the most recent raw load only: DENSE_RANK over DataRawLoad
-- DESC gives rank 1 to every row sharing the latest DataRawLoad value.
WITH O AS (
  SELECT
    DENSE_RANK() OVER (ORDER BY DataRawLoad DESC) AS rank,
    *
  FROM tutorial.original.order_items
)
SELECT
  orderId,
  orderItemId,
  productId,
  sellerId,
  shippingLimitDate,
  price,
  freightValue,
  CAST(DataRawLoad AS TIMESTAMP) AS DataRawLoad
FROM O
WHERE O.rank = 1
;