In [1]:
from pyspark.sql import SparkSession, dataframe
from pyspark.sql.functions import when, col, sum, count, isnan, round
from pyspark.sql.functions import regexp_replace, concat_ws, sha2, rtrim, substring
from pyspark.sql.functions import unix_timestamp, from_unixtime, to_date
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType
from pyspark.sql import HiveContext

import os
import re

from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import when

# Local Spark session using every available core, with Hive support enabled;
# getOrCreate() reuses a session that is already running in this kernel.
spark = (
    SparkSession.builder
    .master("local[*]")
    .enableHiveSupport()
    .getOrCreate()
)

In [36]:
def salvar_df(df, file):
    """Export ``df`` as a single ';'-delimited CSV and copy it to the local gold folder.

    The DataFrame is coalesced to one partition and written (overwrite mode,
    with header) to ``/datalake/gold/<file>/``. The resulting ``part-*`` file
    is then fetched with ``hdfs dfs -get`` into
    ``/input/projeto_hive/gold/<file>.csv``.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        DataFrame to export.
    file : str
        Dataset name, used both as the folder name under ``/datalake/gold/``
        and as the base name of the local CSV file.

    Raises
    ------
    RuntimeError
        If ``hdfs dfs -get`` exits with a non-zero status.
    FileNotFoundError
        If the ``hdfs`` executable cannot be found.
    """
    # Local import: the notebook's import cell does not provide subprocess.
    import subprocess

    hdfs_dir = "/datalake/gold/" + file
    output = "/input/projeto_hive/gold/" + file
    local_csv = output + ".csv"

    # Argument lists (no shell) so that `file` is never interpreted by a shell;
    # the glob patterns are handed verbatim to `hdfs dfs`.
    erase = ["hdfs", "dfs", "-rm", output + "/*"]
    rename = ["hdfs", "dfs", "-get", hdfs_dir + "/part-*", local_csv]
    print(" ".join(rename))

    df.coalesce(1).write\
        .format("csv")\
        .option("header", True)\
        .option("delimiter", ";")\
        .mode("overwrite")\
        .save(hdfs_dir + "/")

    # Best-effort cleanup kept from the original implementation; its exit
    # status is deliberately not checked.
    # NOTE(review): this removes <gold>/<file>/* through `hdfs dfs`, which is
    # not the path the CSV is copied to -- confirm it is still needed.
    subprocess.run(erase)

    # `hdfs dfs -get` does not overwrite an existing destination, so a CSV left
    # by a previous run would silently stay in place. Remove it first.
    if os.path.isfile(local_csv):
        os.remove(local_csv)

    result = subprocess.run(rename)
    if result.returncode != 0:
        raise RuntimeError(
            "hdfs dfs -get failed with exit status %d while exporting '%s'"
            % (result.returncode, file)
        )

In [3]:
# Load the four source tables of the `desafio` database, one DataFrame each.
df_pedidos, df_categorias, df_clientes, df_item_pedidos = (
    spark.sql(query)
    for query in (
        "select * from desafio.pedidos",
        "select * from desafio.categorias",
        "select * from desafio.clientes",
        "select * from desafio.item_pedido",
    )
)

In [4]:
# Slice the date string into year / month / day columns (positions 1-4, 6-7
# and 9-10) and drop the header line that was loaded as a data row, i.e. the
# row whose dt_pedido holds the literal column name.
df_pedidos = df_pedidos.select(
    '*',
    substring('dt_pedido', 1, 4).alias('Ano'),
    substring('dt_pedido', 6, 2).alias('Mes'),
    substring('dt_pedido', 9, 2).alias('Dia'),
).filter(df_pedidos.dt_pedido != 'dt_pedido')

In [5]:
# Preview df_pedidos, which now carries the derived Ano / Mes / Dia columns.
df_pedidos.show()

+-----------+--------------------+-----------+----------+---------+-------------+----+---+---+
|  id_pedido|           dt_pedido|id_parceiro|id_cliente|id_filial|vr_total_pago| Ano|Mes|Dia|
+-----------+--------------------+-----------+----------+---------+-------------+----+---+---+
|47585891513|2021-06-23T00:00:...|         13|  33752975|      166|       318.48|2021| 06| 23|
|49524083516|2021-08-03T00:00:...|         16|  43670652|      166|       249.59|2021| 08| 03|
|49130099513|2021-07-26T00:00:...|         13|  47130570|      166|        64.87|2021| 07| 26|
|49579056516|2021-08-04T00:00:...|         16|  16190735|      497|        93.59|2021| 08| 04|
| 4868385806|2021-07-16T00:00:...|          6|  74672887|      497|       311.87|2021| 07| 16|
|49959025513|2021-08-13T00:00:...|         13|  64040325|      497|        64.99|2021| 08| 13|
|49579056513|2021-08-04T00:00:...|         13|  16190735|      497|       376.98|2021| 08| 04|
| 4962532626|2021-08-05T00:00:...|          6|   8

In [4]:
# Show the number of records in the table (only df_pedidos is counted here).
df_pedidos.count()

733472

In [6]:
# Show the first 10 records of df_pedidos.
df_pedidos.show(10)

+-----------+--------------------+-----------+----------+---------+-------------+
|  id_pedido|           dt_pedido|id_parceiro|id_cliente|id_filial|vr_total_pago|
+-----------+--------------------+-----------+----------+---------+-------------+
|  id_pedido|           dt_pedido|id_parceiro|id_cliente|id_filial|vr_total_pago|
|47585891513|2021-06-23T00:00:...|         13|  33752975|      166|       318.48|
|49524083516|2021-08-03T00:00:...|         16|  43670652|      166|       249.59|
|49130099513|2021-07-26T00:00:...|         13|  47130570|      166|        64.87|
|49579056516|2021-08-04T00:00:...|         16|  16190735|      497|        93.59|
| 4868385806|2021-07-16T00:00:...|          6|  74672887|      497|       311.87|
|49959025513|2021-08-13T00:00:...|         13|  64040325|      497|        64.99|
|49579056513|2021-08-04T00:00:...|         13|  16190735|      497|       376.98|
| 4962532626|2021-08-05T00:00:...|          6|   8571960|      497|       175.37|
| 4976142355|202

In [10]:
# Total value of the orders: a global (ungrouped) aggregation over
# vr_total_pago. `sum` is the pyspark.sql.functions aggregate imported at the
# top of the notebook, not the Python builtin.
df_pedidos.groupBy().agg(sum('vr_total_pago')).collect()

[Row(sum(vr_total_pago)=138499099.90943256)]

In [6]:
# Stage table, step 1: pair every order with its item lines (orders without
# items are dropped by the inner join).
df = df_pedidos.join(
    other=df_item_pedidos,
    on=df_pedidos.id_pedido == df_item_pedidos.id_pedido,
    how="inner",
)

In [4]:
# Preview the orders x items join. The previous cell content was the bare
# attribute `df.select`, which never calls anything and only displays the
# bound method; show() prints the joined rows.
df.show()

+-----------+--------------------+-----------+----------+---------+-------------+-----------+----------+----------+-----------+
|  id_pedido|           dt_pedido|id_parceiro|id_cliente|id_filial|vr_total_pago|  id_pedido|id_produto|quantidade|vr_unitario|
+-----------+--------------------+-----------+----------+---------+-------------+-----------+----------+----------+-----------+
|46626099016|2021-06-02T00:00:...|         16| 138633690|      547|       194.99|46626099016|   5077094|         1|     194.99|
|46630423213|2021-06-02T00:00:...|         13| 106568537|      875|        51.99|46630423213|   3335368|         1|      51.99|
|46630859716|2021-06-02T00:00:...|         16| 138757662|      884|        90.99|46630859716|   5053698|         1|      90.99|
|46631152013|2021-06-02T00:00:...|         13| 138459547|      494|       129.99|46631152013|   3488193|         1|     129.99|
|46635194213|2021-06-02T00:00:...|         13|   5534295|      194|        90.99|46635194213|   4149634|

In [7]:
# Stage table, step 2: attach customer attributes. The left join keeps order
# lines even when no matching customer row exists.
df_stage = df.join(
    other=df_clientes,
    on=df_pedidos.id_cliente == df_clientes.id_cliente,
    how="left",
)

In [18]:
# Preview 10 rows of the stage join (orders + items + customers).
df_stage.show(10)

+-----------+--------------------+-----------+----------+---------+-------------+----+---+---+-----------+----------+----------+-----------+----------+--------------------+---------+
|  id_pedido|           dt_pedido|id_parceiro|id_cliente|id_filial|vr_total_pago| Ano|Mes|Dia|  id_pedido|id_produto|quantidade|vr_unitario|id_cliente|          nm_cliente|flag_ouro|
+-----------+--------------------+-----------+----------+---------+-------------+----+---+---+-----------+----------+----------+-----------+----------+--------------------+---------+
|46633620016|2021-06-02T00:00:...|         16|  10144730|      229|       129.99|2021| 06| 02|46633620016|    910023|         1|     129.99|  10144730|Cliente Magalu - ...|        0|
|46634743216|2021-06-02T00:00:...|         16| 127381525|        3|       207.99|2021| 06| 02|46634743216|   2873607|         1|     207.99| 127381525|Cliente Magalu - ...|        0|
|46624978716|2021-06-02T00:00:...|         16| 137461790|      231|        38.99|2021

In [8]:
# Register the DataFrames as temporary views so they can be joined with
# spark.sql. Note that the customers view is named "cliente" (singular).
df_pedidos.createOrReplaceTempView("pedidos")
df_item_pedidos.createOrReplaceTempView("item_pedido")
df_clientes.createOrReplaceTempView("cliente")

In [16]:
# Inspect the column names and types of the order-items DataFrame.
df_item_pedidos.printSchema()

root
 |-- id_pedido: string (nullable = true)
 |-- id_produto: string (nullable = true)
 |-- quantidade: string (nullable = true)
 |-- vr_unitario: string (nullable = true)



In [20]:
# Build the stage dataset in SQL from the temp views: every order joined to its
# item lines (inner join), enriched with the customer name and gold flag (left
# join, so order lines without a matching customer are kept). The result has
# one row per order item, so order-level columns such as vr_total_pago repeat
# for orders that have several items.
df_stage_final = spark.sql(
    '''Select p.*, ip.quantidade, ip.vr_unitario, ip.id_produto,  c.nm_cliente, c.flag_ouro from pedidos p
       inner join item_pedido ip on p.id_pedido = ip.id_pedido
       left join cliente c
       on p.id_cliente = c.id_cliente
    '''
)

In [21]:
# Preview 10 stage rows without truncating the column values.
df_stage_final.show(10, truncate=False)

+-----------+------------------------+-----------+----------+---------+-------------+----+---+---+----------+-----------+----------+---------------------------+---------+
|id_pedido  |dt_pedido               |id_parceiro|id_cliente|id_filial|vr_total_pago|Ano |Mes|Dia|quantidade|vr_unitario|id_produto|nm_cliente                 |flag_ouro|
+-----------+------------------------+-----------+----------+---------+-------------+----+---+---+----------+-----------+----------+---------------------------+---------+
|49523668016|2021-08-03T00:00:00.000Z|16         |100041455 |231      |64.99        |2021|08 |03 |1         |64.99      |2548610   |Cliente Magalu - 0100041455|0        |
|48347149016|2021-07-09T00:00:00.000Z|16         |100194305 |3        |55.89        |2021|07 |09 |1         |55.89      |1207789   |Cliente Magalu - 0100194305|0        |
|47109143513|2021-06-12T00:00:00.000Z|13         |100220472 |94       |155.99       |2021|06 |12 |1         |155.99     |4538653   |Cliente Magal

In [22]:
# Surrogate key for the customer dimension: SHA-256 of the customer id, name
# and gold flag concatenated with no separator.
# NOTE(review): concat_ws skips NULL inputs, so rows without a matching
# customer are presumably hashed from id_cliente alone -- confirm this is
# acceptable.
df_stage_final = df_stage_final.withColumn(
    'PK_CLIENTE',
    sha2(concat_ws("", "id_cliente", "nm_cliente", "flag_ouro"), 256),
)

In [24]:
# Surrogate key for the calendar dimension: SHA-256 of the order date string
# and its year / month / day parts concatenated with no separator.
df_stage_final = df_stage_final.withColumn(
    'PK_CALENDARIO',
    sha2(concat_ws("", "dt_pedido", "Ano", "Mes", "Dia"), 256),
)

In [25]:
# Preview 5 stage rows, now including the PK_CLIENTE and PK_CALENDARIO hashes.
df_stage_final.show(5, truncate=False)

+-----------+------------------------+-----------+----------+---------+-------------+----+---+---+----------+-----------+----------+---------------------------+---------+----------------------------------------------------------------+----------------------------------------------------------------+
|id_pedido  |dt_pedido               |id_parceiro|id_cliente|id_filial|vr_total_pago|Ano |Mes|Dia|quantidade|vr_unitario|id_produto|nm_cliente                 |flag_ouro|PK_CLIENTE                                                      |PK_CALENDARIO                                                   |
+-----------+------------------------+-----------+----------+---------+-------------+----+---+---+----------+-----------+----------+---------------------------+---------+----------------------------------------------------------------+----------------------------------------------------------------+
|49523668016|2021-08-03T00:00:00.000Z|16         |100041455 |231      |64.99        |2021|08 |03 

In [26]:
# Expose the keyed stage DataFrame as temp view "stage" for the SQL cells that
# build the fact and dimension tables.
df_stage_final.createOrReplaceTempView("stage")

In [28]:
# Sanity check: read two rows back from the "stage" view.
spark.sql("select * from stage").show(2, truncate=False)

+-----------+------------------------+-----------+----------+---------+-------------+----+---+---+----------+-----------+----------+---------------------------+---------+----------------------------------------------------------------+----------------------------------------------------------------+
|id_pedido  |dt_pedido               |id_parceiro|id_cliente|id_filial|vr_total_pago|Ano |Mes|Dia|quantidade|vr_unitario|id_produto|nm_cliente                 |flag_ouro|PK_CLIENTE                                                      |PK_CALENDARIO                                                   |
+-----------+------------------------+-----------+----------+---------+-------------+----+---+---+----------+-----------+----------+---------------------------+---------+----------------------------------------------------------------+----------------------------------------------------------------+
|49523668016|2021-08-03T00:00:00.000Z|16         |100041455 |231      |64.99        |2021|08 |03 

In [32]:
# Orders fact table: number of orders and total amount paid per customer key
# and calendar key.
#
# `stage` holds one row per order ITEM, so vr_total_pago (an order-level
# amount) is repeated on every item line of the same order. Aggregating stage
# directly would add the order total once per item and count item lines rather
# than orders. The inner SELECT DISTINCT collapses stage back to one row per
# order before the aggregation.
ft_pedidos = spark.sql(
    """
    SELECT PK_CLIENTE,
           PK_CALENDARIO,
           COUNT(id_pedido) AS QUANTIDADE,
           SUM(vr_total_pago) AS VALOR_TOTAL
    FROM (
        SELECT DISTINCT PK_CLIENTE, PK_CALENDARIO, id_pedido, id_parceiro, id_filial, vr_total_pago
        FROM stage
    ) pedidos_unicos
    GROUP BY PK_CLIENTE, PK_CALENDARIO
    """
)

In [39]:
# Customer dimension: the distinct (PK_CLIENTE, nm_cliente, id_cliente,
# flag_ouro) combinations found in stage.
# NOTE: this rebinds df_clientes, which until now held the raw
# desafio.clientes table.
df_clientes = spark.sql("SELECT DISTINCT PK_CLIENTE, nm_cliente, id_cliente, flag_ouro FROM STAGE")

In [41]:
# Calendar dimension: the distinct order dates in stage with their key and
# year / month / day parts.
df_calendario = spark.sql("SELECT DISTINCT PK_CALENDARIO, dt_pedido, Ano, Mes, Dia FROM STAGE")

In [34]:
# Preview 5 rows of the orders fact table.
ft_pedidos.show(5, truncate=False)

+----------------------------------------------------------------+----------------------------------------------------------------+----------+-----------+
|PK_CLIENTE                                                      |PK_CALENDARIO                                                   |QUANTIDADE|VALOR_TOTAL|
+----------------------------------------------------------------+----------------------------------------------------------------+----------+-----------+
|9a350c8b7b2edad890ceb91de10b524ec536b9930a53f26b5e7c8d8340cf6b8a|9575b34955f33d58677d4369c6db4087457ba380fe9adecda4eb503803f3cfeb|1         |207.94     |
|2857e1f286d99a76d8ecd447205197301766e93102809bc687f3280f947c06da|62e2135b5913bec7b997d1e8a19df56ab0296f1e0b4a10eba860c48dc337c3d2|1         |155.99     |
|8b9f8bda6b1889fb7982ce8695d18d3d2f242f2f0a45b6e910fc7347f8d4f699|47028c02fbfb23f39a192eb5d47ccabd7aef38c45bae73e14a52268c0249c96d|1         |194.99     |
|ed11aeeb8ea790a44e5c59d2661189b503166d942a9d6d51d3af7e62a4c34916|dbc9

In [37]:
# Export the orders fact table through salvar_df under the name 'ft_pedidos'.
salvar_df(ft_pedidos, 'ft_pedidos')

hdfs dfs -get /datalake/gold/ft_pedidos/part-* /input/projeto_hive/gold/ft_pedidos.csv


In [43]:
# Export the customer dimension through salvar_df under the name 'dim_clientes'.
salvar_df(df_clientes, 'dim_clientes')

hdfs dfs -get /datalake/gold/dim_clientes/part-* /input/projeto_hive/gold/dim_clientes.csv


In [42]:
# Export the calendar dimension through salvar_df under the name 'dim_calendario'.
salvar_df(df_calendario, 'dim_calendario')

hdfs dfs -get /datalake/gold/dim_calendario/part-* /input/projeto_hive/gold/dim_calendario.csv
