### <font color='blue'>Automatizando o Pipeline de Consolidação, Limpeza e Enriquecimento de Dados no Databricks</font>

## Tabela Silver de Transações

In [None]:
# Switch the session to the staging database used by the statements that follow
use_staging_stmt = "use database dsa_db_02_staging"
spark.sql(use_staging_stmt)

In [None]:
# Create the Silver transactions table on first run; `if not exists` makes reruns a no-op.
# NOTE(review): transaction_date is declared as string here, while the view that feeds
# this table produces it with to_date(...) -- confirm which column type is intended.
silver_table_ddl = """
          create table if not exists dsa_silver_transacoes(
            transaction_id string,
            customer_id integer,
            product_id integer,
            quantity integer,
            total_amount double,
            order_status string,
            transaction_date string,
            payment_method string,
            store_type string,
            last_updated_at timestamp
          )"""
spark.sql(silver_table_ddl)

In [None]:
# Query the high-water mark: the largest last_updated_at currently stored in Silver
watermark_query = "select max(last_updated_at) as last_completed from dsa_silver_transacoes"
last_updated_df = spark.sql(watermark_query)

In [None]:
# Bring the aggregate result to the driver and read the watermark from its first row
watermark_rows = last_updated_df.collect()
last_updated_time = watermark_rows[0]["last_completed"]

In [None]:
# No watermark was found (max() returned NULL): fall back to a 1900-01-01 floor
initial_watermark = '1900-01-01T00:00:00.000+00:00'
last_updated_time = initial_watermark if last_updated_time is None else last_updated_time

In [None]:
# Temporary view over the Bronze rows whose recebido_em is later than the Silver
# watermark, i.e. the rows that have not yet been processed into Silver.
# The watermark is interpolated into the SQL text as a quoted literal.
incremental_view_sql = f"""
          create or replace temporary view transacoes_incremental as
          select * from dsa_db_01_inicial.dsa_bronze_transacoes as c where c.recebido_em  > '{last_updated_time}' """
spark.sql(incremental_view_sql)

In [None]:
# Print up to 10 rows of the incremental view for a quick visual check
incremental_preview = spark.sql("select * from transacoes_incremental limit 10")
incremental_preview.show()

In [None]:
# Print how many rows the incremental view currently exposes
incremental_count = spark.sql("select count(*) from transacoes_incremental")
incremental_count.show()

In [None]:
# Build the cleaned temporary view that feeds the Silver merge:
#   - keeps only rows with non-null transaction, customer and product ids whose
#     transaction_date parses to a date on or before the current date
#   - clamps negative quantity / total_amount to 0
#   - trims payment_method and store_type (store_type is also init-capped) and
#     defaults missing values to 'Unknown'
#   - stamps every row with the processing time in last_updated_at
#
# Why order_status tests `<= 0` instead of `= 0`: inside the select list the names
# total_amount and quantity refer to the raw source columns, not to the clamped
# aliases defined a few lines earlier. With `= 0`, a row whose raw value is negative
# is written with 0 but still labelled 'Completed'. Testing `<= 0` keeps the status
# consistent with the clamped values that are actually stored.
spark.sql("""
          create or replace temporary view vw_silver_transacoes_incremental as
          select
          transaction_id,
          customer_id,
          product_id,
          case
            when quantity < 0 then 0
            else quantity
          end as quantity,
          case
            when total_amount < 0 then 0
            else total_amount
          end as total_amount,
          to_date(transaction_date, 'yyyy-MM-dd') as transaction_date,
          case
            when total_amount <= 0 or quantity <= 0 then 'Cancelled'
            else 'Completed'
          end as order_status,
          coalesce(trim(payment_method), 'Unknown') as payment_method,
          coalesce(initcap(trim(store_type)), 'Unknown') as store_type,
          current_timestamp() as last_updated_at
          from transacoes_incremental
          where transaction_id is not null and
          customer_id is not null and
          product_id is not null and to_date(transaction_date, 'yyyy-MM-dd') <= current_date()
          """)

In [None]:
# Upsert the cleaned batch into Silver, keyed on transaction_id:
# rows that already exist are overwritten, new rows are inserted.
# NOTE(review): this presumably relies on transaction_id being unique within the
# source view -- confirm Bronze cannot deliver the same id twice in one batch.
silver_upsert_sql = """ 
          merge into dsa_silver_transacoes target
          using vw_silver_transacoes_incremental source
          on source.transaction_id = target.transaction_id
          when matched then
          update set *
          when not matched then 
          insert *"""
spark.sql(silver_upsert_sql)

In [None]:
# Print the total number of rows now stored in the Silver table
silver_row_count = spark.sql("select count(*) from dsa_silver_transacoes")
silver_row_count.show()

# Fim