
# Pipeline From Bronze to Silver

Applies the initial transformations and cleaning to the bronze data so that it is ready for downstream use.

It relies on `.sql` template files with date placeholders to read and transform the data from the bronze layer.

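The result is then written with an idempotent insert: rows already stored for the reference dates are replaced, so re-running the pipeline does not duplicate data in the silver layer.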



## Required Libraries

In [0]:
import pytz
from pathlib import Path
from datetime import datetime
from pyspark.sql.functions import lit
from pyspark.sql.connect.dataframe import DataFrame

from src.common import (
    get_tz,
    add_reference_date_parameters,
    get_reference_dates,
    read_sql_template,
    table_exists, 
    use_schema_and_create_if_not_exists,
)


In [0]:

# Unity Catalog name and target schema; together with a table name they form
# the fully-qualified `<catalog>.<schema>.<table>` identifier used when writing.
CATALOG = "precos_pmc"
SCHEMA = "silver"
# Timezone object supplied by the project helper.
# NOTE(review): `tz` is not referenced by any cell visible here — confirm it is still needed.
tz = get_tz()



## Define Unity Catalog and add date widgets


In [0]:

# Register the reference-date widgets on this notebook through the project helper
# (`dbutils` is the notebook-provided global).
add_reference_date_parameters(dbutils.widgets)
# Point the session at CATALOG.SCHEMA before any table is read or written.
# NOTE(review): presumably creates the schema when it is missing, as the helper name says — confirm in src.common.
use_schema_and_create_if_not_exists(spark, catalog=CATALOG, schema=SCHEMA)



## Collect Dates




In [0]:

# Reference dates resolved from the notebook widgets; each item must support
# `strftime`, since the dates are rendered as YYYY-MM-DD below and in later cells.
dates = get_reference_dates(dbutils.widgets)
# Log the dates this run covers.
print('Running for:', [dt.strftime('%Y-%m-%d') for dt in dates])



## Silver writing logic


In [0]:

def write_silver_delta_table_on_s3(
    spark,
    df: DataFrame, 
    str_list_of_quoted_date_references: str, 
    table_name: str
) -> bool:
    """Write ``df`` into ``CATALOG.SCHEMA.<table_name>``, replacing the reference dates.

    When the table does not exist yet it is created as a Delta table partitioned
    by ``dt_referencia``. Otherwise the rows whose ``dt_referencia`` is in the
    given date list are replaced by the contents of ``df`` in a single atomic
    Delta commit (selective overwrite), so re-running for the same dates does
    not duplicate data and a failed write cannot leave the dates deleted.

    Args:
        spark: Active Spark session, used to check whether the table exists.
        df: Data to write. It is expected to contain only rows whose
            ``dt_referencia`` is in ``str_list_of_quoted_date_references``;
            by default Delta rejects a selective overwrite whose rows fall
            outside the predicate.
        str_list_of_quoted_date_references: Comma-separated, single-quoted
            dates, e.g. ``"'2024-01-01', '2024-01-02'"``, interpolated into an
            ``IN (...)`` predicate.
        table_name: Unqualified name of the target table.

    Returns:
        ``True`` when data was written. ``False`` when ``df`` is empty (nothing
        is written) or when the write raised; the error is printed, not raised.
    """
    if df.isEmpty():
        print(f"Nothing to insert in {table_name}")
        return False

    full_table_name = f"{CATALOG}.{SCHEMA}.{table_name}"

    try:

        # First run: create the partitioned Delta table from the data itself.
        if not table_exists(spark, CATALOG, SCHEMA, table_name):
            print(f"Creating table {full_table_name}")
            (
                df
                .writeTo(full_table_name)
                .using("delta")
                .partitionedBy("dt_referencia")
                .create()
            )

        # Idempotent replace of the reference dates.
        else:

            print(f"Replacing partition(s) {str_list_of_quoted_date_references} in {full_table_name}")

            # A separate DELETE followed by an append is two commits: if the
            # append fails, the dates stay deleted. `replaceWhere` removes the
            # matching rows and writes the new ones in one atomic commit.
            replace_predicate = f"dt_referencia IN ({str_list_of_quoted_date_references})"

            (
                df
                .write
                .format("delta")
                .mode("overwrite")
                .option("replaceWhere", replace_predicate)
                .saveAsTable(full_table_name)
            )

        print(f'Sucessful inserted into {full_table_name}')
        return True

    except Exception as e:
        # Contract kept from the original: report the failure and signal it
        # through the return value instead of raising.
        print(f'Error while writing {SCHEMA} delta ({table_name}): {e}')
        return False



## Execution flow


In [0]:

# Directory that holds the bronze -> silver SQL templates.
src_path = Path('src')

# Reference dates rendered as a comma-separated list of single-quoted ISO dates
# (e.g. "'2024-01-01', '2024-01-02'"), ready to be placed inside an SQL IN (...).
str_list_of_quoted_date_references = ", ".join(
    f"'{dt.strftime('%Y-%m-%d')}'" for dt in dates
)

# Render each template for the reference dates and run it.
sql_cotacoes = read_sql_template(
    src_path / 'bronze_to_silver_cotacoes.sql',
    list_of_quoted_date_references=str_list_of_quoted_date_references,
)
df_cotacoes = spark.sql(sql_cotacoes)

sql_base_incremental = read_sql_template(
    src_path / 'bronze_to_silver_base_incremental.sql',
    list_of_quoted_date_references=str_list_of_quoted_date_references,
)
df_base_incremental = spark.sql(sql_base_incremental)

# Each isEmpty() call runs a Spark job, so evaluate it once per source and reuse it.
has_cotacoes = not df_cotacoes.isEmpty()
has_incremental = not df_base_incremental.isEmpty()

if has_cotacoes and has_incremental:
    # Both sources returned rows: stack them, matching columns by name.
    print('merging')
    df = df_cotacoes.unionByName(df_base_incremental)
elif has_incremental:
    # Only the incremental base returned rows.
    print('incremental', df_base_incremental.count())
    df = df_base_incremental
else:
    # Only cotacoes has rows, or both are empty (the writer then skips the insert).
    df = df_cotacoes

written = write_silver_delta_table_on_s3(
    spark,
    df=df,
    str_list_of_quoted_date_references=str_list_of_quoted_date_references,
    table_name='cotacoes',
)

# The writer returns False both for "nothing to insert" and for a failed write.
# When there were rows to write, False can only mean failure: fail the run
# instead of letting the notebook finish as if the load had succeeded.
if not written and (has_cotacoes or has_incremental):
    raise RuntimeError(
        f"Failed to write table 'cotacoes' for dates {str_list_of_quoted_date_references}"
    )
