
# Pipeline From Silver to Gold

The gold layer contains data with the domain rules already applied.

Therefore, it relies on queries stored in `.sql` files to build this information.

These queries build a star schema in the gold layer containing the most recent data.


## Required Libraries

In [0]:
from pathlib import Path
from typing import Optional

from pyspark.sql.connect.dataframe import DataFrame

from src.common import (
    read_sql_template,
    table_exists,
    use_schema_and_create_if_not_exists,
)



## Define Unity Catalog

In [0]:

# Unity Catalog coordinates of the gold layer. The write function in this
# notebook builds fully qualified table names as CATALOG.SCHEMA.<table_name>.
CATALOG = "precos_pmc"
SCHEMA = "gold"



In [0]:
# Prepare the session for the gold schema before any table is written.
# NOTE(review): helper lives in src.common; by its name it creates the schema
# when missing and then switches to it — confirm against its implementation.
use_schema_and_create_if_not_exists(spark, catalog=CATALOG, schema=SCHEMA)



## Gold Insertion Logic


In [0]:
def write_gold_snapshot_star_schema_delta_table_on_s3(
    spark,
    df: DataFrame,
    table_name: str,
    partitions: Optional[list] = None,
) -> bool:
    """Replace a gold table with a fresh Delta snapshot of ``df``.

    The table ``CATALOG.SCHEMA.<table_name>`` is dropped when it already
    exists and then recreated from the contents of ``df``.

    Args:
        spark: Active Spark session used to run the ``DROP TABLE`` statement
            and passed to ``table_exists``.
        df: DataFrame whose rows become the new content of the table.
        table_name: Unqualified table name inside ``CATALOG.SCHEMA``.
        partitions: Column names to partition the table by. When omitted or
            empty, the table is partitioned by ``dt_ultima_atualizacao``.

    Returns:
        ``True`` when the snapshot was written; ``False`` when ``df`` is
        empty or when any error occurred while writing (the error is printed,
        not raised).
    """
    # A ``None`` default avoids sharing one mutable list across calls; the
    # copy also guarantees the caller's list is never touched.
    partition_cols = list(partitions) if partitions else ["dt_ultima_atualizacao"]

    if df.isEmpty():
        print(f"Nothing to insert in {table_name}")
        return False

    full_table_name = f"{CATALOG}.{SCHEMA}.{table_name}"

    try:
        # Drop the previous snapshot so the table can be recreated from scratch.
        if table_exists(spark, CATALOG, SCHEMA, table_name):
            spark.sql(f"DROP TABLE IF EXISTS {full_table_name}")

        # ``create()`` both creates the table and writes the rows of ``df``.
        # A subsequent ``append()`` of the same DataFrame would store every
        # row twice, so no separate insert step is performed.
        print(f"Creating snapshot {full_table_name}")
        (
            df
            .writeTo(full_table_name)
            .using("delta")
            .partitionedBy(*partition_cols)
            .create()
        )

        print(f'Sucessful inserted into {full_table_name}')
        return True

    except Exception as e:
        # Best-effort contract: report the failure and signal it via the
        # return value instead of aborting the notebook.
        print(f'Error while writing {SCHEMA} delta ({table_name}): {e}')
        return False


## Execution Flow

In [0]:

# Folder holding the SQL templates that define each gold table.
src_folder = Path("src")

# --- Fact table: precos (default partitioning of the write function) ---
sql_f_precos = read_sql_template(src_folder.joinpath("silver_to_gold_f_precos.sql"))
df_f_precos = spark.sql(sql_f_precos)
write_gold_snapshot_star_schema_delta_table_on_s3(
    spark,
    df=df_f_precos,
    table_name="f_precos",
)

# --- Fact table: precos completa (explicit partition columns) ---
sql_f_precos_completa = read_sql_template(
    src_folder.joinpath("silver_to_gold_f_precos_completa.sql")
)
df_f_precos_completa = spark.sql(sql_f_precos_completa)
write_gold_snapshot_star_schema_delta_table_on_s3(
    spark,
    df=df_f_precos_completa,
    table_name="f_precos_completa",
    partitions=["dt_referencia", "nm_rede", "nm_bairro"],
)

# --- Dimension table: produtos ---
sql_d_produtos = read_sql_template(src_folder.joinpath("silver_to_gold_d_produtos.sql"))
df_d_produtos = spark.sql(sql_d_produtos)
write_gold_snapshot_star_schema_delta_table_on_s3(
    spark,
    df=df_d_produtos,
    table_name="d_produtos",
)

# --- Dimension table: empresas ---
sql_d_empresas = read_sql_template(src_folder.joinpath("silver_to_gold_d_empresas.sql"))
df_d_empresas = spark.sql(sql_d_empresas)
write_gold_snapshot_star_schema_delta_table_on_s3(
    spark,
    df=df_d_empresas,
    table_name="d_empresas",
)

