In [0]:
%run ../../../utils

## Parámetros y rutas

In [0]:
# Schema-qualified name (schema.table) of the target silver table.
# Used both to look up the table's primary key and to open the Delta table for the MERGE.
silver_table_name = "silver.sales_countryregioncurrency"

## Lectura de Bronze

In [0]:
# Load the raw bronze table that feeds this silver load.
# The identifier is written without surrounding whitespace so it matches the
# catalog name exactly instead of relying on the parser to trim it.
df_bronze = (
    spark.read.table("bronze.sales_countryregioncurrency")
)

## Casting y estandarización de tipos

In [0]:
# Target Spark type for every column that gets an explicit cast.
# Dict order is the order in which the casts are applied.
cast_spec = {
    "CountryRegionCode": "string",
    "CurrencyCode": "string",
    "ModifiedDate": "timestamp",
}

# Apply the casts one at a time; columns not listed above pass through untouched.
df_cast = df_bronze
for field_name, spark_type in cast_spec.items():
    df_cast = df_cast.withColumn(field_name, col(field_name).cast(spark_type))

## Limpieza básica

In [0]:
# Keep only rows where both parts of the natural key are present (non-null).
has_country_code = col("CountryRegionCode").isNotNull()
has_currency_code = col("CurrencyCode").isNotNull()
df_clean = df_cast.filter(has_country_code & has_currency_code)

# Collapse rows that share the same natural key into a single row.
# NOTE(review): no ordering is applied here, so which duplicate survives is not controlled.
natural_key = ["CountryRegionCode", "CurrencyCode"]
df_final = df_clean.dropDuplicates(natural_key)

# Stamp both audit columns with the load timestamp.
audit_ts = current_timestamp()
df_result = (
    df_final
      .withColumn("FechaAuditoriaCreacion", audit_ts)
      .withColumn("FechaAuditoriaModificacion", audit_ts)
)

## Escritura incremental con MERGE

In [0]:
# Look up the primary-key columns of the silver table in system.information_schema,
# ordered by their position inside the key.
# The string literal uses single quotes and the comparison uses `=` so the query
# does not depend on double-quoted-literal or `==` parser leniency.
query = f"""
SELECT
  cu.column_name
FROM system.information_schema.key_column_usage AS cu
INNER JOIN system.information_schema.table_constraints AS tc
  USING (constraint_catalog, constraint_schema, constraint_name)
WHERE concat_ws('.', cu.table_schema, cu.table_name) = '{silver_table_name}'
  AND tc.constraint_type = 'PRIMARY KEY'
  AND cu.table_catalog = 'lakehouse'
ORDER BY ordinal_position
"""

# Run the lookup
df_query = spark.sql(query)

# Bring the (small) result to the driver as a plain list of column names
columns_key = [row["column_name"] for row in df_query.collect()]

# Without key columns the join below would produce an empty MERGE condition,
# which only fails later with an unrelated-looking parse error. Fail here instead.
if not columns_key:
    raise ValueError(
        f"No PRIMARY KEY columns found for '{silver_table_name}' in catalog 'lakehouse'; "
        "cannot build the MERGE condition."
    )

# Build the MERGE join condition: target alias `m`, source alias `in`.
# `in` is a SQL keyword, so identifiers are backtick-quoted to keep the
# expression parseable regardless of keyword-reservation settings.
merge_conditions = " AND ".join(f"m.`{c}` = `in`.`{c}`" for c in columns_key)

In [0]:
# Handle to the target Delta table
delta_table = DeltaTable.forName(spark, silver_table_name)

# Read the target column list once; both mappings below are derived from it.
target_columns = delta_table.toDF().columns

# Columns that must keep their stored value when a row already exists:
# the key columns and the creation audit stamp.
# FechaAuditoriaModificacion is deliberately NOT excluded, so that matched rows
# get their modification stamp refreshed by the MERGE update clause.
exclusion_list = set(columns_key + ["FechaAuditoriaCreacion"])

# target column -> source expression, used for rows that already exist.
# The source alias `in` is a SQL keyword, hence the backtick quoting.
columns_to_update = {
    col_name: f"`in`.`{col_name}`"
    for col_name in target_columns
    if col_name not in exclusion_list
}

# target column -> source expression, used for new rows (every target column).
columns_to_insert = {
    col_name: f"`in`.`{col_name}`"
    for col_name in target_columns
}

In [0]:
# Upsert the prepared rows into the silver table.
# Target is aliased `m`, source `in`; these aliases must match the ones used
# in merge_conditions and in the update/insert mappings.
merge_target = delta_table.alias("m")
merge_source = df_result.alias("in")

merge_builder = merge_target.merge(merge_source, merge_conditions)
# Rows whose key already exists: overwrite the mapped columns.
merge_builder = merge_builder.whenMatchedUpdate(set=columns_to_update)
# Rows with a new key: insert every mapped column.
merge_builder = merge_builder.whenNotMatchedInsert(values=columns_to_insert)

# Keep whatever execute() returns under the original name.
df_merge = merge_builder.execute()