In [0]:
%run ../../../utils

## Parámetros y rutas

In [0]:
# Qualified name of the Silver target table. It is passed to merge() and
# get_metrics() below, and later split on "." to derive the layer and table
# names written to the ingestion log.
silver_table_name = "silver.sales_creditcard"

## Lectura de Bronze

In [0]:
# Read the raw credit-card records from the Bronze layer.
# The identifier is written exactly (no surrounding whitespace) so the lookup
# does not depend on the parser trimming the table name.
df_bronze = (
    spark.read.table("bronze.sales_creditcard")
)

## Casting y estandarización de tipos

In [0]:
# Target Spark SQL type for each Bronze column, listed in the order the casts
# are applied.
column_types = {
    "CreditCardID": "bigint",
    "CardType": "string",
    "CardNumber": "string",
    "ExpMonth": "tinyint",
    "ExpYear": "smallint",
    "ModifiedDate": "timestamp",
}

# Start from the Bronze frame and re-declare every listed column with its
# target type, one withColumn call per column.
df_cast = df_bronze
for column_name, spark_type in column_types.items():
    df_cast = df_cast.withColumn(column_name, col(column_name).cast(spark_type))

## Limpieza básica

In [0]:
# Drop rows whose key or mandatory card attributes are NULL.
# Only NULLs are rejected here; empty strings still pass this filter.
df_clean = df_cast.filter(
    col("CreditCardID").isNotNull() &
    col("CardNumber").isNotNull() &
    col("CardType").isNotNull()
)

# Keep exactly one row per CreditCardID.
# dropDuplicates() on a key subset keeps an arbitrary row of each group, so if
# Bronze holds several versions of the same card the values that reach Silver
# could differ between runs. Rank the versions by ModifiedDate instead and keep
# the most recent one (NULL dates rank last). Rows that tie on ModifiedDate are
# still picked arbitrarily among themselves.
df_nodup = (
    df_clean
      .selectExpr(
          "*",
          "row_number() OVER ("
          "PARTITION BY CreditCardID "
          "ORDER BY ModifiedDate DESC NULLS LAST"
          ") AS _dedup_rank",
      )
      .filter("_dedup_rank = 1")
      .drop("_dedup_rank")  # helper column must not reach the Silver table
)

# Audit columns: both are stamped with the timestamp of this run.
# NOTE(review): whether the creation stamp of rows that already exist in Silver
# is preserved depends on how merge() treats matched rows — confirm.
df_result = (
    df_nodup
      .withColumn("FechaAuditoriaCreacion", current_timestamp())
      .withColumn("FechaAuditoriaModificacion", current_timestamp())
)

## Merge incremental

In [0]:
# Merge the prepared rows into the Silver table, passing CreditCardID as the
# key column list.
# NOTE(review): merge() is defined outside this notebook (presumably in the
# utils notebook loaded via %run); it is presumably a Delta upsert keyed on the
# given columns — confirm.
merge(silver_table_name, df_result, ["CreditCardID"])

## IngestionLog

In [0]:
# Fetch the metrics for the Silver table and echo them to the cell output.
# NOTE(review): get_metrics() is defined outside this notebook; the keys read
# from the result further down (numTargetRowsInserted, executionTimeMs, ...)
# suggest Delta merge operation metrics — confirm.
dict_metrics = get_metrics(silver_table_name)
print(dict_metrics)

In [0]:
# --- Job context, supplied to the notebook through widgets -------------------
job_id, job_run_id, task_run_id = (
    dbutils.widgets.get(widget_name)
    for widget_name in ('JobId', 'JobRunId', 'TaskRunId')
)

# --- Target identification: "<layer>.<table>" --------------------------------
name_parts = silver_table_name.split('.')
layer, table = name_parts[0], name_parts[1]

# --- Timing -------------------------------------------------------------------
# StartTime must look like 2024-01-31T12:34:56.789 (fractional seconds required).
start_time_raw = dbutils.widgets.get('StartTime')
start_time = datetime.strptime(start_time_raw, '%Y-%m-%dT%H:%M:%S.%f')
end_time = datetime.now()
elapsed = end_time - start_time
duration_seconds = int(elapsed.total_seconds())

# --- Row and byte counters taken from the merge metrics ----------------------
# rows_in is only reported for the bronze layer; every other layer logs 0.
if layer == 'bronze':
    rows_in = dict_metrics.get('numTargetRowsInserted', 0)
else:
    rows_in = 0

rows_inserted, rows_updated, rows_deleted, file_bytes = (
    dict_metrics.get(metric_key, 0)
    for metric_key in (
        'numTargetRowsInserted',
        'numTargetRowsUpdated',
        'numTargetRowsDeleted',
        'numTargetBytesAdded',
    )
)
# Reported in milliseconds by the metrics dict; logged in seconds.
execution_time = dict_metrics.get('executionTimeMs', 0) / 1000

# --- Human-readable summary of the run ---------------------------------------
print(f"""
  jobId: {job_id}
  jobRunId: {job_run_id}
  taskRunId: {task_run_id}
  taskStartTime: {start_time}
  taskEndtime: {end_time}
  taskDurationSeconds: {duration_seconds}
  tableName: {table}
  layer : {layer}
  rowsIn: {rows_in}
  rowsInserted: {rows_inserted}
  rowsUpdated: {rows_updated}
  rowsDropped: {rows_deleted}
  fileBytes: {file_bytes}
  executionTime: {execution_time}
  """)

In [0]:
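# Disabled: would publish the job identifiers as task values.
# NOTE: JobId / JobRunId are not defined in the visible cells; the cell above
# stores these values as job_id / job_run_id.
# dbutils.jobs.taskValues.set(key = "JobId", value = JobId)
# dbutils.jobs.taskValues.set(key = "JobRunId", value = JobRunId)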