In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable
from datetime import datetime
from pyspark.sql.functions import current_date, lit

In [0]:
# Read the raw hotels table, keep only the columns used downstream, and add
# the audit columns (start_date / update_date) expected by the trusted layer.
df_hoteis = (
    spark.read.table("production.raw.raw_tb_hoteis")
    .select(
        "hotel_id",
        "nome_hotel",
        "endereco",
        "cidade",
        "estado",
        "cep",
        "pais",
        "estrelas",
        "numero_quartos",
        "comodidades",
        "data_abertura",
        "telefone",
        "email",
        "gerente",
    )
    # Convert the opening date to a DATE column.
    .withColumn("data_abertura", F.to_date("data_abertura"))
    # start_date is stamped with the date of this run.
    .withColumn("start_date", F.current_date())
    # update_date starts out as a typed (DATE) null.
    .withColumn("update_date", F.lit(None).cast("date"))
)

# Preview a handful of rows only.
display(df_hoteis.limit(5))

In [0]:
# Load the CEP -> coordinates lookup, keeping only the join key (cep) and
# the two coordinate columns.
df_geolocalizacao = spark.read.table(
    "production.raw.raw_geolocalizacao_ceps"
).select("cep", "latitude", "longitude")

# Preview a handful of rows only.
display(df_geolocalizacao.limit(5))

In [0]:
# Enrich each hotel with the coordinates of its CEP (left join: hotels whose
# CEP has no geolocation row are kept, with null latitude/longitude).
#
# This cell reassigns df_hoteis from itself, so re-running it used to join the
# coordinate columns a second time and leave duplicate latitude/longitude
# columns. Dropping them first is a no-op on the first run (the hotels
# DataFrame has no such columns yet) and makes the cell safe to re-run.
#
# NOTE(review): if raw_geolocalizacao_ceps can hold more than one row per cep,
# this join duplicates hotels and the downstream merge on hotel_id would see
# several source rows for the same target row - confirm cep is unique there.
df_hoteis = (
    df_hoteis
    .drop("latitude", "longitude")
    .join(df_geolocalizacao, on="cep", how="left")
)

In [0]:
catalog_table = "production.trusted.tb_hoteis"

if spark.catalog.tableExists(catalog_table):
    # The trusted table already exists: upsert the current snapshot into it,
    # matching rows on hotel_id.
    delta_table = DeltaTable.forName(spark, catalog_table)

    # Columns copied from the source when a hotel already exists. Left out:
    #   hotel_id    - the merge key, already equal on both sides;
    #   start_date  - df_hoteis stamps it with today's date on every run, so
    #                 copying it would reset the first-load date on each update;
    #   update_date - set explicitly to the merge date below.
    colunas_atualizaveis = [
        nome
        for nome in df_hoteis.columns
        if nome not in ("hotel_id", "start_date", "update_date")
    ]

    (
        delta_table.alias("target")
        .merge(
            df_hoteis.alias("source"),
            "target.hotel_id = source.hotel_id",
        )
        .whenMatchedUpdate(
            set={
                **{nome: f"source.{nome}" for nome in colunas_atualizaveis},
                "update_date": "current_date()",
            }
        )
        # New hotels are inserted as-is (start_date = today, update_date = null).
        .whenNotMatchedInsertAll()
        .execute()
    )
else:
    # First load: create the Delta table. update_date is written as a typed
    # (DATE) null rather than an empty string, so the column is created with
    # the same type that the merge branch later fills with current_date().
    (
        df_hoteis
        .withColumn("update_date", F.lit(None).cast("date"))
        .write
        .format("delta")
        .saveAsTable(catalog_table)
    )