In [0]:
from pyspark.sql import SparkSession, functions as F, types as T, Window
from delta.tables import DeltaTable
import os

In [0]:
from pyspark.sql import types as T

# Fully qualified name of the services dimension table.
catalog_table = "production.refined.d_servicos"

# NOTE(review): `spark` is not defined in the visible cells — presumably the
# session object injected by the notebook runtime; confirm.
if spark.catalog.tableExists(catalog_table):
    # Nothing to bootstrap: the existing table is left untouched.
    print("Tabela já existe:", catalog_table)
else:
    # Column layout of the dimension; every column is declared nullable.
    schema = T.StructType([
        T.StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("pk_servico", T.StringType()),
            ("sk_servico", T.LongType()),
            ("tipo_servico", T.StringType()),
            ("descricao", T.StringType()),
            ("start_date", T.DateType()),
            ("update_date", T.DateType()),
        )
    ])

    # Writing a zero-row DataFrame creates the Delta table with this schema.
    df_empty = spark.createDataFrame([], schema)
    df_empty.write.format("delta").saveAsTable(catalog_table)

    print("Tabela Delta criada com sucesso em:", catalog_table)


In [0]:
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.window import Window
from delta.tables import DeltaTable

# --- Table name in the catalog ---
catalog_table = "production.refined.d_servicos"

# --- Create an empty Delta table when it does not exist yet ---
if spark.catalog.tableExists(catalog_table):
    print("Tabela já existe:", catalog_table)
else:
    # Same six nullable columns the dimension is expected to carry.
    schema = (
        T.StructType()
        .add("pk_servico", T.StringType(), True)
        .add("sk_servico", T.LongType(), True)
        .add("tipo_servico", T.StringType(), True)
        .add("descricao", T.StringType(), True)
        .add("start_date", T.DateType(), True)
        .add("update_date", T.DateType(), True)
    )
    # A zero-row DataFrame is enough to register the table with this schema.
    df_empty = spark.createDataFrame([], schema)
    df_empty.write.format("delta").saveAsTable(catalog_table)
    print("Tabela Delta criada com sucesso:", catalog_table)

# --- Load the existing dimension ---
# The DeltaTable handle is the merge target further down; the DataFrame view
# is used to inspect what is already stored.
delta_table = DeltaTable.forName(spark, catalog_table)
df_dim_existente = delta_table.toDF()

# --- Read the incoming data ---
# Distinct (tipo_servico, descricao) pairs from the raw table, each tagged with
# a SHA-256 key derived from both attributes.
#
# concat_ws() skips NULL arguments, so hashing the raw columns would give
# (NULL, 'X') and ('X', NULL) the same key. Coalescing to '' keeps every value
# in its own position; pairs without NULLs hash exactly as before.
# NOTE(review): pairs containing a NULL now get a different pk_servico than the
# previous expression produced — confirm whether such rows were already loaded.
df_consumos = (
    spark.read.table("production.raw.tb_consumos")
    .select("tipo_servico", "descricao")
    .dropDuplicates()
    .withColumn(
        "pk_servico",
        F.sha2(
            F.concat_ws(
                "||",
                F.coalesce(F.col("tipo_servico"), F.lit("")),
                F.coalesce(F.col("descricao"), F.lit("")),
            ),
            256,
        ),
    )
    # Keep a single row per key: pairs that still collapse to the same hash
    # (e.g. NULL vs. empty string) would otherwise hand the merge several
    # source rows for one target row.
    .dropDuplicates(["pk_servico"])
)

# --- Highest surrogate key (sk_servico) already stored ---
# The aggregate over an empty dimension comes back as None, so start from 0.
last_id = df_dim_existente.agg(F.max("sk_servico")).collect()[0][0]
if last_id is None:
    last_id = 0

# --- Prepare incoming rows, issuing surrogate keys only to new services ---
# Numbering every incoming row (including services already in the dimension)
# burns key values on each run and leaves gaps that depend on the input size.
# Rows whose pk_servico is already stored are therefore flagged first and only
# the remaining ones are numbered, starting at last_id + 1.
existing_keys = (
    df_dim_existente.select("pk_servico")
    .distinct()  # one flag row per key so the join cannot multiply source rows
    .withColumn("_already_loaded", F.lit(True))
)
is_new = F.col("_already_loaded").isNull()

# Partitioning by the flag restarts the numbering at 1 for the new rows.
window = Window.partitionBy(is_new).orderBy("tipo_servico", "descricao")
df_prepared = (
    df_consumos.join(existing_keys, on="pk_servico", how="left")
    .withColumn(
        "sk_servico",
        # Already-loaded rows stay in the frame with a NULL key so they can
        # still be matched downstream; only new rows receive a number.
        F.when(is_new, F.row_number().over(window) + F.lit(last_id)).cast(T.LongType()),
    )
    .withColumn("start_date", F.current_date())
    .withColumn("update_date", F.lit(None).cast(T.DateType()))
    # Restore the original column layout and drop the helper flag.
    .select(
        "tipo_servico",
        "descricao",
        "pk_servico",
        "sk_servico",
        "start_date",
        "update_date",
    )
)

# --- Upsert the prepared rows into the dimension ---
# Target and source rows are paired on pk_servico.
merge_condition = "target.pk_servico = source.pk_servico"

# Matched rows: copy the descriptive attributes from the source and stamp
# update_date with the current date.
update_set = {
    "tipo_servico": F.col("source.tipo_servico"),
    "descricao": F.col("source.descricao"),
    "update_date": F.current_date(),
}

# Unmatched rows: every dimension column is taken from the source column of
# the same name.
insert_values = {
    column_name: F.col(f"source.{column_name}")
    for column_name in (
        "pk_servico",
        "sk_servico",
        "tipo_servico",
        "descricao",
        "start_date",
        "update_date",
    )
}

(
    delta_table.alias("target")
    .merge(df_prepared.alias("source"), merge_condition)
    .whenMatchedUpdate(set=update_set)
    .whenNotMatchedInsert(values=insert_values)
    .execute()
)

print("Merge/upsert concluído na tabela:", catalog_table)


In [0]:
%sql
-- Ad-hoc preview of the raw consumption table.
SELECT *
FROM production.raw.tb_consumos
