In [0]:
import delta

def table_exists(database, table):
    """Return True when `table` is listed as a table of `database`.

    Runs ``SHOW TABLES FROM {database}`` and looks for a row whose ``database``
    and ``tableName`` columns match the arguments. The comparison ignores case:
    Spark resolves table identifiers case-insensitively by default, so an exact
    string match would report an existing table as missing whenever the
    caller's spelling differs only in case.

    Args:
        database: Name of the database (schema) whose tables are listed.
        table: Name of the table to look for.

    Returns:
        bool: True if a matching row is found, False otherwise.
    """
    wanted_db = database.lower()
    wanted_table = table.lower()
    listing = spark.sql(f"SHOW TABLES FROM {database}").collect()
    # The database column is matched too, as in the original filter, so rows
    # listed under a different (or blank) database are not counted.
    return any(
        (row["database"] or "").lower() == wanted_db
        and (row["tableName"] or "").lower() == wanted_table
        for row in listing
    )

In [0]:
# Load configuration shared by the cells below.
#   schema          -> database name used to build the "<schema>.<tablename>" identifier
#   tablename       -> table name, also the folder name under the volume paths
#   id_field        -> key column used for the merge condition and for deduplication
#   timestamp_field -> column that orders the changes of one key (most recent first)
schema, tablename = "bronze", "customers"
id_field, timestamp_field = "idCliente", "DtAtualizacao"

# Disabled alternative: read the same settings from notebook widgets instead of literals.
# tablename = dbutils.widgets.get("tablename")
# id_field = dbutils.widgets.get("id_field")
# timestamp_field = dbutils.widgets.get("timestamp_field")

In [0]:
# The CSV files carry no schema of their own, so the separator and header options are passed
# for Spark to derive the columns. Parquet files, by contrast, embed their schema as metadata.
df_full = (
    spark.read.format("csv")
    .options(sep=";", header=True)
    .load(f"/Volumes/workspace/upsell/full_load/{tablename}/")
)

# Kept under its own name: `schema` already holds the database name ("bronze") set in the
# configuration cell, and rebinding it to a StructType here breaks the later
# table_exists(schema, ...) call and every f"{schema}.{tablename}" identifier.
# Code that needs the column schema should read `full_load_schema` (or `df_full.schema`).
full_load_schema = df_full.schema

In [0]:
# Bootstrap step: the full CSV load is written only when the target table is not there yet;
# an existing table is left untouched.
if table_exists(schema, tablename):
    print("Tabela já existente, ignorando a carga completa.")
else:
    print("Tabela não existente, criando tabela...")
    df_full = (
        spark.read
        .format("csv")
        .options(sep=";", header=True)
        .load(f"/Volumes/workspace/upsell/full_load/{tablename}/")
    )
    # coalesce(1) collapses the data to a single partition before the Delta write.
    (
        df_full.coalesce(1)
        .write
        .format("delta")
        .mode("overwrite")
        .saveAsTable(f"{schema}.{tablename}")
    )

### Atualização da tabela - ReadStream

In [0]:
# Handle to the Delta table identified by "<schema>.<tablename>".
bronze = delta.DeltaTable.forName(spark, f"{schema}.{tablename}")

# Streaming DataFrame that reads the Parquet CDC files through Auto Loader ("cloudFiles").
# The column schema is taken from `df_full` explicitly: `schema` is used as the database name
# in the table identifier above, so the same name cannot also be the StructType passed here.
# NOTE(review): `df_full` is read from CSV without inferSchema, so its columns are presumably
# all strings, and it may not include the `OP` column the CDC merge relies on — confirm that
# this schema really matches the Parquet CDC files.
df_stream = (spark.readStream
  .format("cloudFiles")
  .option("cloudFiles.format", "parquet")
  .schema(df_full.schema)
  .load(f"/Volumes/workspace/upsell/cdc/{tablename}/"))

def upsert(df, deltatable):
    """Apply one micro-batch of CDC rows to a Delta table with a MERGE.

    The batch is registered as the temp view ``view_{tablename}``, reduced to the
    most recent row per ``id_field`` (ordered by ``timestamp_field`` descending),
    and merged into ``deltatable`` on ``id_field`` according to each row's ``OP``:
    ``'D'`` deletes a matched row, ``'U'`` updates a matched row, and ``'I'`` or
    ``'U'`` inserts a row that has no match.

    Args:
        df: Micro-batch DataFrame; the query and merge conditions reference the
            ``id_field``, ``timestamp_field`` and ``OP`` columns.
        deltatable: ``delta.DeltaTable`` that receives the merge.

    Returns:
        None. The merge is executed for its side effect on ``deltatable``.
    """
    # The query below selects from this view, so the batch has to be registered first.
    view_name = f"view_{tablename}"
    df.createOrReplaceTempView(view_name)

    query = f'''
      SELECT *
      FROM {view_name}
      QUALIFY ROW_NUMBER() OVER (PARTITION BY {id_field} ORDER BY {timestamp_field} DESC) = 1
    '''

    # Run the query on the batch's own session: that is the session the temp view was
    # registered on, which is not guaranteed to be the notebook-level `spark`.
    df_cdc = df.sparkSession.sql(query)

    (deltatable.alias("b")
        .merge(df_cdc.alias("d"), f"b.{id_field} = d.{id_field}")
        .whenMatchedDelete(condition = "d.OP = 'D'")
        .whenMatchedUpdateAll(condition = "d.OP = 'U'")
        .whenNotMatchedInsertAll(condition = "d.OP = 'I' or d.OP = 'U'")
        # Without execute() the merge is only described, never applied.
        .execute()
    )


# Stream writer that hands every micro-batch to `upsert` with `bronze` as the merge target.
# `upsert` is defined above so the callback can resolve it when it is wired in.
# The writer is only configured here; nothing runs until it is started.
stream = (df_stream.writeStream
          .option("checkpointLocation", f"/Volumes/workspace/upsell/cdc/{tablename}_checkpoint/")
          .foreachBatch(lambda batch_df, batch_id: upsert(batch_df, bronze))
        )