In [1]:
from pyspark.sql import SparkSession
from delta import *
import os
from delta.tables import DeltaTable

In [2]:
# Build (or reuse) the SparkSession used by every cell below.
#   - runs against the standalone master at spark://spark-master:7077
#   - pulls the hadoop-aws, Delta Lake and AWS SDK jars via spark.jars.packages
#   - enables the Delta SQL extension and Delta catalog so `USING DELTA` / MERGE work in spark.sql
#   - points the S3A filesystem at the MinIO endpoint with path-style access and static keys
# The MinIO settings are read with os.environ[...] rather than os.getenv(...): a missing
# variable then raises KeyError naming the variable right here, instead of a None value
# being handed to the builder and surfacing later as a harder-to-read S3A failure.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-aws:3.2.2,"
        "io.delta:delta-spark_2.12:3.2.0,"
        "io.delta:delta-storage:3.2.0,"
        "com.amazonaws:aws-java-sdk-bundle:1.12.180",
    )
    # Resources for the driver and each executor.
    .config("spark.executor.memory", "4g")
    .config("spark.executor.cores", "2")
    .config("spark.driver.memory", "4g")
    .config("spark.driver.cores", "2")
    # Let Spark scale between 1 and 2 executors.
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "1")
    .config("spark.dynamicAllocation.maxExecutors", "2")
    # Delta Lake integration.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # S3A -> MinIO connection settings (required environment variables).
    .config("spark.hadoop.fs.s3a.endpoint", os.environ["MINIO_HOST"])
    .config("spark.hadoop.fs.s3a.access.key", os.environ["MINIO_ACCESS_KEY"])
    .config("spark.hadoop.fs.s3a.secret.key", os.environ["MINIO_SECRET_KEY"])
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.com.amazonaws.services.s3.enableV4", "true")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.hadoop.fs.AbstractFileSystem.s3a.impl", "org.apache.hadoop.fs.s3a.S3A")
    .getOrCreate()
)

In [3]:
# MinIO (S3A) location that stores the data of the Delta table `employe`.
emp_path = "s3a://silver/employe"

# Create the MERGE target table `employe` as a Delta table at that location.
# IF NOT EXISTS keeps this statement safe to re-run.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS employe (
    emp_id INT,
    emp_name STRING,
    dept_code STRING,
    salary DOUBLE
) USING DELTA LOCATION '{emp_path}'
""")

# Seed the sample rows only while the table is still empty. An unconditional
# INSERT INTO would append the same five employees again on every re-run of
# this cell, leaving duplicate emp_id values in the table.
if spark.table("employe").isEmpty():
    spark.sql("""
    INSERT INTO employe (emp_id, emp_name, dept_code, salary)
    VALUES
        (1001, 'Alice', 'D101', 55000),
        (1002, 'Bob', 'D102', 60000),
        (1003, 'Charlie', 'D103', 75000),
        (1004, 'David', 'D104', 65000),
        (1005, 'Eve', 'D105', 70000)
    """)

DataFrame[]

In [13]:
# Display every row currently stored in the `employe` table.
spark.table("employe").show()

+------+--------+---------+-------+
|emp_id|emp_name|dept_code| salary|
+------+--------+---------+-------+
|  1001|   Alice|     D101| 1000.0|
|  1002|     Bob|     D102|60000.0|
|  1006| Matheus|     D106| 8900.0|
|  1003| Charlie|     D103|75000.0|
|  1004|   David|     D104|65000.0|
|  1005|     Eve|     D105|70000.0|
+------+--------+---------+-------+



In [9]:
# MinIO (S3A) location that stores the data of the Delta table `employe_update`.
updates_path = "s3a://bronze/employe"

# Create the MERGE source table `employe_update` as a Delta table at that location.
# IF NOT EXISTS keeps this statement safe to re-run.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS employe_update (
    emp_id INT,
    emp_name STRING,
    dept_code STRING,
    salary DOUBLE
) USING DELTA LOCATION '{updates_path}'
""")

# Load the update batch.
#   - The row tuples of a VALUES list must be comma-separated; the comma after
#     the first row was missing, which makes the statement fail to parse.
#   - INSERT OVERWRITE replaces the previous batch instead of appending to it,
#     so re-running this cell does not leave several source rows with the same
#     emp_id (Delta's MERGE rejects multiple source rows matching one target row).
spark.sql("""
INSERT OVERWRITE employe_update (emp_id, emp_name, dept_code, salary)
VALUES
    (1001, 'Alice', 'D101', 1000),  -- Atualização do salário de Alice
    (1006, 'Matheus', 'D106', 8900) -- Novo funcionário
""")

DataFrame[]

In [10]:
# Display every row of the update batch held in `employe_update`.
spark.table("employe_update").show()

+------+--------+---------+------+
|emp_id|emp_name|dept_code|salary|
+------+--------+---------+------+
|  1006| Matheus|     D106|8900.0|
|  1001|   Alice|     D101|1000.0|
+------+--------+---------+------+



# MERGE 1

In [11]:
# Upsert `employe_update` (alias u) into `employe` (alias e) with Spark SQL,
# pairing rows on emp_id:
#   - matched rows: only `salary` is overwritten with the source value;
#     emp_name and dept_code of the target row are left as they were
#   - source rows with no match: inserted with all of their columns (INSERT *)
# The returned DataFrame carries the affected/updated/deleted/inserted row counts.
spark.sql("""
MERGE INTO employe AS e
USING employe_update AS u
ON e.emp_id = u.emp_id
WHEN MATCHED THEN
    UPDATE SET e.salary = u.salary
WHEN NOT MATCHED THEN
    INSERT *
""")

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [12]:
# Display `employe` again to inspect the rows after the MERGE.
spark.table("employe").show()

+------+--------+---------+-------+
|emp_id|emp_name|dept_code| salary|
+------+--------+---------+-------+
|  1001|   Alice|     D101| 1000.0|
|  1002|     Bob|     D102|60000.0|
|  1006| Matheus|     D106| 8900.0|
|  1003| Charlie|     D103|75000.0|
|  1004|   David|     D104|65000.0|
|  1005|     Eve|     D105|70000.0|
+------+--------+---------+-------+



In [9]:
# Print the Delta history of `employe`, one record per block (vertical layout)
# and without truncating long column values.
(
    spark.sql("DESCRIBE HISTORY employe")
    .show(truncate=False, vertical=True)
)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 version             | 2                                                                                                                                                                                                                                                                                                                                

In [10]:
# Stop the SparkSession created at the top of the notebook; cells that use
# `spark` cannot be run after this until the session is built again.
spark.stop()