# Bibliotecas

In [1]:
from pyspark.sql import SparkSession
from delta import *
import os
from delta.tables import DeltaTable

# Sessão SPARK

In [2]:
# Build (or reuse) the Spark session shared by every cell below: it targets the
# standalone master, requests the Delta Lake and S3A connector packages, and
# points the S3A filesystem at the MinIO endpoint taken from the environment.

# os.getenv() returns None for an unset variable, so a missing setting would be
# handed to Spark instead of failing here. Check the required variables up
# front and report all missing ones at once.
_minio_env = {
    name: os.getenv(name)
    for name in ("MINIO_HOST", "MINIO_ACCESS_KEY", "MINIO_SECRET_KEY")
}
_missing_env = [name for name, value in _minio_env.items() if not value]
if _missing_env:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    # Extra packages resolved when the session starts.
    .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-aws:3.2.2,"
        "io.delta:delta-spark_2.12:3.2.0,"
        "io.delta:delta-storage:3.2.0,"
        "com.amazonaws:aws-java-sdk-bundle:1.12.180",
    )
    # Driver / executor sizing.
    .config("spark.executor.memory", "4g")
    .config("spark.executor.cores", "2")
    .config("spark.driver.memory", "4g")
    .config("spark.driver.cores", "2")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "1")
    .config("spark.dynamicAllocation.maxExecutors", "2")
    # Delta Lake SQL extension and catalog.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # S3A filesystem settings for MinIO (endpoint and credentials validated above).
    .config("spark.hadoop.fs.s3a.endpoint", _minio_env["MINIO_HOST"])
    .config("spark.hadoop.fs.s3a.access.key", _minio_env["MINIO_ACCESS_KEY"])
    .config("spark.hadoop.fs.s3a.secret.key", _minio_env["MINIO_SECRET_KEY"])
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.com.amazonaws.services.s3.enableV4", "true")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.hadoop.fs.AbstractFileSystem.s3a.impl", "org.apache.hadoop.fs.s3a.S3A")
    .getOrCreate()
)

In [3]:
emp_path = "s3a://silver/employe"

# Register the `employe` Delta table at emp_path; IF NOT EXISTS makes this a
# no-op when the table is already defined.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS employe (
    emp_id INT,
    emp_name STRING,
    dept_code STRING,
    salary DOUBLE
) USING DELTA LOCATION '{emp_path}'
""")

# Sample rows used to seed the table.
seed_employe_sql = """
INSERT INTO employe (emp_id, emp_name, dept_code, salary)
VALUES
    (1001, 'Alice', 'D101', 50000),
    (1002, 'Bob', 'D102', 60000),
    (1003, 'Charlie', 'D103', 70000)
"""

# Seed only while the table is still empty, so re-running this cell does not
# append another copy of the same three rows.
if spark.table("employe").limit(1).count() == 0:
    spark.sql(seed_employe_sql)

DataFrame[]

# Tabela Source

In [3]:
# Read the Delta table stored at emp_path and expose it to SQL as the temp
# view `employe`.
# NOTE(review): this view has the same name as the `employe` table created
# from s3a://silver/employe earlier in the notebook -- confirm which location
# is the intended source.
emp_path = "s3a://bronze/employe"
employe_source_df = spark.read.format("delta").load(emp_path)
employe_source_df.createOrReplaceTempView("employe")

In [13]:
# Print the current rows of `employe`.
employe_rows = spark.sql("""
select * from employe
""")
employe_rows.show()

+------+--------+---------+-------+
|emp_id|emp_name|dept_code| salary|
+------+--------+---------+-------+
|  1002|     Bob|     D102|60000.0|
|  1003| Charlie|     D103|70000.0|
|  1007|   Grace|     D107|75000.0|
|  1006|   Frank|     D106|70000.0|
|  1002|     Bob|     D102|62000.0|
+------+--------+---------+-------+



In [14]:
# Append three sample rows to `employe`.
insert_employe_sql = """
INSERT INTO employe (emp_id, emp_name, dept_code, salary)
VALUES
    (1001, 'Alice', 'D101', 50000),
    (1002, 'Bob', 'D102', 60000),
    (1003, 'Charlie', 'D103', 70000)
"""
spark.sql(insert_employe_sql)

DataFrame[]

In [12]:
spark.sql("""
DELETE FROM employe WHERE emp_id = '1001'
""")

DataFrame[num_affected_rows: bigint]

# Tabela Target

In [4]:
emp_path = "s3a://silver/employe_scd2"

# Register the `employe_scd2` Delta table at emp_path: the employee columns
# plus start_date / end_date / is_current tracking columns. IF NOT EXISTS makes
# this a no-op when the table is already defined.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS employe_scd2 (
    emp_id INT,
    emp_name STRING,
    dept_code STRING,
    salary DOUBLE,
    start_date DATE,
    end_date DATE,
    is_current BOOLEAN
) USING DELTA LOCATION '{emp_path}'
""")

# Initial rows: every employee starts with a NULL end_date and is_current = TRUE.
# start_date uses typed DATE literals so the value already matches the DATE
# column instead of depending on a string-to-date cast at insert time.
seed_employe_scd2_sql = """
INSERT INTO employe_scd2 (emp_id, emp_name, dept_code, salary, start_date, end_date, is_current)
VALUES
    (1001, 'Alice', 'D101', 50000, DATE '2020-01-01', NULL, TRUE),
    (1002, 'Bob', 'D102', 60000, DATE '2020-01-01', NULL, TRUE),
    (1003, 'Charlie', 'D103', 70000, DATE '2020-01-01', NULL, TRUE)
"""

# Seed only while the table is still empty, so re-running this cell does not
# append another copy of the same three rows.
if spark.table("employe_scd2").limit(1).count() == 0:
    spark.sql(seed_employe_scd2_sql)

DataFrame[]