In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd

In [2]:
# Build the Spark session used by the rest of the notebook: Delta Lake enabled,
# S3A pointed at a MinIO endpoint, and the MySQL JDBC driver on the classpath.
import os  # kept in this cell: only needed for the environment overrides below

# Connection settings for the MinIO (S3A) source. Each value can be overridden
# through the environment so real credentials never have to be written into the
# notebook; the fallbacks are the original local-development values.
s3a_endpoint = os.environ.get("MINIO_ENDPOINT", "http://localhost:9900")
s3a_access_key = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
s3a_secret_key = os.environ.get("MINIO_SECRET_KEY", "minioadmin123")

# Maven coordinates handed to spark.jars.packages (Delta Lake, S3, and MySQL).
# Joined with commas, which is the separator this setting expects.
spark_packages = ",".join(
    [
        "org.apache.hadoop:hadoop-aws:3.4.1",
        "com.amazonaws:aws-java-sdk-bundle:1.12.262",
        "io.delta:delta-spark_2.13:4.0.0",
        "com.mysql:mysql-connector-j:8.0.33",
    ]
)

builder = (
    SparkSession.builder.appName("DataWarehouse-ETL")
    # Memory configurations for ETL processing
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.maxResultSize", "2g")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    # Jars for Delta Lake, S3, and MySQL
    .config("spark.jars.packages", spark_packages)
    # Delta Lake
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # MinIO (S3A) - Source
    .config("spark.hadoop.fs.s3a.endpoint", s3a_endpoint)
    .config("spark.hadoop.fs.s3a.access.key", s3a_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", s3a_secret_key)
    # Path-style access: the bucket is addressed in the URL path, not the hostname
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # S3A performance configs
    .config("spark.hadoop.fs.s3a.connection.timeout", "60000")
    .config("spark.hadoop.fs.s3a.connection.request.timeout", "60000")
    # SSL is disabled here; the default endpoint above is plain http
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.attempts.maximum", "3")
    .config("spark.hadoop.fs.s3a.retry.limit", "3")
)

# Reuses an already-running session if there is one, otherwise starts a new one.
spark = builder.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/30 12:14:28 WARN Utils: Your hostname, bnguyen-lenovo, resolves to a loopback address: 127.0.1.1; using 10.130.164.154 instead (on interface wlp2s0)
25/09/30 12:14:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/home/bnguyen/Desktop/finance_analytics/venv/lib/python3.13/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/bnguyen/.ivy2.5.2/cache
The jars for the packages stored in: /home/bnguyen/.ivy2.5.2/jars
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
io.delta#delta-spark_2.13 added as a dependency
com.mysql#mysql-connector-j added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ec73cf26-575d-416e-b7e3-fcaf872bd91b;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.4

In [44]:
# Inspect the Delta table that the CDC stream writes into, and display the row
# whose transaction_id is 999999999999 (the cell output shows it as a dummy record).
df_test = spark.read.load("s3a://rootdb/transactions_fixed", format="delta")
df_test.where(df_test["transaction_id"] == 999999999999).show()


+--------------+------------+------------+------------+------+-----------------+-----------+------------+-------------+--------------+-----+--------------------+
|transaction_id|  trans_date|   client_id|     card_id|amount|         use_chip|merchant_id|         mcc|merchant_city|merchant_state|  zip|              errors|
+--------------+------------+------------+------------+------+-----------------+-----------+------------+-------------+--------------+-----+--------------------+
|  999999999999|946684800000|999999999999|999999999999|  AA==|Dummy Transaction|          0|999999999999|   Dummy City|            XX|00000|Dummy record for ...|
+--------------+------------+------------+------------+------+-----------------+-----------+------------+-------------+--------------+-----+--------------------+

