In [1]:
import os

from pyspark.sql import SparkSession


# Create the Spark session
def create_spark_session() -> SparkSession:
    """Build the Spark session used by this notebook.

    The session is named 'Dual Storage Integration Delta', has Hive support
    enabled, registers the Delta Lake SQL extension and catalog, and maps
    both the ``s3a`` and ``s3minio`` filesystem schemes to Hadoop's
    ``S3AFileSystem`` implementation.

    Returns:
        The session obtained from ``SparkSession.builder.getOrCreate()``.
    """
    s3a_filesystem = 'org.apache.hadoop.fs.s3a.S3AFileSystem'
    session_settings = {
        # Delta Lake integration
        'spark.sql.extensions': 'io.delta.sql.DeltaSparkSessionExtension',
        'spark.sql.catalog.spark_catalog': (
            'org.apache.spark.sql.delta.catalog.DeltaCatalog'
        ),
        # Both URI schemes are served by the same Hadoop filesystem class
        'spark.hadoop.fs.s3a.impl': s3a_filesystem,
        'spark.hadoop.fs.s3minio.impl': s3a_filesystem,
    }

    builder = SparkSession.builder.appName('Dual Storage Integration Delta')
    builder = builder.enableHiveSupport()
    for setting, value in session_settings.items():
        builder = builder.config(setting, value)
    return builder.getOrCreate()


# Notebook-wide session; the ingestion cells that follow read and write through it.
spark = create_spark_session()

In [2]:
def _without_unset(options):
    """Return a copy of ``options`` without the entries whose value is ``None``.

    ``os.getenv`` returns ``None`` for variables that are not set; dropping
    those entries keeps ``None`` out of the reader/writer ``.options(**...)``
    calls, so an unset variable simply leaves that setting unspecified.
    """
    return {key: value for key, value in options.items() if value is not None}


# Settings for AWS S3 (read side)
s3_options = _without_unset(
    {
        'fs.s3a.access.key': os.getenv('AWS_ACCESS_KEY_ID'),
        'fs.s3a.secret.key': os.getenv('AWS_SECRET_ACCESS_KEY'),
        'fs.s3a.endpoint': os.getenv('AWS_ENDPOINT'),
        # The S3A connector reads the region from 'fs.s3a.endpoint.region';
        # the previously used 'fs.s3a.region' is not an S3A property, so the
        # AWS_REGION value never reached the connector.
        'fs.s3a.endpoint.region': os.getenv('AWS_REGION'),
        'fs.s3a.path.style.access': 'false',
    }
)

# Settings for MinIO (write side, stored as Delta)
minio_delta_options = _without_unset(
    {
        'fs.s3a.access.key': os.getenv('MINIO_ROOT_USER'),
        'fs.s3a.secret.key': os.getenv('MINIO_ROOT_PASSWORD'),
        'fs.s3a.endpoint': os.getenv('MINIO_ENDPOINT'),
        'fs.s3a.path.style.access': 'true',
        'fs.s3a.connection.ssl.enabled': 'false',
    }
)

In [3]:
# Ingest the 'league' table from the AWS S3 landing bucket (Parquet)
df_league = spark.read.load(
    's3a://dev-lab-02-us-east-2-landing/soccer/league/',
    format='parquet',
    **s3_options,
)

df_league.show()


# Save it as a Delta Lake table on MinIO, replacing any existing data
df_league.write.save(
    's3a://bucket-bronze-zone/soccer/league/',
    format='delta',
    mode='overwrite',
    **minio_delta_options,
)

print('Tabela Delta gravada com sucesso no MinIO!')

+--------------------+---------------------+--------------+----------------------+-----+--------------------+----------+
|     _airbyte_raw_id|_airbyte_extracted_at| _airbyte_meta|_airbyte_generation_id|   id|                name|country_id|
+--------------------+---------------------+--------------+----------------------+-----+--------------------+----------+
|9202d4ef-8836-419...| 2025-05-08 03:17:...|{35826647, []}|                     3|    1|Belgium Jupiler L...|         1|
|e3a53785-5171-442...| 2025-05-08 03:17:...|{35826647, []}|                     3| 1729|England Premier L...|      1729|
|5cd88425-2ebf-40f...| 2025-05-08 03:17:...|{35826647, []}|                     3| 4769|      France Ligue 1|      4769|
|c23131b2-e6ab-4a8...| 2025-05-08 03:17:...|{35826647, []}|                     3| 7809|Germany 1. Bundes...|      7809|
|01dd48c8-f68b-490...| 2025-05-08 03:17:...|{35826647, []}|                     3|10257|       Italy Serie A|     10257|
|0b766c8f-d617-4d4...| 2025-05-0

In [4]:
# Ingest the 'team' table from the AWS S3 landing bucket (Parquet)
df_team = spark.read.load(
    's3a://dev-lab-02-us-east-2-landing/soccer/team/',
    format='parquet',
    **s3_options,
)

df_team.show()

# Write it to MinIO as a Delta Lake table, replacing any existing data
df_team.write.save(
    's3a://bucket-bronze-zone/soccer/team/',
    format='delta',
    mode='overwrite',
    **minio_delta_options,
)

print("Tabela 'team' gravada como Delta no MinIO!")

+--------------------+---------------------+--------------+----------------------+----+-----------+--------------------+---------------+----------------+
|     _airbyte_raw_id|_airbyte_extracted_at| _airbyte_meta|_airbyte_generation_id|  id|team_api_id|      team_long_name|team_short_name|team_fifa_api_id|
+--------------------+---------------------+--------------+----------------------+----+-----------+--------------------+---------------+----------------+
|8a375502-f6f7-4d6...| 2025-05-08 03:17:...|{35826647, []}|                     3|   1|       9987|            KRC Genk|            GEN|             673|
|adbef74f-106b-415...| 2025-05-08 03:17:...|{35826647, []}|                     3|   2|       9993|        Beerschot AC|            BAC|             675|
|e657504f-87c7-459...| 2025-05-08 03:17:...|{35826647, []}|                     3|   3|      10000|    SV Zulte-Waregem|            ZUL|           15005|
|8484e9ec-4f55-473...| 2025-05-08 03:17:...|{35826647, []}|                 

In [5]:
# Ingest the 'match' table from the AWS S3 landing bucket (Parquet)
df_match = spark.read.load(
    # AWS side: the prefix is capitalised ('Match')
    's3a://dev-lab-02-us-east-2-landing/soccer/Match/',
    format='parquet',
    **s3_options,
)

# Preview left disabled for this table:
# df_match.show()

# Write it to MinIO as a Delta Lake table, replacing any existing data
df_match.write.save(
    # MinIO side: the prefix is lowercase ('match')
    's3a://bucket-bronze-zone/soccer/match/',
    format='delta',
    mode='overwrite',
    **minio_delta_options,
)

print("Tabela 'match' gravada como Delta no MinIO!")

Tabela 'match' gravada como Delta no MinIO!


In [6]:
# Ingest the 'country' table from the AWS S3 landing bucket (Parquet)
df_country = spark.read.load(
    's3a://dev-lab-02-us-east-2-landing/soccer/country/',
    format='parquet',
    **s3_options,
)

df_country.show()

# Write it to MinIO as a Delta Lake table, replacing any existing data
df_country.write.save(
    's3a://bucket-bronze-zone/soccer/country/',
    format='delta',
    mode='overwrite',
    **minio_delta_options,
)

print("Tabela 'country' gravada como Delta no MinIO!")

# Shut down the Spark session
spark.stop()

+--------------------+---------------------+--------------+----------------------+-----+-----------+
|     _airbyte_raw_id|_airbyte_extracted_at| _airbyte_meta|_airbyte_generation_id|   id|       name|
+--------------------+---------------------+--------------+----------------------+-----+-----------+
|3997cb97-814b-493...| 2025-05-08 03:17:...|{35826647, []}|                     4|    1|    Belgium|
|9f37e0c6-6aa7-40c...| 2025-05-08 03:17:...|{35826647, []}|                     4| 1729|    England|
|67ddf5ae-ab61-47d...| 2025-05-08 03:17:...|{35826647, []}|                     4| 4769|     France|
|67c03fff-a286-4e8...| 2025-05-08 03:17:...|{35826647, []}|                     4| 7809|    Germany|
|f0cd3462-374e-451...| 2025-05-08 03:17:...|{35826647, []}|                     4|10257|      Italy|
|b38d15a4-e45c-4f1...| 2025-05-08 03:17:...|{35826647, []}|                     4|13274|Netherlands|
|f03825e5-170e-40d...| 2025-05-08 03:17:...|{35826647, []}|                     4|15722|   