In [1]:
import json
import logging
import os
from datetime import datetime
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
from pyspark.sql import SparkSession

In [2]:
# MinIO connection settings as a JSON string. Left empty here, in which case the
# next cell falls back to ../variables/minio_connection.json.
# NOTE(review): presumably overridden by an injected notebook parameter — confirm.
minio_connection = ""  

In [3]:
# The connection settings must be loaded for the rest of the notebook to work.
# Accepted forms of `minio_connection`:
#   * a dict  -> used as-is;
#   * a JSON string -> decoded;
#   * anything that cannot be decoded (empty string, None, ...) -> fall back to
#     the local JSON file.
if isinstance(minio_connection, dict):
    minio_conn = minio_connection
else:
    try:
        minio_conn = json.loads(minio_connection)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers non-string values such as None, which json.loads
        # rejects before it even tries to parse.
        with open('../variables/minio_connection.json', "r", encoding="utf-8") as minio_connection_file:
            minio_conn = json.load(minio_connection_file)

In [4]:
class LazySparkSession:
    """Keep S3A credentials and build a Delta-enabled SparkSession on demand.

    Nothing touches Spark until :meth:`start` is called.
    """

    # Maven coordinates passed to ``spark.jars.packages``.
    packages = [
        "io.delta:delta-spark_2.13:4.0.0",
        "org.apache.hadoop:hadoop-aws:3.4.0",
        "com.amazonaws:aws-java-sdk-bundle:1.12.787",
    ]

    def __init__(self, access_key, secret_key, endpoint):
        """Remember the S3A access key, secret key and endpoint for ``start``."""
        self._access_key, self._secret_key, self._endpoint = (
            access_key,
            secret_key,
            endpoint,
        )

    def start(
        self,
        app_name: str = "Airflow Spark Delta Minio App",
        executor_memory: str = "1g",
        driver_memory: str = "1g",
        driver_maxresultsize: str = "1g",
        master_url: str = "local[*]",
    ):
        """Return a SparkSession configured for S3A storage and Delta Lake.

        Args:
            app_name: Spark application name.
            executor_memory: Value for ``spark.executor.memory``.
            driver_memory: Value for ``spark.driver.memory``.
            driver_maxresultsize: Value for ``spark.driver.maxResultSize``.
            master_url: Spark master URL.

        Returns:
            The result of ``getOrCreate()`` on the configured builder.
        """
        spark_options = {
            # S3A credentials and endpoint.
            "spark.hadoop.fs.s3a.access.key": self._access_key,
            "spark.hadoop.fs.s3a.secret.key": self._secret_key,
            "spark.hadoop.fs.s3a.endpoint": self._endpoint,
            "spark.hadoop.delta.enableFastS3AListFrom": "true",
            # Memory sizing.
            "spark.executor.memory": executor_memory,
            "spark.driver.memory": driver_memory,
            "spark.driver.maxResultSize": driver_maxresultsize,
            # Delta Lake SQL extension and catalog.
            "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
            "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
            # Extra jars resolved at start-up.
            "spark.jars.packages": ",".join(self.packages),
            # Port 0 requests any free port for the Spark UI.
            "spark.ui.port": "0",
        }

        session_builder = SparkSession.Builder().appName(app_name).master(master_url)
        for option_name, option_value in spark_options.items():
            session_builder = session_builder.config(option_name, option_value)

        return session_builder.getOrCreate()

In [5]:
# Start the Spark session with the MinIO settings loaded earlier. The values
# are passed positionally in constructor order: access key, secret key
# (stored under "key"), endpoint.
spark = LazySparkSession(
    *(minio_conn.get(campo) for campo in ("access_key", "key", "endpoint"))
).start()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/07/31 15:15:17 WARN Utils: Your hostname, DESKTOP-EDEM2DH, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/07/31 15:15:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/home/edcarlos/projeto-lakehouse/.venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/edcarlos/.ivy2.5.2/cache
The jars for the packages stored in: /home/edcarlos/.ivy2.5.2/jars
io.delta#delta-spark_2.13 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-d91f5142-e249-4a82-bd9d-c11356c3dc7e;1.0
	confs: [default]
	found io.delta#delta-spark_2.13;4.0.0 in central
	found io.delta#delta-storage;4.0.0 in central

In [6]:
# Basic logging setup: INFO level, each record prefixed with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [7]:
# Page that is scraped for the downloadable files. Adjacent literals are
# joined by the parser into a single URL.
url = (
    "https://www.gov.br/anp/pt-br/assuntos/"
    "precos-e-defesa-da-concorrencia/precos/"
    "precos-revenda-e-de-distribuicao-combustiveis/"
    "serie-historica-do-levantamento-de-precos"
)

In [8]:
# Request the page. requests applies no timeout by default, so an explicit one
# keeps the notebook from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
# Raise an error for bad HTTP status codes (4xx or 5xx).
response.raise_for_status()
# Parse the page.
soup = BeautifulSoup(response.text, 'html.parser')

In [9]:
# File extensions of interest (kept as a tuple).
extensoes = (
    ".ods",
    ".xls",
    ".xlsx",
    ".csv",
    ".zip",
    ".json",
    ".rar",
)

In [10]:
# Accumulates the absolute URLs of the files found on the page; filled by the
# scan loop in the next cell and later passed to the downloader.
links_de_arquivos: list[str] = []

In [11]:
# Walk every <a> tag that has an href and collect the monthly ("mensal") files.
for ancora in soup.find_all("a", href=True):
    destino = ancora["href"]
    destino_minusculo = destino.lower()

    # str.endswith accepts a tuple: true when any wanted extension matches.
    if not destino_minusculo.endswith(extensoes):
        continue

    # "mensal" may appear either in the href or in the visible link text.
    rotulo = ancora.get_text(strip=True).lower()
    if "mensal" not in destino_minusculo and "mensal" not in rotulo:
        continue

    # Build the absolute link when the href is relative; skip duplicates.
    link_absoluto = urljoin(url, destino)
    if link_absoluto in links_de_arquivos:
        continue
    links_de_arquivos.append(link_absoluto)

In [12]:
# Static snapshot of the links extracted with BeautifulSoup (one URL per entry).
# Each literal must end with a comma: without it Python concatenates adjacent
# string literals and the list collapses into a single, invalid URL.
# Note: the download call further down uses `links_de_arquivos`, not this list.
links = [
    "https://www.gov.br/anp/pt-br/assuntos/precos-e-defesa-da-concorrencia/precos/precos-revenda-e-de-distribuicao-combustiveis/shlp/2001-2012/mensal-brasil-2001-a-2012.xlsx",
    "https://www.gov.br/anp/pt-br/assuntos/precos-e-defesa-da-concorrencia/precos/precos-revenda-e-de-distribuicao-combustiveis/shlp/2001-2012/mensal-regioes-2001-a-2012.xlsx",
    "https://www.gov.br/anp/pt-br/assuntos/precos-e-defesa-da-concorrencia/precos/precos-revenda-e-de-distribuicao-combustiveis/shlp/2001-2012/mensal-estados-2001-a-2012.xlsx",
    "https://www.gov.br/anp/pt-br/assuntos/precos-e-defesa-da-concorrencia/precos/precos-revenda-e-de-distribuicao-combustiveis/shlp/mensal/mensal-brasil-desde-jan2013.xlsx",
    "https://www.gov.br/anp/pt-br/assuntos/precos-e-defesa-da-concorrencia/precos/precos-revenda-e-de-distribuicao-combustiveis/shlp/mensal/mensal-regioes-desde-jan2013.xlsx",
    "https://www.gov.br/anp/pt-br/assuntos/precos-e-defesa-da-concorrencia/precos/precos-revenda-e-de-distribuicao-combustiveis/shlp/mensal/mensal-estados-desde-jan2013.xlsx",
    "https://www.gov.br/anp/pt-br/assuntos/precos-e-defesa-da-concorrencia/precos/precos-revenda-e-de-distribuicao-combustiveis/shlp/mensal/mensal-municipios-2013-a-2015.xlsx",
    "https://www.gov.br/anp/pt-br/assuntos/precos-e-defesa-da-concorrencia/precos/precos-revenda-e-de-distribuicao-combustiveis/shlp/mensal/mensal-municipios-2016-a-2018.xlsx",
    "https://www.gov.br/anp/pt-br/assuntos/precos-e-defesa-da-concorrencia/precos/precos-revenda-e-de-distribuicao-combustiveis/shlp/mensal/mensal-municipios-2019-a-2021.xlsx",
    "https://www.gov.br/anp/pt-br/assuntos/precos-e-defesa-da-concorrencia/precos/precos-revenda-e-de-distribuicao-combustiveis/shlp/mensal/mensal-municipios-jan2022-2025.xlsx",
]


In [13]:
def baixar_arquivos_com_metadados(links, pasta_destino="download", timeout=60):
    """Download every link into ``pasta_destino`` and describe the successes.

    The download is best effort: a failed link is logged and skipped so the
    remaining links are still processed.

    Args:
        links: Iterable of file URLs to download.
        pasta_destino: Directory where the files are saved; created if missing.
        timeout: Timeout in seconds passed to ``requests.get`` so an
            unresponsive server cannot block the run indefinitely.

    Returns:
        A list with one dict per file saved, holding the keys
        ``nome_arquivo``, ``link_origem``, ``caminho_local`` and
        ``data_download`` (ISO-formatted local timestamp taken right before
        the request is sent).
    """
    os.makedirs(pasta_destino, exist_ok=True)
    registros = []

    for link in links:
        # Take the name from the URL path only, so a query string or fragment
        # ("file.xlsx?download=1") never ends up in the local filename.
        nome_arquivo = urlsplit(link).path.split("/")[-1]
        if not nome_arquivo:
            # A URL ending in "/" has no filename to save under; skip it
            # before spending a request on it.
            logging.warning(f"Erro ao baixar: {link} (URL sem nome de arquivo)")
            continue

        caminho_arquivo = os.path.join(pasta_destino, nome_arquivo)
        data_download = datetime.now().isoformat()

        try:
            resposta = requests.get(link, timeout=timeout)
            if resposta.status_code == 200:
                with open(caminho_arquivo, "wb") as f:
                    f.write(resposta.content)
                logging.info(f"Baixado: {nome_arquivo}")

                registros.append({
                    "nome_arquivo": nome_arquivo,
                    "link_origem": link,
                    "caminho_local": caminho_arquivo,
                    "data_download": data_download
                })
            else:
                logging.warning(f"Erro ao baixar: {link} (Status {resposta.status_code})")
        except Exception as e:
            # Deliberately broad: one bad link must not abort the whole batch.
            logging.error(f"Erro ao baixar {link} → {e}", exc_info=True)

    return registros

In [14]:
# Download the links scraped from the page into the default destination folder
# and keep the metadata of every file saved.
registros_extraidos = baixar_arquivos_com_metadados(links=links_de_arquivos)

2025-07-31 15:15:29,375 - INFO - Baixado: mensal-brasil-2001-a-2012.xlsx
2025-07-31 15:15:30,339 - INFO - Baixado: mensal-regioes-2001-a-2012.xlsx
2025-07-31 15:15:32,403 - INFO - Baixado: mensal-estados-2001-a-2012.xlsx
2025-07-31 15:15:32,973 - INFO - Baixado: mensal-brasil-desde-jan2013.xlsx
2025-07-31 15:15:33,954 - INFO - Baixado: mensal-regioes-desde-jan2013.xlsx
2025-07-31 15:15:36,120 - INFO - Baixado: mensal-estados-desde-jan2013.xlsx
2025-07-31 15:15:43,563 - INFO - Baixado: mensal-municipios-2013-a-2015.xlsx
2025-07-31 15:15:49,223 - INFO - Baixado: mensal-municipios-2016-a-2018.xlsx
2025-07-31 15:15:54,852 - INFO - Baixado: mensal-municipios-2019-a-2021.xlsx
2025-07-31 15:16:01,582 - INFO - Baixado: mensal-municipios-jan2022-2025.xlsx


In [15]:
# Persist the download metadata as a Delta table in the landing bucket.
try:
    if registros_extraidos:
        df = spark.createDataFrame(registros_extraidos)
        df.write.format("delta").mode("overwrite").save("s3a://landing/anp/serie_levantamento_precos")
    else:
        # createDataFrame cannot infer a schema from an empty list, so with
        # nothing downloaded the write is skipped and the existing table is
        # left untouched instead of failing on schema inference.
        logging.warning("Nenhum arquivo baixado; nada a gravar em s3a://landing/anp/serie_levantamento_precos")
finally:
    # Always release the Spark session, even when the write fails.
    spark.stop()

25/07/31 15:16:03 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
25/07/31 15:16:06 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/07/31 15:16:11 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/07/31 15:16:11 WARN S3ABlockOutputStream: Application invoked the Syncable API against stream writing to anp/serie_levantamento_precos/part-00007-78d574e8-9a65-434b-bf5a-92620c0d928f-c000.snappy.parquet. This is Unsupported
                                                                            