In [2]:
import os
from io import BytesIO

import duckdb
import pandas as pd
import requests
from dotenv import load_dotenv, find_dotenv

In [3]:
# Load the variables from the .env file located by find_dotenv() into the
# process environment, then read the object-storage settings from it.
load_dotenv(find_dotenv())

AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_MINIO")
AWS_SECRET_KEY_ID = os.getenv("AWS_SECRET_KEY_MINIO")
HOST_MINIO = os.getenv("HOST_MINIO")
PORT_MINIO = os.getenv("PORT_MINIO")

# Fail fast on incomplete configuration: os.getenv returns None for an unset
# variable, and None would otherwise be formatted as the text "None" wherever
# these values are interpolated into a string.
_variaveis_ausentes = [
    nome_variavel
    for nome_variavel, valor in (
        ("AWS_ACCESS_KEY_MINIO", AWS_ACCESS_KEY_ID),
        ("AWS_SECRET_KEY_MINIO", AWS_SECRET_KEY_ID),
        ("HOST_MINIO", HOST_MINIO),
        ("PORT_MINIO", PORT_MINIO),
    )
    if valor is None
]
if _variaveis_ausentes:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_variaveis_ausentes)
    )

In [4]:
# Open a new in-memory DuckDB database; the SQL in this notebook runs through `con`.
con = duckdb.connect(database=":memory:")

In [5]:
# Register the S3 credentials and endpoint on the DuckDB connection so that
# s3:// paths can be read and written.
# CREATE OR REPLACE keeps this cell re-runnable: a plain CREATE SECRET raises
# once a secret named `secret1` already exists on the connection.
# NOTE(review): path-style URLs with SSL disabled presumably target a local
# MinIO server reached over plain HTTP — confirm before pointing elsewhere.
con.sql(f""" CREATE OR REPLACE SECRET secret1 (
            TYPE S3,
            KEY_ID '{AWS_ACCESS_KEY_ID}',
            SECRET '{AWS_SECRET_KEY_ID}',
            REGION 'us-east-1',
            ENDPOINT '{HOST_MINIO}:{PORT_MINIO}',
            URL_STYLE 'path',
            USE_SSL 'false'
        );
    """)

┌─────────┐
│ Success │
│ boolean │
├─────────┤
│ true    │
└─────────┘

In [6]:
# Base S3 prefix under which this notebook writes and reads its Parquet files.
path_land = "s3://land/uff/projeto_comex"

In [7]:
# File-name prefixes and years that together identify each yearly CSV to fetch.
lista_importacao_exportacao = ["EXP", "IMP"]
lista_anos = [2023, 2024]

# Every (prefix, year) pair, in the same order nested loops would visit them.
combinacoes = [
    (fluxo, ano)
    for fluxo in lista_importacao_exportacao
    for ano in lista_anos
]

# DuckDB reads each remote CSV and writes it as one Parquet file per pair
# under `path_land`, grouped in a folder named after the prefix.
for fluxo, ano in combinacoes:
    con.sql(f"""
                COPY (SELECT * FROM 'https://balanca.economia.gov.br/balanca/bd/comexstat-bd/ncm/{fluxo}_{ano}.csv')
                TO '{path_land}/{fluxo}/{ano}.parquet'
                """)

In [9]:
# Names of the lookup-table CSVs to fetch; each becomes one Parquet file
# directly under `path_land`.
categorias = ["PAIS_BLOCO", "PAIS", "UF", "NCM"]

for item_cat in categorias:
    url = f"https://balanca.economia.gov.br/balanca/bd/tabelas/{item_cat}.csv"

    # The timeout keeps the cell from hanging on a stalled connection, and
    # raise_for_status() stops an HTTP error page from being parsed as CSV.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    # Parse the raw bytes so that the ISO-8859-1 setting is actually applied:
    # pandas ignores `encoding` when it is handed already-decoded text, which
    # previously left the decoding to requests' own charset guess.
    df = pd.read_csv(BytesIO(response.content), sep=";", encoding="ISO-8859-1")

    # DuckDB resolves `df` inside the query to the pandas DataFrame above.
    con.execute(f"""
            COPY (SELECT * FROM df )
            TO '{path_land}/{item_cat}.parquet'
            """)

In [17]:
# Sanity check: list the distinct CO_ANO values found in the Parquet files
# under the EXP prefix.
# ORDER BY makes the displayed rows reproducible; DISTINCT on its own does not
# guarantee any particular row order.
con.sql(f"""
        SELECT DISTINCT CO_ANO
        FROM '{path_land}/EXP/*.parquet'
        ORDER BY CO_ANO
        """)

┌────────┐
│ CO_ANO │
│ int64  │
├────────┤
│   2023 │
│   2024 │
└────────┘