In [15]:
import duckdb
import os
from dotenv import load_dotenv, find_dotenv
from deltalake import write_deltalake

load_dotenv(find_dotenv())

con = duckdb.connect()

AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_MINIO")
AWS_SECRET_KEY_ID = os.getenv("AWS_SECRET_KEY_MINIO")
HOST_MINIO = os.getenv("HOST_MINIO")
PORT_MINIO = os.getenv("PORT_MINIO")

con.sql(f""" CREATE SECRET secret1 (
            TYPE S3,
            KEY_ID '{AWS_ACCESS_KEY_ID}',
            SECRET '{AWS_SECRET_KEY_ID}',
            REGION 'us-east-1',
            ENDPOINT '{HOST_MINIO}:{PORT_MINIO}',
            URL_STYLE 'path',
            USE_SSL 'false'

        );
    """)

storage_options = {
    "AWS_ACCESS_KEY_ID": f"{AWS_ACCESS_KEY_ID}",
    "AWS_SECRET_ACCESS_KEY": f"{AWS_SECRET_KEY_ID}",
    "AWS_ENDPOINT_URL": f"http://{HOST_MINIO}:{PORT_MINIO}",
    "AWS_allow_http": "true",
    "AWS_REGION": "us-east-1",
    "AWS_S3_ALLOW_UNSAFE_RENAME": "true",
}

In [16]:
path_land = "s3://land/uff/projeto_comex"
path_bronze = "s3://bronze/uff/projeto_comex"

In [22]:
df = con.sql(f""" 
           SELECT
           CAST(CO_ANO AS INT) AS ANO,
           CAST(CO_MES AS INT) AS MES,
           CAST(CO_NCM AS INT) AS FK_NCM,
           CAST(CO_PAIS AS INT) AS FK_PAIS,
           SG_UF_NCM AS FK_SIGA_ESTADO,
           QT_ESTAT AS QUANTIDADE,
           KG_LIQUIDO AS KG_LIQUIDO,
           VL_FOB AS VALOR_MERCADORIA,
           VL_FRETE AS VALOR_FRETE,
           VL_SEGURO AS VALOR_SEGURO
           FROM '{path_land}/IMP/*.parquet'
           """).to_arrow_table()

In [24]:
write_deltalake(
    f"{path_bronze}/IMP",
    df,
    mode="append",
    storage_options=storage_options,
    partition_by=["ANO"],
)

In [None]:
max_ano = con.sql("SELECT MAX(ANO) FROM df").to_df().squeeze()

max_mes = con.sql(f"SELECT MAX(MES) FROM df WHERE ANO = {max_ano}").to_df().squeeze()

np.int32(10)

In [34]:
df = con.sql(f""" 
           SELECT
           CAST(CO_ANO AS INT) AS ANO,
           CAST(CO_MES AS INT) AS MES,
           CAST(CO_NCM AS INT) AS FK_NCM,
           CAST(CO_PAIS AS INT) AS FK_PAIS,
           SG_UF_NCM AS FK_SIGA_ESTADO,
           QT_ESTAT AS QUANTIDADE,
           KG_LIQUIDO AS KG_LIQUIDO,
           VL_FOB AS VALOR_MERCADORIA,
           VL_FRETE AS VALOR_FRETE,
           VL_SEGURO AS VALOR_SEGURO
           FROM '{path_land}/IMP/*.parquet'
           WHERE CAST(CO_ANO AS INT) > {max_ano} OR (   
           CAST(CO_ANO AS INT) = {max_ano} AND 
           CAST(CO_MES AS INT) > {max_mes}
           )
           """).to_arrow_table()

In [37]:
if len(df) > 0:
    write_deltalake(
        f"{path_bronze}/IMP",
        df,
        mode="append",
        storage_options=storage_options,
        partition_by=["ANO"],
    )

In [38]:
con.close()