### Importação de configurações e funções

In [0]:
%run /Workspace/Users/kgenuins@emeal.nttdata.com/project-insight-lab-databricks/Config/storage_config

In [0]:
%run /Workspace/Users/kgenuins@emeal.nttdata.com/project-insight-lab-databricks/Config/secrets_config

In [0]:

from pyspark.sql.functions import (
    input_file_name,
    regexp_extract,
    current_timestamp,
    lit,
    col,
    lpad
)

### Atribuição de variáveis e path

In [0]:
# Labels written to the `tipo_operacao` column to identify each ingested dataset
# (the EXP/IMP labels are also lower-cased to build the bronze output paths).
TIPO_OPERACAO_EXP, TIPO_OPERACAO_IMP = "EXP", "IMP"
NCM, TB_AUX = "NCM", "TB_AUX"

In [0]:
# Root folder holding the source CSV files. `balanca_comercial_path` is not
# defined in this notebook (presumably provided by the %run config notebooks).
input_path = str(balanca_comercial_path)

In [0]:
# Bronze-layer destinations, one folder per dataset under
# "<bronze_path>/balancacomercial".
bronze_balanca_path = f"{bronze_path}/balancacomercial"

output_path_exp = f"{bronze_balanca_path}/{TIPO_OPERACAO_EXP.lower()}"
output_path_imp = f"{bronze_balanca_path}/{TIPO_OPERACAO_IMP.lower()}"
output_path_ncm = f"{bronze_balanca_path}/ncm"
output_path_tb_aux = f"{bronze_balanca_path}/tb_aux"


In [0]:
# List the contents of the source folder; as the last expression of the cell,
# the listing is shown as the cell output.
dbutils.fs.ls(balanca_comercial_path)

### Leitura e escrita de arquivos (EXP e IMP)

In [0]:
# Load every file whose name matches "EXP*" under input_path (searched
# recursively) as semicolon-delimited CSV with a header row.
df_raw_exp = spark.read.options(
    header="true",
    delimiter=";",
    recursiveFileLookup="true",
    pathGlobFilter="EXP*",
).csv(input_path)

In [0]:
# Preview 10 rows of the raw export data.
display(df_raw_exp.limit(10))

In [0]:
# Load every file whose name matches "IMP*" under input_path (searched
# recursively) as semicolon-delimited CSV with a header row.
df_raw_imp = spark.read.options(
    header="true",
    delimiter=";",
    recursiveFileLookup="true",
    pathGlobFilter="IMP*",
).csv(input_path)

In [0]:
# Preview 10 rows of the raw import data.
display(df_raw_imp.limit(10))

In [0]:
# Columns added on top of the raw export data, applied in this order:
# source file path, partition keys taken from CO_ANO / CO_MES, the operation
# label and the ingestion timestamp.
bronze_exp_columns = [
    ("file_name", input_file_name()),
    ("ano", col("CO_ANO").cast("int")),
    ("mes", col("CO_MES")),  # kept as read from the CSV (no cast)
    ("tipo_operacao", lit(TIPO_OPERACAO_EXP)),
    ("data_ingestao", current_timestamp()),
]

df_bronze_exp = df_raw_exp
for column_name, column_expr in bronze_exp_columns:
    df_bronze_exp = df_bronze_exp.withColumn(column_name, column_expr)


In [0]:
# Preview 10 rows of the export data with the added metadata columns.
display(df_bronze_exp.limit(10))

In [0]:
# Columns added on top of the raw import data, applied in this order:
# source file path, partition keys taken from CO_ANO / CO_MES, the operation
# label and the ingestion timestamp.
bronze_imp_columns = [
    ("file_name", input_file_name()),
    ("ano", col("CO_ANO").cast("int")),
    ("mes", col("CO_MES")),  # kept as read from the CSV (no cast)
    ("tipo_operacao", lit(TIPO_OPERACAO_IMP)),
    ("data_ingestao", current_timestamp()),
]

df_bronze_imp = df_raw_imp
for column_name, column_expr in bronze_imp_columns:
    df_bronze_imp = df_bronze_imp.withColumn(column_name, column_expr)


In [0]:
# Preview 10 rows of the import data with the added metadata columns.
display(df_bronze_imp.limit(10))


In [0]:
# Persist the EXP bronze data as a Delta table partitioned by year and month.
# "overwrite" (instead of "append") keeps this cell idempotent: df_raw_exp is a
# full re-read of every EXP* file under input_path, so appending would duplicate
# all rows each time the notebook is re-run. This also matches how the IMP
# table is written.
(
    df_bronze_exp
    .write
    .format("delta")
    .mode("overwrite")
    .partitionBy("ano", "mes")
    .save(output_path_exp)
)


In [0]:
# Replace the IMP bronze Delta table with the contents of df_bronze_imp,
# partitioned by year ("ano") and month ("mes").
df_bronze_imp.write.save(
    output_path_imp,
    format="delta",
    mode="overwrite",
    partitionBy=["ano", "mes"],
)

### Ingestão NCM

In [0]:
# Load the file(s) named exactly "NCM.csv" under input_path (searched
# recursively) as cp1252-encoded, semicolon-delimited CSV with a header row.
df_raw_ncm = spark.read.options(
    header="true",
    delimiter=";",
    recursiveFileLookup="true",
    pathGlobFilter="NCM.csv",
    encoding="cp1252",
).csv(input_path)

In [0]:
# Columns added on top of the raw NCM data, applied in this order:
# source file path, dataset label and ingestion timestamp.
bronze_ncm_columns = [
    ("file_name", input_file_name()),
    ("tipo_operacao", lit(NCM)),
    ("data_ingestao", current_timestamp()),
]

df_bronze_ncm = df_raw_ncm
for column_name, column_expr in bronze_ncm_columns:
    df_bronze_ncm = df_bronze_ncm.withColumn(column_name, column_expr)

In [0]:
# Preview 10 rows of the bronze NCM frame (raw columns plus the metadata
# columns) right before it is written, mirroring the EXP/IMP previews that
# show the bronze frame at this point in the flow.
display(df_bronze_ncm.limit(10))

In [0]:
# Replace the NCM bronze Delta table with the contents of df_bronze_ncm
# (no partitioning).
df_bronze_ncm.write.save(
    output_path_ncm,
    format="delta",
    mode="overwrite",
)

### Ingestão Tabelas Auxiliares

In [0]:
# Read every file under input_path (recursively, with no file-name filter) as
# cp1252-encoded, semicolon-delimited CSV with a header row.
# NOTE(review): this also loads the EXP/IMP/NCM files that are filtered out in
# a later cell, and all files share a single DataFrame schema even though
# their headers presumably differ — confirm this is intended.
df_raw_all = spark.read.options(
    header="true",
    delimiter=";",
    recursiveFileLookup="true",
    encoding="cp1252",
).csv(input_path)

#### Leitura de todos os arquivos e filtro para tabelas auxiliares

In [0]:
# Predicates on the source file path that identify the datasets ingested in
# the previous sections (EXP, IMP and NCM).
source_file = col("file_name")
is_imp_file = source_file.rlike("/IMP")
is_exp_file = source_file.rlike("/EXP")
is_ncm_file = source_file.rlike("(?i)ncm\\.csv$")

# Tag each row with its source file, keep only the rows that match none of the
# predicates above, then add the metadata columns and "nome" (the last path
# segment of the file path, i.e. the bare file name).
df_raw_tb_aux = (
    df_raw_all
    .withColumn("file_name", input_file_name())
    .filter(~(is_imp_file | is_exp_file | is_ncm_file))
    .withColumn("tipo_operacao", lit(TB_AUX))
    .withColumn("data_ingestao", current_timestamp())
    .withColumn("nome", regexp_extract(source_file, r"([^/]+$)", 1))
)

In [0]:
# Sanity check after the filter: row count per remaining source file.
# Groups by "nome" (the bare file name); the previously used "arquivo_origem"
# column is never created on df_raw_tb_aux, so grouping by it fails.
display(df_raw_tb_aux.groupBy("nome").count())

In [0]:
# Bring the distinct auxiliary file names to the driver as a list of Rows,
# each carrying a single "nome" field.
distinct_file_names = df_raw_tb_aux.select("nome").distinct()
arquivos_csv = distinct_file_names.collect()

#### Escrita por nome do arquivo .csv

In [0]:
# Write one Delta table per auxiliary file, in a folder named after the
# lower-cased file name with ".csv" removed.
for row in arquivos_csv:
    nome_arquivo = row["nome"]
    nome_arquivo_csv = nome_arquivo.replace(".csv", "").lower()
    output_path_tb = f"{output_path_tb_aux}/{nome_arquivo_csv}"

    # Rows that came from this file only.
    df_file = df_raw_tb_aux.filter(col("nome") == nome_arquivo)

    # The helper column "nome" is not persisted. These are reference tables,
    # so replacing the previous version with "overwrite" is fine.
    df_file.drop("nome").write.save(
        output_path_tb,
        format="delta",
        mode="overwrite",
    )

    print(f"Delta criado: {nome_arquivo_csv}")