### Importação de configurações e funções

In [0]:
import sys
# Put the project repository on sys.path so the Config/ and Utils/ packages below can be imported.
# NOTE(review): this is an absolute, user-specific workspace path — confirm it is valid for whoever runs the notebook.
sys.path.append("/Workspace/Users/kgenuins@emeal.nttdata.com/project-insight-lab-databricks")

from Config.spark_config import apply_storage_config
# NOTE(review): star import — names used later without a visible definition
# (e.g. balanca_comercial_path, bronze_path) presumably come from this module; confirm.
from Config.storage_config import *
from Utils.bronze_csv_loader import read_csv_with_quotes

# Run the project's storage configuration helper against the notebook's Spark session.
apply_storage_config(spark)

In [0]:

from pyspark.sql.functions import (
    input_file_name,
    regexp_extract,
    current_timestamp,
    lit,
    col,
    lpad
)
import re, os

### Atribuição de variáveis e path

In [0]:
# Labels stamped into the `tipo_operacao` column of each Bronze dataset;
# the EXP/IMP labels are also lower-cased to build the output folder names.
TIPO_OPERACAO_EXP, TIPO_OPERACAO_IMP = "EXP", "IMP"
NCM, TB_AUX = "NCM", "TB_AUX"

In [0]:
# Raw landing location read by every section of this notebook (string form of the configured path).
input_path = "{}".format(balanca_comercial_path)

In [0]:
# Every Bronze output of this notebook sits under the same "balancacomercial" prefix.
bronze_balanca_prefix = f"{bronze_path}balancacomercial"

# One target folder per dataset: exports, imports, NCM catalogue and auxiliary tables.
output_path_exp = "/".join([bronze_balanca_prefix, TIPO_OPERACAO_EXP.lower()])
output_path_imp = "/".join([bronze_balanca_prefix, TIPO_OPERACAO_IMP.lower()])
output_path_ncm = "/".join([bronze_balanca_prefix, "ncm"])
output_path_tb_aux = "/".join([bronze_balanca_prefix, "tb_aux"])


In [0]:
# List the entries of the raw input location; as the cell's last expression the listing is shown as output.
dbutils.fs.ls(input_path)

In [0]:
# Inspect what is currently stored in the EXP Bronze location.
# On a first run the folder has not been written yet (the write happens further down),
# so a missing path is reported instead of aborting a top-to-bottom execution.
try:
    arquivos_saida_exp = dbutils.fs.ls(output_path_exp)
except Exception as erro:
    # NOTE(review): dbutils is expected to surface a missing path as a wrapped
    # java.io.FileNotFoundException — confirm for the storage backend in use.
    # Any other failure is re-raised untouched.
    if "FileNotFoundException" not in str(erro):
        raise
    print(f"Caminho ainda não existe: {output_path_exp}")
    arquivos_saida_exp = []

# Last expression of the cell: rendered as the cell output, as before.
arquivos_saida_exp

### Leitura e escrita de arquivos (EXP e IMP)

In [0]:
# Raw export rows: every file under input_path (searched recursively) whose name
# matches EXP_<4 digits>.csv, read as ';'-delimited ISO-8859-1 CSV with a header row.
# Quote and escape are both the double-quote character.
df_raw_exp = spark.read.options(
    header="true",
    delimiter=";",
    recursiveFileLookup="true",
    pathGlobFilter="EXP_[0-9][0-9][0-9][0-9].csv",
    encoding="iso-8859-1",
    quote='\"',
    escape='\"',
).csv(input_path)

In [0]:
# Preview the first 10 raw export rows.
df_raw_exp.limit(10).display()

In [0]:
# Raw import rows: every file under input_path (searched recursively) whose name
# matches IMP_<4 digits>.csv, read as ';'-delimited ISO-8859-1 CSV with a header row.
# Quote and escape are both the double-quote character.
df_raw_imp = spark.read.options(
    header="true",
    delimiter=";",
    recursiveFileLookup="true",
    pathGlobFilter="IMP_[0-9][0-9][0-9][0-9].csv",
    encoding="iso-8859-1",
    quote='\"',
    escape='\"',
).csv(input_path)

In [0]:
# Preview the first 10 raw import rows.
df_raw_imp.limit(10).display()

In [0]:
# Technical columns appended to the raw export rows, in output order:
# source file, partition keys (year cast to int, month kept exactly as read),
# operation label and ingestion timestamp.
colunas_bronze_exp = {
    "origin_path_name": input_file_name(),
    "ano": col("CO_ANO").cast("int"),
    "mes": col("CO_MES"),
    "tipo_operacao": lit(TIPO_OPERACAO_EXP),
    "ingestion_dt": current_timestamp(),
}

# Apply them one at a time so the semantics match a withColumn chain exactly.
df_bronze_exp = df_raw_exp
for nome_coluna, expressao in colunas_bronze_exp.items():
    df_bronze_exp = df_bronze_exp.withColumn(nome_coluna, expressao)


In [0]:
# Preview the first 10 export rows with the Bronze technical columns.
df_bronze_exp.limit(10).display()

In [0]:
# Technical columns appended to the raw import rows, in output order:
# source file, partition keys (year cast to int, month kept exactly as read),
# operation label and ingestion timestamp.
colunas_bronze_imp = {
    "origin_path_name": input_file_name(),
    "ano": col("CO_ANO").cast("int"),
    "mes": col("CO_MES"),
    "tipo_operacao": lit(TIPO_OPERACAO_IMP),
    "ingestion_dt": current_timestamp(),
}

# Apply them one at a time so the semantics match a withColumn chain exactly.
df_bronze_imp = df_raw_imp
for nome_coluna, expressao in colunas_bronze_imp.items():
    df_bronze_imp = df_bronze_imp.withColumn(nome_coluna, expressao)


In [0]:
# Preview the first 10 import rows with the Bronze technical columns.
df_bronze_imp.limit(10).display()


In [0]:
# Replace the EXP Bronze Delta table with the current load, partitioned by year and month.
# NOTE(review): the "encoding" option is forwarded to the writer unchanged; it presumably
# has no effect on a Delta write — confirm before relying on it.
df_bronze_exp.write.save(
    output_path_exp,
    format="delta",
    mode="overwrite",
    partitionBy=["ano", "mes"],
    encoding="iso-8859-1",
)


In [0]:
# Replace the IMP Bronze Delta table with the current load, partitioned by year and month.
# NOTE(review): the "encoding" option is forwarded to the writer unchanged; it presumably
# has no effect on a Delta write — confirm before relying on it.
df_bronze_imp.write.save(
    output_path_imp,
    format="delta",
    mode="overwrite",
    partitionBy=["ano", "mes"],
    encoding="iso-8859-1",
)

### Ingestão NCM

In [0]:
# Plain CSV read of NCM.csv (searched recursively under input_path), ';'-delimited,
# ISO-8859-1, header row, double quote used as both quote and escape character.
df_raw_ncm = spark.read.options(
    header="true",
    delimiter=";",
    recursiveFileLookup="true",
    pathGlobFilter="NCM.csv",
    encoding="iso-8859-1",
    quote='\"',
    escape='\"',
).csv(input_path)

In [0]:
# Show the column names Spark derived from the NCM header (cell output).
df_raw_ncm.columns

In [0]:
from Utils.bronze_csv_loader import read_csv_with_quotes

In [0]:

from pyspark.sql.types import StructType, StructField, StringType

# Column names used to build the explicit NCM schema below.
expected_cols = [
    "CO_NCM", "CO_UNID", "CO_SH6", "CO_PPE", "CO_PPI", "CO_FAT_AGREG", "CO_CUCI_ITEM",
    "CO_CGCE_N3", "CO_SIIT", "CO_ISIC_CLASSE", "CO_EXP_SUBSET",
    "NO_NCM_POR", "NO_NCM_ESP", "NO_NCM_ING",
]

# Every column is declared as a nullable string.
schema = StructType(
    [StructField(nome_campo, StringType(), True) for nome_campo in expected_cols]
)

# Arguments for the project's quote-aware CSV loader. The schema is passed explicitly,
# so expected_cols is left as None in the call.
# NOTE(review): quarantine_path is a hard-coded /mnt location, unlike the other paths in
# this notebook, which derive from the configured bronze_path — confirm it is intended.
ncm_read_kwargs = dict(
    spark=spark,
    input_path=input_path,
    delimiter=";",
    encoding="iso-8859-1",
    recursive=True,
    path_glob_filter="NCM.csv",
    header=True,
    schema=schema,
    expected_cols=None,
    multiline=True,
    quote="\"",
    escape="\"",
    mode="PERMISSIVE",
    corrupt_col="_corrupt_record",
    ignore_leading_trailing_ws=True,
    quarantine_path="/mnt/bronze_quarentena/ncm",
    quarantine_mode="overwrite",
)

# This rebinds df_raw_ncm, replacing whatever an earlier cell assigned to that name.
df_raw_ncm, df_ncm_corrupt, ncm_cols = read_csv_with_quotes(**ncm_read_kwargs)

df_raw_ncm.display()         # rows that parsed OK (delimiters inside quotes are ignored)
df_ncm_corrupt.display()     # problematic rows, kept for auditing
print(ncm_cols)


In [0]:
# Technical columns appended to the NCM rows, in output order:
# source file, dataset label and ingestion timestamp.
colunas_bronze_ncm = {
    "origin_path_name": input_file_name(),
    "tipo_operacao": lit(NCM),
    "ingestion_dt": current_timestamp(),
}

# Apply them one at a time so the semantics match a withColumn chain exactly.
df_bronze_ncm = df_raw_ncm
for nome_coluna, expressao in colunas_bronze_ncm.items():
    df_bronze_ncm = df_bronze_ncm.withColumn(nome_coluna, expressao)

In [0]:
# Preview the first 10 NCM rows with the Bronze technical columns.
df_bronze_ncm.limit(10).display()

In [0]:
# Replace the NCM Bronze Delta table with the current load.
#
# The table is intentionally NOT partitioned: `ingestion_dt` is a current_timestamp()
# value, so partitioning by it yields one timestamp-named partition per run and, with
# mode("overwrite"), never more than one partition in total. `ingestion_dt` stays as a
# regular column.
#
# overwriteSchema lets this overwrite replace the partitioning of a table that was
# previously written partitioned by `ingestion_dt`; without it Delta rejects the
# partitioning change.
(
    df_bronze_ncm.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("encoding", "iso-8859-1")
    .save(output_path_ncm)
)

### Ingestão Tabelas Auxiliares

#### Leitura de todos os arquivos e filtro para tabelas auxiliares

In [0]:

# File-name patterns that have their own dedicated ingestion in this notebook
# (yearly IMP/EXP files and the NCM catalogue). They are matched case-insensitively
# so they agree with the case-insensitive ".csv" extension check below; otherwise a
# file such as "EXP_2020.CSV" would slip through and be ingested as an auxiliary table.
padroes_ja_ingeridos = (
    r"IMP_[0-9]{4}\.csv$",
    r"EXP_[0-9]{4}\.csv$",
    r"ncm\.csv$",
)

arquivos_csv = []

# List the entries directly under input_path and keep only the auxiliary CSV files.
for f in dbutils.fs.ls(input_path):
    nome = f.name

    # Skip anything that is not a CSV file.
    if not nome.lower().endswith(".csv"):
        continue

    # Skip files covered by the dedicated IMP / EXP / NCM sections.
    if any(re.search(padrao, nome, flags=re.IGNORECASE) for padrao in padroes_ja_ingeridos):
        continue

    arquivos_csv.append(f.path)

print(f"Arquivos CSV selecionados: {len(arquivos_csv)}")


In [0]:
# Optional per-table list of expected columns, keyed by the lower-cased table name.
expected_cols_map = {}

# Paths that failed to ingest; reported together once the loop finishes.
arquivos_com_erro = []

for file_path in arquivos_csv:
    try:
        nome_arquivo = os.path.basename(file_path)
        # Drop only the trailing extension, whatever its letter case. A plain
        # .replace(".csv", "") misses ".CSV" and also strips ".csv" from the middle of a name.
        nome_delta = os.path.splitext(nome_arquivo)[0].lower()

        print(f"\nLendo arquivo: {nome_arquivo}")

        # (Optional) force the expected columns for this table, when configured.
        expected_cols = expected_cols_map.get(nome_delta)

        # 1) Quote-aware read through the project loader. Per the original notes it
        #    ignores delimiters inside quotes, supports line breaks inside quoted
        #    fields, and returns problematic rows separately in df_corrupt.
        df_ok, df_corrupt, header_cols = read_csv_with_quotes(
            spark=spark,
            input_path=file_path,       # single file per call
            delimiter=";",
            encoding="iso-8859-1",
            recursive=False,            # single file, no recursive lookup needed
            path_glob_filter=None,
            header=True,
            schema=None,
            expected_cols=expected_cols,
            multiline=True,
            quote="\"",
            escape="\"",
            mode="PERMISSIVE",
            corrupt_col="_corrupt_record",
            ignore_leading_trailing_ws=True,
            quarantine_path=f"{output_path_tb_aux}/_quarentena/{nome_delta}",  # per-file quarantine (optional)
            quarantine_mode="overwrite",
        )

        # 2) Add the Bronze technical columns to the good rows.
        df_ok = (
            df_ok
            .withColumn("origin_path_name", input_file_name())
            .withColumn("tipo_operacao", lit(TB_AUX))
            .withColumn("ingestion_dt", current_timestamp())
        )

        # 3) Write the table as Delta, one folder per source file.
        output_path_tb = f"{output_path_tb_aux}/{nome_delta}"

        (
            df_ok
            .write
            .format("delta")
            .mode("overwrite")
            .save(output_path_tb)
        )

        # 4) Simple logs / metrics.
        # NOTE(review): df_ok.count() runs after the write and presumably re-evaluates
        # the read unless the loader caches its result — confirm if this becomes slow.
        corrupt_count = df_corrupt.count() if df_corrupt is not None else 0
        ok_count = df_ok.count()
        print(f"Delta criado: {nome_delta}")
        print(f"Registros OK: {ok_count} | Registros corrompidos: {corrupt_count}")

    except Exception as e:
        # Best effort: one bad file must not stop the remaining ones, but keep track of
        # it so the failure is visible at the end instead of buried in the per-file log.
        arquivos_com_erro.append(file_path)
        print(f"ERRO ao processar {file_path}: {e}")

# Final summary of the files that could not be ingested.
if arquivos_com_erro:
    print(f"Arquivos com erro ({len(arquivos_com_erro)}): {arquivos_com_erro}")
