In [None]:
import os
import re
import io
import csv
import pandas as pd
from pathlib import Path
from unidecode import unidecode

# Base directory used to build data-file paths: the current working directory of the running process.
path = os.getcwd()

In [None]:
def infer_headers(path, sep=";", max_rows=30, min_hits=5, skiprows=0):
    r"""Find the first row of a delimited text file that looks like numeric data.

    A cell counts as a hit when it contains digits separated by at least two
    dots (regex ``\d+\.\d+\.\d+``, e.g. ``1.234.567``). The first row holding
    at least ``min_hits`` such cells is taken as the first data row, so its
    zero-based position equals the number of rows that precede it.

    Parameters
    ----------
    path : str or path-like
        File to inspect; it is opened as UTF-8 with an optional BOM.
    sep : str, default ";"
        Field delimiter handed to ``csv.reader``.
    max_rows : int, default 30
        Maximum number of rows examined after the skipped ones.
    min_hits : int, default 5
        Minimum number of matching cells a row needs to qualify.
    skiprows : int, default 0
        Number of leading rows to discard before scanning.

    Returns
    -------
    int
        ``skiprows`` plus the index, within the scanned rows, of the first
        qualifying row.

    Raises
    ------
    ValueError
        If no scanned row reaches ``min_hits`` (this includes a file with
        fewer than ``skiprows`` rows).
    """
    match_number = re.compile(r"\d+\.\d+\.\d+")

    # newline="" lets the csv module do its own newline handling, as its docs require.
    with open(path, encoding="utf-8-sig", newline="") as f:
        reader = csv.reader(f, delimiter=sep)

        # Discard the leading rows; a too-short file simply leaves nothing to scan
        # instead of leaking StopIteration to the caller.
        for _ in range(skiprows):
            if next(reader, None) is None:
                break

        # Return on the first qualifying row rather than scoring the whole window.
        for i, row in zip(range(max_rows), reader):
            hits = sum(1 for cell in row if match_number.search(cell))
            if hits >= min_hits:
                return skiprows + i

    raise ValueError(
        f"no row with at least min_hits={min_hits} matching cells within "
        f"max_rows={max_rows} rows after skiprows={skiprows} in {path!r}"
    )

In [None]:
# Helper that cleans up column names
def clean_names(df, i, fill=None):
    """Return normalised labels for level ``i`` of ``df.columns``.

    Each label is stripped, lower-cased, has spaces turned into underscores,
    loses every character that is neither a word character nor whitespace, is
    passed through ``unidecode``, and is blanked out when the result starts
    with ``unnamed``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are read.
    i : int or str
        Column level passed to ``df.columns.get_level_values``.
    fill : {None, "simple", "inside"}, default None
        How blank labels are handled afterwards:
        ``None`` leaves them blank; ``"simple"`` forward-fills them from the
        preceding non-blank label; ``"inside"`` forward-fills with
        ``limit_area="inside"``. Labels still missing after filling are
        returned as empty strings.

    Returns
    -------
    pandas.Series
        The cleaned labels, in column order.

    Raises
    ------
    ValueError
        If ``fill`` is not one of the accepted values.
    """
    labels = pd.Series(df.columns.get_level_values(i), dtype="string")
    labels = labels.str.strip().str.lower()
    labels = labels.str.replace(" ", "_")                      # spaces -> underscores
    labels = labels.str.replace(r"[^\w\s]", "", regex=True)    # drop punctuation such as / ( ) [ ]
    labels = labels.map(unidecode)                             # transliterate via unidecode
    labels = labels.replace(r"^unnamed.*", "", regex=True)     # blank out labels starting with "unnamed"

    if fill is None:
        return labels

    if fill == "simple":
        ffill_kwargs = {}
    elif fill == "inside":
        ffill_kwargs = {"limit_area": "inside"}
    else:
        raise ValueError(f"fill must be None, 'simple', or 'inside' (got {fill!r})")

    # Blanks become NA so ffill can see them, then unfilled gaps go back to "".
    return labels.replace("", pd.NA).ffill(**ffill_kwargs).fillna("")

# Function that reads the report files
def read_ifdata(path, sep=";", skiprows=None, num_headers=1):
    r"""Read a delimited text file into a DataFrame with flat, cleaned column names.

    The file is read as UTF-8 (optional BOM), the trailing ``;`` of every line
    is removed, and the text is parsed with ``pandas.read_csv`` using the first
    ``num_headers`` rows as header rows. Column labels are normalised with
    ``clean_names``.

    With more than one header row the levels are collapsed into one string per
    column: level 0 is cleaned with ``fill="simple"``, level 1 with
    ``fill="inside"``, deeper levels without filling; empty parts are dropped,
    a part equal to the one just kept is dropped, and the rest are joined
    with ``_``.

    Parameters
    ----------
    path : str or path-like
        File to read.
    sep : str, default ";"
        Delimiter passed to ``pandas.read_csv``. Note that the trailing
        character stripped from each line is always a literal ``;``.
    skiprows : optional
        Forwarded unchanged to ``pandas.read_csv``.
    num_headers : int, default 1
        Number of leading rows that form the header.

    Returns
    -------
    pandas.DataFrame
        The parsed data with a single level of cleaned column names.
    """
    txt = Path(path).read_text(encoding="utf-8-sig")
    # Strip the trailing ';' from every line. `\Z` also covers a final line that
    # has no terminating newline, which would otherwise keep one extra field.
    txt = re.sub(r";(?=\r?\n|\Z)", "", txt)
    headers = list(range(num_headers))

    df = pd.read_csv(io.StringIO(txt), sep=sep, skiprows=skiprows, header=headers)

    if num_headers <= 1:
        df.columns = clean_names(df, 0)
        return df

    # Per-level fill mode; levels not listed here are cleaned without filling.
    fill_by_level = {0: "simple", 1: "inside"}
    levels = [clean_names(df, i, fill=fill_by_level.get(i)) for i in headers]

    flat = []
    for parts in zip(*levels):  # one tuple per column: (level0, level1, ...)
        kept = []
        for part in parts:
            # Skip blanks, and skip a part that repeats the one just kept.
            if part and (not kept or kept[-1] != part):
                kept.append(part)
        flat.append("_".join(kept))

    df.columns = flat
    return df

In [None]:
# Sample report file, located relative to the working directory stored in `path`.
path_teste = f"{path}/dfs_by_report/passivo/inst_individuais/032000_Instituições_Individuais_Passivo.csv"

# Position of the first data-looking row, used as the number of header rows.
n_headers = infer_headers(path_teste, sep=";")

# Parse the file with that many header rows collapsed into flat column names.
teste = read_ifdata(path_teste, sep=";", skiprows=None, num_headers=n_headers)