In [7]:
from typing import List, Any
import pandas as pd
import os, re

In [8]:
# Input folder holding the merged CSV files and output folder for the
# transformed ones (both relative to the notebook's working directory).
MERGED_PATH = "mergedfiles"
TRANSFORMED_PATH = "transformedfiles"

# exist_ok=True makes this cell safe to re-run and avoids the
# check-then-create race of `if not exists: mkdir`.
os.makedirs(TRANSFORMED_PATH, exist_ok=True)

In [9]:
def get_files_list(files_dir: str) -> List[str]:
    """Return the names of all entries directly inside ``files_dir``, sorted ascending.

    Args:
        files_dir: Path of the directory to list.

    Returns:
        The entry names (not full paths) as reported by ``os.listdir``, sorted.
    """
    entries = os.listdir(files_dir)
    entries.sort()
    return entries

def sub_char(value: Any) -> Any:
    """Remove apostrophes (') and colons (:) from text.

    Accepts either a scalar or a pandas Series, so it can be used both
    element-wise and through ``DataFrame.apply`` (which passes whole columns).

    Args:
        value: A string, a pandas Series, or any other object.

    Returns:
        * for a string: the string with every ``'`` and ``:`` removed;
        * for a text (object / string dtype) Series: a Series in which every
          string element has been cleaned and every other element is kept;
        * for anything else (numbers, None, NaN, non-text Series): ``value``
          unchanged.
    """
    if isinstance(value, pd.Series):
        # re.sub() raises TypeError when handed a Series, which the fallback
        # below would swallow and return the column untouched. Clean text
        # columns element by element instead.
        is_text = pd.api.types.is_object_dtype(value.dtype) or isinstance(
            value.dtype, pd.StringDtype
        )
        if not is_text:
            return value
        return value.map(sub_char)

    try:
        return re.sub("(')|(:)", "", value)
    except TypeError:
        # Not a string (number, None, NaN, ...): pass it through untouched.
        return value

In [10]:
# List the merged inputs and print each name next to its position in `files`.
files = get_files_list(MERGED_PATH)
for position, filename in enumerate(files):
    print(position, filename)

0 empresas.csv
1 estabelecimentos.csv
2 id_cnae_fiscal_principal.csv
3 id_motivo_situacao_cadastral.csv
4 id_natureza_juridica.csv
5 id_qualificacoes.csv
6 municipios.csv
7 paises.csv
8 simples.csv
9 socios.csv


# Empresas

In [11]:
# Name the input explicitly instead of taking files[0]: any extra entry in
# MERGED_PATH (e.g. .ipynb_checkpoints, .DS_Store) shifts the sorted listing
# and would silently point this section at a different file.
empresas_filename = "empresas.csv"
empresas_path = os.path.join(MERGED_PATH, empresas_filename)

In [12]:
# Transform the merged companies file chunk by chunk (100k rows at a time)
# and write the result to TRANSFORMED_PATH under the same file name.
header = True
save_path_empresas = os.path.join(TRANSFORMED_PATH, empresas_filename)

# cnpj_basico is read as text. The `with` block closes the underlying file
# even if a chunk fails half-way through.
with pd.read_csv(empresas_path, chunksize=100_000, dtype={"cnpj_basico": "str"}) as empresas:
    for chunk in empresas:
        # Keep only rows whose capital_social does not contain the column
        # name itself (presumably header lines repeated by the merge step).
        # NOTE(review): rows with a missing capital_social also fail this
        # comparison and are dropped - confirm that is intended.
        chunk = chunk.loc[chunk["capital_social"].str.find("capital_social") == -1]
        chunk = chunk.fillna(value={"natureza_juridica": 0,
                                    "qualificacoes": 0,
                                    "porte_empresa": 0})

        # Decimal comma -> decimal point, then convert to float.
        chunk["capital_social"] = (
            chunk["capital_social"].str.replace(",", ".", regex=False).astype(float)
        )

        chunk = chunk.apply(sub_char)

        # The first chunk truncates whatever a previous run left behind and
        # writes the header; later chunks append. Appending unconditionally
        # would duplicate rows (and the header) every time the cell is re-run.
        chunk.to_csv(save_path_empresas, header=header, index=False,
                     mode="w" if header else "a")
        header = False

# Estabelecimentos

In [13]:
# Name the input explicitly instead of taking files[1]: any extra entry in
# MERGED_PATH (e.g. .ipynb_checkpoints, .DS_Store) shifts the sorted listing
# and would silently point this section at a different file.
estabelecimentos_filename = "estabelecimentos.csv"
estabelecimentos_path = os.path.join(MERGED_PATH, estabelecimentos_filename)

In [14]:
import warnings # disable warnings od pandas
warnings.filterwarnings("ignore")

In [15]:
# Transform the merged establishments file chunk by chunk (100k rows at a
# time) and write the result to TRANSFORMED_PATH under the same file name.

# Columns kept as raw text instead of being parsed as numbers.
estabelecimentos_dtypes = dict.fromkeys(
    [
        "cnpj_basico",
        "cnpj_ordem",
        "cnpj_dv",
        "data_situacao_cadastral",
        "data_inicio_atividade",
        "cep",
        "ddd1",
        "telefone1",
        "ddd2",
        "telefone2",
        "ddd_fax",
        "situacao_especial",
        "data_situacao_especial",
    ],
    "str",
)

header = True
save_path_estabelecimentos = os.path.join(TRANSFORMED_PATH, estabelecimentos_filename)

# The `with` block closes the underlying file even if a chunk fails.
with pd.read_csv(estabelecimentos_path, chunksize=100_000,
                 dtype=estabelecimentos_dtypes) as estabelecimentos:
    for chunk in estabelecimentos:
        # Drop rows whose cnpj_basico equals the column name (presumably
        # header lines repeated by the merge step).
        chunk = chunk.loc[chunk["cnpj_basico"] != "cnpj_basico"]
        chunk = chunk.fillna(value={"identificador": 0,
                                    "situacao_cadastral": 1,
                                    "cod_pais": 999,
                                    "cod_municipio": 0,
                                    "cnae_fiscal_principal": 0,
                                    "motivo_situacao_cadastral": 0})

        # Missing values were filled above, so the column can become integer.
        chunk["cod_pais"] = chunk["cod_pais"].astype(int)

        chunk = chunk.apply(sub_char)

        # The first chunk truncates whatever a previous run left behind and
        # writes the header; later chunks append. Appending unconditionally
        # would duplicate rows (and the header) every time the cell is re-run.
        chunk.to_csv(save_path_estabelecimentos, header=header, index=False,
                     mode="w" if header else "a")
        header = False

# CNAE

In [16]:
# Name the input explicitly instead of taking files[2]: any extra entry in
# MERGED_PATH (e.g. .ipynb_checkpoints, .DS_Store) shifts the sorted listing
# and would silently point this section at a different file.
cnae_filename = "id_cnae_fiscal_principal.csv"
cnae_path = os.path.join(MERGED_PATH, cnae_filename)

In [17]:
# CNAE lookup table: loaded whole (no chunking), every column passed through
# sub_char, then written to TRANSFORMED_PATH under the same file name.
save_path_cnae = os.path.join(TRANSFORMED_PATH, cnae_filename)

cnae = pd.read_csv(cnae_path).apply(sub_char)
cnae.to_csv(save_path_cnae, index=False, header=True)

# Motivo

In [18]:
# Name the input explicitly instead of taking files[3]: any extra entry in
# MERGED_PATH (e.g. .ipynb_checkpoints, .DS_Store) shifts the sorted listing
# and would silently point this section at a different file.
motivo_filename = "id_motivo_situacao_cadastral.csv"
motivo_path = os.path.join(MERGED_PATH, motivo_filename)

In [19]:
# Registration-status-reason lookup table: loaded whole, every column passed
# through sub_char, then written to TRANSFORMED_PATH under the same file name.
save_path_motivo = os.path.join(TRANSFORMED_PATH, motivo_filename)

motivo = pd.read_csv(motivo_path).apply(sub_char)
motivo.to_csv(save_path_motivo, index=False, header=True)

# Natureza Jurídica

In [20]:
# Name the input explicitly instead of taking files[4]: any extra entry in
# MERGED_PATH (e.g. .ipynb_checkpoints, .DS_Store) shifts the sorted listing
# and would silently point this section at a different file.
natjur_filename = "id_natureza_juridica.csv"
natjur_path = os.path.join(MERGED_PATH, natjur_filename)

In [21]:
# Legal-nature lookup table: loaded whole, every column passed through
# sub_char, then written to TRANSFORMED_PATH under the same file name.
save_path_natjur = os.path.join(TRANSFORMED_PATH, natjur_filename)

natjur = pd.read_csv(natjur_path).apply(sub_char)
natjur.to_csv(save_path_natjur, index=False, header=True)

# Qualificações

In [22]:
# Name the input explicitly instead of taking files[5]: any extra entry in
# MERGED_PATH (e.g. .ipynb_checkpoints, .DS_Store) shifts the sorted listing
# and would silently point this section at a different file.
qualificacoes_filename = "id_qualificacoes.csv"
qualificacoes_path = os.path.join(MERGED_PATH, qualificacoes_filename)

In [23]:
# Qualifications lookup table: loaded whole, every column passed through
# sub_char, then written to TRANSFORMED_PATH under the same file name.
save_path_qualificacoes = os.path.join(TRANSFORMED_PATH, qualificacoes_filename)

qualificacoes = pd.read_csv(qualificacoes_path).apply(sub_char)
qualificacoes.to_csv(save_path_qualificacoes, index=False, header=True)

# Municípios

In [24]:
# Name the input explicitly instead of taking files[6]: any extra entry in
# MERGED_PATH (e.g. .ipynb_checkpoints, .DS_Store) shifts the sorted listing
# and would silently point this section at a different file.
municipios_filename = "municipios.csv"
municipios_path = os.path.join(MERGED_PATH, municipios_filename)

In [25]:
# Municipalities lookup table: loaded whole, every column passed through
# sub_char, then written to TRANSFORMED_PATH under the same file name.
save_path_municipios = os.path.join(TRANSFORMED_PATH, municipios_filename)

municipios = pd.read_csv(municipios_path).apply(sub_char)
municipios.to_csv(save_path_municipios, index=False, header=True)

# Países

In [26]:
# Name the input explicitly instead of taking files[7]: any extra entry in
# MERGED_PATH (e.g. .ipynb_checkpoints, .DS_Store) shifts the sorted listing
# and would silently point this section at a different file.
paises_filename = "paises.csv"
paises_path = os.path.join(MERGED_PATH, paises_filename)

In [27]:
# Countries lookup table: loaded whole, every column passed through
# sub_char, then written to TRANSFORMED_PATH under the same file name.
save_path_paises = os.path.join(TRANSFORMED_PATH, paises_filename)

paises = pd.read_csv(paises_path).apply(sub_char)
paises.to_csv(save_path_paises, index=False, header=True)

# Simples

In [28]:
# Name the input explicitly instead of taking files[8]: any extra entry in
# MERGED_PATH (e.g. .ipynb_checkpoints, .DS_Store) shifts the sorted listing
# and would silently point this section at a different file.
simples_filename = "simples.csv"
simples_path = os.path.join(MERGED_PATH, simples_filename)

In [29]:
# Transform the merged "simples" file chunk by chunk (100k rows at a time)
# and write the result to TRANSFORMED_PATH under the same file name.
header = True
save_path_simples = os.path.join(TRANSFORMED_PATH, simples_filename)

# cnpj_basico is read as text, the same way the empresas and estabelecimentos
# sections read this key; without it pandas may parse the column as a number.
# The `with` block closes the underlying file even if a chunk fails.
with pd.read_csv(simples_path, chunksize=100_000,
                 dtype={"cnpj_basico": "str"}) as simples:
    for chunk in simples:
        # Drop rows whose cnpj_basico equals the column name (presumably
        # header lines repeated by the merge step).
        chunk = chunk.loc[chunk["cnpj_basico"] != "cnpj_basico"]
        chunk = chunk.fillna(value={"opcao_simples": 2,
                                    "opcao_mei": 2})

        # Encode the flags as integers: "S" -> 1, "N" -> 0; missing values
        # were filled with 2 above.
        for flag_column in ("opcao_simples", "opcao_mei"):
            chunk[flag_column] = (
                chunk[flag_column].replace(["S", "N"], [1, 0]).astype(int)
            )

        chunk = chunk.apply(sub_char)

        # The first chunk truncates whatever a previous run left behind and
        # writes the header; later chunks append. Appending unconditionally
        # would duplicate rows (and the header) every time the cell is re-run.
        chunk.to_csv(save_path_simples, header=header, index=False,
                     mode="w" if header else "a")
        header = False

# Sócios

In [30]:
# Name the input explicitly instead of taking files[9]: any extra entry in
# MERGED_PATH (e.g. .ipynb_checkpoints, .DS_Store) shifts the sorted listing
# and would silently point this section at a different file.
socios_filename = "socios.csv"
socios_path = os.path.join(MERGED_PATH, socios_filename)

In [31]:
# Transform the merged partners file chunk by chunk (100k rows at a time)
# and write the result to TRANSFORMED_PATH under the same file name.
header = True
save_path_socios = os.path.join(TRANSFORMED_PATH, socios_filename)

# cnpj_basico is read as text, the same way the empresas and estabelecimentos
# sections read this key; without it pandas may parse the column as a number.
# The `with` block closes the underlying file even if a chunk fails.
with pd.read_csv(socios_path, chunksize=100_000,
                 dtype={"cnpj_basico": "str"}) as socios:
    for chunk in socios:
        # Drop rows whose cnpj_basico equals the column name (presumably
        # header lines repeated by the merge step).
        chunk = chunk.loc[chunk["cnpj_basico"] != "cnpj_basico"]
        chunk = chunk.fillna(value={"identificador_socio": 0,
                                    "cod_pais": 999,
                                    "qualificacoes": 0})

        # Missing values were filled above, so the column can become integer.
        chunk["cod_pais"] = chunk["cod_pais"].astype(int)

        chunk = chunk.apply(sub_char)

        # The first chunk truncates whatever a previous run left behind and
        # writes the header; later chunks append. Appending unconditionally
        # would duplicate rows (and the header) every time the cell is re-run.
        chunk.to_csv(save_path_socios, header=header, index=False,
                     mode="w" if header else "a")
        header = False

In [32]:
# Clean-up: delete every merged input listed in `files`, then the folder
# itself (os.rmdir only succeeds when the folder is empty).
for merged_name in files:
    merged_file = os.path.join(MERGED_PATH, merged_name)
    os.remove(merged_file)
os.rmdir(MERGED_PATH)