# Import libraries

In [99]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import zipfile
import re
import io

# Functions

In [100]:
def get_response(url, timeout=60):
    """Fetch ``url`` with an HTTP GET and return the response.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server to connect or to send data
            before giving up. Without a timeout ``requests`` waits forever,
            so a stalled server would hang the notebook.

    Returns:
        The ``requests.Response`` of a successful request.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error statuses instead of handing an error page downstream.
    response.raise_for_status()

    return response

In [101]:
def get_compiled_pattern(response, pattern):
    """Compile ``pattern`` into a regular-expression object.

    Args:
        response: Unused. Kept only so existing calls keep working; the
            function never returned anything derived from it.
        pattern: Regular-expression source string.

    Returns:
        The compiled ``re.Pattern`` for ``pattern``.
    """
    # The response used to be parsed with BeautifulSoup here and the result
    # thrown away; that was wasted work, so only the compilation remains.
    return re.compile(pattern)

In [102]:
def get_last_folder(url):
    """Return the name of the most recent ``YYYY-MM`` folder listed at ``url``.

    Args:
        url: Address of an HTML directory listing whose links include
            folders named ``YYYY-MM/``.

    Returns:
        The greatest folder name, without the trailing slash
        (for example ``"2025-08"``).

    Raises:
        IndexError: If the listing contains no link matching ``YYYY-MM/``.
    """
    response = get_response(url)

    # Parse the listing in this scope: no ``soup`` object was ever defined
    # here, so the link search below raised NameError on a fresh kernel.
    soup = BeautifulSoup(response.text, "html.parser")
    pattern = re.compile(r"^\d{4}-\d{2}/$")

    folders = [a["href"].strip("/") for a in soup.find_all("a", href=pattern)]
    if not folders:
        raise IndexError(f"Nenhuma pasta no formato AAAA-MM encontrada em {url}")

    # Zero-padded YYYY-MM names order chronologically as plain strings.
    last_folder = max(folders)

    print("Última pasta encontrada:", last_folder)

    return last_folder

In [103]:
def parse_float_br(x):
    """Convert a Brazilian-formatted number such as ``"1.234,56"`` to ``float``.

    Args:
        x: A string using "." as thousands separator and "," as decimal
            separator, an already-numeric value, or a missing value.

    Returns:
        The parsed ``float``, or ``None`` when ``x`` is missing
        (``None``, NaN, ``pd.NA``) or a blank string.

    Raises:
        ValueError: If a non-blank string is not a valid number once the
            separators are normalized.
    """
    if pd.isna(x):
        return None
    # Already-numeric values have no separators to normalize.
    if not isinstance(x, str):
        return float(x)
    text = x.strip()
    # A blank field carries no value; treat it like a missing one.
    if not text:
        return None
    # Drop thousands separators first, then turn the decimal comma into a dot.
    return float(text.replace(".", "").replace(",", "."))

In [104]:
def get_source_data(
    endpoint,
    key,
    file_columns,
    file_dtypes
):
    """Download the lowest-numbered ``<key>N.zip`` of the latest folder as a DataFrame.

    Args:
        endpoint: Base URL of the directory listing; must end with "/".
        key: File-name prefix placed in the pattern ``^<key>\\d+\\.zip$``
            (for example ``"Empresas"``).
        file_columns: Names given to the first ``len(file_columns)`` columns
            of the CSV, in positional order.
        file_dtypes: Mapping of column name to pandas dtype.

    Returns:
        A ``pandas.DataFrame`` with the selected columns of the first file
        stored inside the downloaded ZIP archive.

    Raises:
        IndexError: If no file matching ``key`` is listed in the folder.
    """
    last_updated_folder = get_last_folder(endpoint)
    folder_url = endpoint + last_updated_folder + "/"

    response_folder = get_response(folder_url)

    # Parse the listing in this scope: no ``soup_folder`` object was ever
    # defined here, so the link search below raised NameError on a fresh kernel.
    soup_folder = BeautifulSoup(response_folder.text, "html.parser")
    key_pattern = re.compile(r"^{key}\d+\.zip$".format(key=key))

    files = [a["href"] for a in soup_folder.find_all("a", href=key_pattern)]
    if not files:
        raise IndexError(
            f"Nenhum arquivo '{key}N.zip' encontrado em {folder_url}"
        )

    # Order by the number embedded in the name so that "10" sorts after "2".
    first_file = min(files, key=lambda name: int(re.search(r"(\d+)", name).group()))
    print("Primeiro arquivo encontrado:", first_file)

    zip_response = get_response(folder_url + first_file)

    # Context managers close the archive and its member once reading finishes.
    with zipfile.ZipFile(io.BytesIO(zip_response.content)) as archive:
        csv_name = archive.namelist()[0]  # take the first file inside the zip
        print("Arquivo dentro do ZIP:", csv_name)

        with archive.open(csv_name) as csv_file:
            df = pd.read_csv(
                csv_file,
                sep=";",
                encoding="latin1",
                usecols=list(range(len(file_columns))),
                names=file_columns,
                dtype=file_dtypes,
                header=None,
                low_memory=False
            )

    print("Total de linhas:", len(df))

    return df

# Read Data

In [105]:
# Base URL of the directory listing; get_last_folder() looks for YYYY-MM/
# sub-folders under it and get_source_data() downloads the ZIP files from there.
endpoint = "https://arquivos.receitafederal.gov.br/dados/cnpj/dados_abertos_cnpj/"

## Empresas

In [106]:
# Column names of the companies file, in positional order.
company_columns = [
    "cnpj",
    "razao_social",
    "natureza_juridica",
    "qualificacao_responsavel",
    "capital_social",
    "cod_porte",
]

# Positional indices 0..len-1, one per entry of company_columns.
# NOTE(review): looks like a scratch value; nothing else in the visible
# notebook reads `test`. Confirm before removing.
test = [*range(len(company_columns))]

In [107]:
def get_empresas(endpoint):
    """Load the "Empresas" (companies) file found under ``endpoint``.

    Args:
        endpoint: Base URL passed on to ``get_source_data``.

    Returns:
        The DataFrame produced by ``get_source_data`` with six columns:
        cnpj, razao_social, natureza_juridica, qualificacao_responsavel,
        capital_social and cod_porte.
    """
    # The insertion order of this mapping is also the positional column order.
    company_dtypes = {
        "cnpj": "string",
        "razao_social": "string",
        "natureza_juridica": "Int64",
        "qualificacao_responsavel": "Int64",
        "capital_social": "string",
        "cod_porte": "string",
    }
    company_columns = list(company_dtypes)

    return get_source_data(endpoint, "Empresas", company_columns, company_dtypes)


df_empresas = get_empresas(endpoint)


Última pasta encontrada: 2025-08
Primeiro arquivo encontrado: Empresas0.zip
Arquivo dentro do ZIP: K3241.K03200Y0.D50809.EMPRECSV
Total de linhas: 23537416


## Sócios

In [108]:
def get_socios(endpoint):
    """Load the "Socios" (partners) file found under ``endpoint``.

    Args:
        endpoint: Base URL passed on to ``get_source_data``.

    Returns:
        The DataFrame produced by ``get_source_data`` with five columns:
        cnpj, tipo_socio, nome_socio, documento_socio and
        codigo_qualificacao_socio.
    """
    # The insertion order of this mapping is also the positional column order.
    partners_dtypes = {
        "cnpj": "string",
        "tipo_socio": "Int64",
        "nome_socio": "string",
        "documento_socio": "string",
        "codigo_qualificacao_socio": "string",
    }
    partners_columns = list(partners_dtypes)

    return get_source_data(endpoint, "Socios", partners_columns, partners_dtypes)


df_socios = get_socios(endpoint)

Última pasta encontrada: 2025-08
Primeiro arquivo encontrado: Socios0.zip
Arquivo dentro do ZIP: K3241.K03200Y0.D50809.SOCIOCSV
Total de linhas: 8024952


# Análise Exploratória

## Empresas

In [109]:
# Print the structure of the companies frame: row count, column dtypes and memory usage.
df_empresas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23537416 entries, 0 to 23537415
Data columns (total 6 columns):
 #   Column                    Dtype 
---  ------                    ----- 
 0   cnpj                      string
 1   razao_social              string
 2   natureza_juridica         Int64 
 3   qualificacao_responsavel  Int64 
 4   capital_social            string
 5   cod_porte                 string
dtypes: Int64(2), string(4)
memory usage: 1.1 GB


In [110]:
# Descriptive statistics; only the two Int64 columns are summarized, the string columns are left out.
df_empresas.describe()

Unnamed: 0,natureza_juridica,qualificacao_responsavel
count,23537416.0,23537416.0
mean,2202.87361,48.607411
std,391.386809,6.621026
min,1015.0,0.0
25%,2135.0,49.0
50%,2135.0,50.0
75%,2135.0,50.0
max,8885.0,65.0


### Checando CNPJs duplicados

In [116]:
# Count every row whose CNPJ appears more than once. The check is restricted
# to the "cnpj" column because comparing whole rows would miss a CNPJ that is
# repeated with different values in the other columns.
duplicates = df_empresas.duplicated(subset="cnpj", keep=False).sum()

if duplicates > 0:
    print(f"Existem {duplicates} registros duplicados")
else:
    print("Não existem CNPJs duplicados")

Não existem CNPJs duplicados


### Checando % de Nulos

In [112]:
# Percentage of missing values per column. The mean of the boolean mask is the
# null count divided by this frame's own row count, so the result no longer
# depends on a `df` variable that is never defined at notebook level.
null_percentage = df_empresas.isna().mean() * 100
print(null_percentage.round(2))

cnpj                        0.00
razao_social                0.00
natureza_juridica           0.00
qualificacao_responsavel    0.00
capital_social              0.00
cod_porte                   0.02
dtype: float64


## Sócios

In [113]:
# Print the structure of the partners frame: row count, column dtypes and memory usage.
df_socios.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8024952 entries, 0 to 8024951
Data columns (total 5 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   cnpj                       string
 1   tipo_socio                 Int64 
 2   nome_socio                 string
 3   documento_socio            string
 4   codigo_qualificacao_socio  string
dtypes: Int64(1), string(4)
memory usage: 313.8 MB


In [114]:
# Descriptive statistics; only the Int64 column tipo_socio is summarized, the string columns are left out.
df_socios.describe()

Unnamed: 0,tipo_socio
count,8024952.0
mean,1.975939
std,0.156446
min,1.0
25%,2.0
50%,2.0
75%,2.0
max,3.0


### Tipos de Sócios

In [115]:
# Number of rows per tipo_socio code, most frequent first.
df_socios["tipo_socio"].value_counts()

tipo_socio
2    7823892
1     197073
3       3987
Name: count, dtype: Int64

### Checando registros de Sócios duplicados

In [117]:
# Count fully identical partner rows; keep=False counts every member of a
# duplicated group, not just the repeats after the first one.
duplicates = df_socios.duplicated(keep=False).sum()

if duplicates > 0:
    print(f"Existem {duplicates} registros duplicados")
else:
    # This check is on whole partner records, so the message says so instead
    # of mentioning CNPJs.
    print("Não existem registros de sócios duplicados")

Existem 20 registros duplicados


### Checando % de Nulos

In [119]:
# Percentage of missing values per column. The mean of the boolean mask is the
# null count divided by this frame's own row count, so the result no longer
# depends on a `df` variable that is never defined at notebook level.
null_percentage = df_socios.isna().mean() * 100
print(null_percentage.round(2))

cnpj                         0.00
tipo_socio                   0.00
nome_socio                   0.01
documento_socio              0.09
codigo_qualificacao_socio    0.00
dtype: float64
