# Import libraries

In [7]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import zipfile
import re
import io

# Functions

In [1]:
def get_response(url, timeout=60):
    """Fetch ``url`` with an HTTP GET and return the successful response.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Defaults to 60 seconds.

    Returns
    -------
    requests.Response
        The response object, only returned when the status check passes.

    Raises
    ------
    requests.HTTPError
        Raised by ``raise_for_status()`` for an error status code.
    requests.Timeout
        Raised when the server does not answer within ``timeout``.
    """
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # server, which would freeze the notebook cell.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    return response

In [2]:
def get_compiled_pattern(response, pattern):
    """Parse the response body as HTML and compile ``pattern``.

    Parameters
    ----------
    response
        Object exposing the page markup through its ``text`` attribute.
    pattern : str
        Regular expression source to compile.

    Returns
    -------
    tuple
        ``(compiled_pattern, soup)`` -- the compiled regex first, then the
        ``BeautifulSoup`` tree built with the ``html.parser`` backend.
    """
    parsed_html = BeautifulSoup(response.text, "html.parser")
    compiled = re.compile(pattern)
    return compiled, parsed_html

In [None]:
def get_last_folder(url):
    """Return the name of the latest ``YYYY-MM`` folder linked from ``url``.

    The page at ``url`` is downloaded and every ``<a>`` whose ``href`` has the
    exact form ``YYYY-MM/`` is collected; the greatest name is returned with
    the surrounding slashes stripped. The chosen folder is also printed.

    Parameters
    ----------
    url : str
        Address of the HTML listing page.

    Returns
    -------
    str
        Folder name such as ``"2025-09"``.

    Raises
    ------
    IndexError
        If the page contains no link matching ``YYYY-MM/``.
    """
    response = get_response(url)

    pattern, soup = get_compiled_pattern(response, r"^\d{4}-\d{2}/$")

    folders = [a["href"].strip("/") for a in soup.find_all("a", href=pattern)]
    if not folders:
        # Same exception type the bare [-1] lookup used to raise, but with a
        # message that says what is actually missing.
        raise IndexError(f"Nenhuma pasta no formato AAAA-MM/ encontrada em {url}")

    # The regex guarantees fixed-width, zero-padded names, so plain string
    # ordering is chronological ordering.
    last_folder = max(folders)

    print("Última pasta encontrada:", last_folder)

    return last_folder

In [None]:
def get_source_data(
    endpoint,
    key,
    file_columns,
    file_dtypes,
    file_index=1
):
    """Download one ``<key><N>.zip`` archive from the latest folder and load it.

    Steps: find the latest ``YYYY-MM`` folder under ``endpoint``, list the
    links named ``<key><digits>.zip`` in it, order them by their numeric
    suffix, download the one at ``file_index``, and read the first member of
    the archive as a ``;``-separated, latin1-encoded, header-less CSV.

    Parameters
    ----------
    endpoint : str
        Base URL of the listing; expected to end with ``/`` because folder
        and file names are appended by plain concatenation.
    key : str
        File-name prefix, e.g. ``"Empresas"``. Matched literally.
    file_columns : list of str
        Names for the leading ``len(file_columns)`` columns of the CSV; any
        further columns in the file are not read.
    file_dtypes : dict
        Column name -> dtype mapping passed to ``pandas.read_csv``.
    file_index : int, optional
        Position of the archive to download within the numerically ordered
        list. Defaults to 1 (the second archive), the value that used to be
        hard-coded.
        NOTE(review): the original log line called this pick the "first
        file"; confirm whether index 0 was intended.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    IndexError
        If ``file_index`` is outside the list of matching archives.
    """
    # get_last_folder already prints the folder it picked, so it is not
    # printed a second time here.
    last_updated_folder = get_last_folder(endpoint)
    folder_url = endpoint + last_updated_folder + "/"

    response_folder = get_response(folder_url)

    # re.escape keeps any regex metacharacter in ``key`` literal.
    key_pattern, soup_folder = get_compiled_pattern(
        response_folder,
        r"^{key}\d+\.zip$".format(key=re.escape(key))
    )

    files = [a["href"] for a in soup_folder.find_all("a", href=key_pattern)]

    # Order by the number right before ".zip" so that 10 sorts after 9 and
    # digits inside ``key`` itself cannot be picked up by mistake.
    ordered_files = sorted(
        files,
        key=lambda name: int(re.search(r"(\d+)\.zip$", name).group(1))
    )

    if not -len(ordered_files) <= file_index < len(ordered_files):
        raise IndexError(
            f"Posição {file_index} inválida: {len(ordered_files)} arquivo(s) "
            f"'{key}N.zip' encontrado(s) em {folder_url}"
        )

    selected_file = ordered_files[file_index]
    # Neutral label: the selected archive is not necessarily the first one.
    print("Arquivo selecionado:", selected_file)

    archive_response = get_response(folder_url + selected_file)

    # Context managers make sure the archive and the member stream are closed
    # even if parsing fails.
    with zipfile.ZipFile(io.BytesIO(archive_response.content)) as archive:
        csv_name = archive.namelist()[0]
        print("Arquivo dentro do ZIP:", csv_name)

        with archive.open(csv_name) as csv_file:
            df = pd.read_csv(
                csv_file,
                sep=";",
                encoding="latin1",
                usecols=list(range(len(file_columns))),
                names=file_columns,
                dtype=file_dtypes,
                header=None,
                low_memory=False
            )

    print("Total de linhas:", len(df))

    return df

# Read Data

In [4]:
# Base URL handed to the loaders below; get_last_folder() scans this page for
# "YYYY-MM/" sub-folder links. Must end with "/" because paths are built by
# string concatenation.
endpoint = "https://arquivos.receitafederal.gov.br/dados/cnpj/dados_abertos_cnpj/"

## Empresas

In [7]:
def get_empresas(endpoint):
    """Load the "Empresas" file from ``endpoint`` through ``get_source_data``.

    Parameters
    ----------
    endpoint : str
        Base URL forwarded unchanged to ``get_source_data``.

    Returns
    -------
    pandas.DataFrame
        Frame with the six columns declared in the schema below.
    """
    # Single ordered mapping: the keys give the column names (in file order)
    # and the values give the dtype requested for each one.
    schema = {
        "cnpj": "string",
        "razao_social": "string",
        "natureza_juridica": "Int64",
        "qualificacao_responsavel": "Int64",
        "capital_social": "string",
        "cod_porte": "string",
    }

    return get_source_data(endpoint, "Empresas", list(schema), schema)


# Download and parse the Empresas file (network access; progress is printed).
df_empresas = get_empresas(endpoint)


Última pasta encontrada: 2025-09
2025-09
Primeiro arquivo encontrado: Empresas1.zip
Arquivo dentro do ZIP: K3241.K03200Y1.D50913.EMPRECSV
Total de linhas: 4494860


## Sócios

In [15]:
def get_socios(endpoint):
    """Load the "Socios" file from ``endpoint`` through ``get_source_data``.

    Parameters
    ----------
    endpoint : str
        Base URL forwarded unchanged to ``get_source_data``.

    Returns
    -------
    pandas.DataFrame
        Frame with the five columns declared in the schema below.
    """
    # Single ordered mapping: the keys give the column names (in file order)
    # and the values give the dtype requested for each one.
    schema = {
        "cnpj": "string",
        "tipo_socio": "Int64",
        "nome_socio": "string",
        "documento_socio": "string",
        "codigo_qualificacao_socio": "string",
    }

    return get_source_data(endpoint, "Socios", list(schema), schema)


# Download and parse the Socios file (network access; progress is printed).
df_socios = get_socios(endpoint)

Última pasta encontrada: 2025-09
2025-09
Primeiro arquivo encontrado: Socios1.zip
Arquivo dentro do ZIP: K3241.K03200Y1.D50913.SOCIOCSV
Total de linhas: 2019150


# Análise Exploratória

## Empresas

In [16]:
# Structural overview of the Empresas frame: row count, column dtypes, memory.
df_empresas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4494860 entries, 0 to 4494859
Data columns (total 6 columns):
 #   Column                    Dtype 
---  ------                    ----- 
 0   cnpj                      string
 1   razao_social              string
 2   natureza_juridica         Int64 
 3   qualificacao_responsavel  Int64 
 4   capital_social            string
 5   cod_porte                 string
dtypes: Int64(2), string(4)
memory usage: 214.3 MB


In [17]:
# Summary statistics; only the Int64 columns appear, since the rest are strings.
df_empresas.describe()

Unnamed: 0,natureza_juridica,qualificacao_responsavel
count,4494860.0,4494860.0
mean,2269.640283,45.696283
std,524.491856,10.662181
min,1015.0,0.0
25%,2062.0,49.0
50%,2135.0,49.0
75%,2135.0,50.0
max,8885.0,65.0


### Checando CNPJs duplicados

In [18]:
# Count rows whose CNPJ occurs more than once. Without ``subset`` the check
# would only catch rows identical in every column, which is not what the
# messages below report. keep=False marks every occurrence, not just repeats.
duplicates = df_empresas.duplicated(subset=["cnpj"], keep=False).sum()

if duplicates > 0:
    print(f"Existem {duplicates} registros duplicados")
else:
    print("Não existem CNPJs duplicados")

Não existem CNPJs duplicados


### Checando % de Nulos

In [19]:
# Share of missing values per column of the Empresas frame, in percent.
null_percentage = df_empresas.isna().sum().div(len(df_empresas)).mul(100)
print(null_percentage.round(2))

cnpj                        0.0
razao_social                0.0
natureza_juridica           0.0
qualificacao_responsavel    0.0
capital_social              0.0
cod_porte                   0.0
dtype: float64


## Sócios

In [20]:
# Structural overview of the Socios frame: row count, column dtypes, memory.
df_socios.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2019150 entries, 0 to 2019149
Data columns (total 5 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   cnpj                       string
 1   tipo_socio                 Int64 
 2   nome_socio                 string
 3   documento_socio            string
 4   codigo_qualificacao_socio  string
dtypes: Int64(1), string(4)
memory usage: 79.0 MB


In [21]:
# Summary statistics; only the Int64 column appears, since the rest are strings.
df_socios.describe()

Unnamed: 0,tipo_socio
count,2019150.0
mean,1.97957
std,0.145637
min,1.0
25%,2.0
50%,2.0
75%,2.0
max,3.0


### Tipos de Sócios

In [22]:
# Number of rows per tipo_socio code, most frequent first.
df_socios["tipo_socio"].value_counts()

tipo_socio
2    1975481
1      42460
3       1209
Name: count, dtype: Int64

### Checando registros de Sócios duplicados

In [23]:
# Count rows that are identical in every column (a CNPJ may legitimately
# repeat here, so whole records are compared). keep=False marks every
# occurrence, not just the repeats.
duplicates = df_socios.duplicated(keep=False).sum()

if duplicates > 0:
    print(f"Existem {duplicates} registros duplicados")
else:
    # This check is about whole records, not CNPJs, so the message says so.
    print("Não existem registros duplicados")

Existem 2 registros duplicados


### Checando % de Nulos

In [24]:
# Share of missing values per column of the Socios frame, in percent.
null_percentage = df_socios.isna().sum().div(len(df_socios)).mul(100)
print(null_percentage.round(2))

cnpj                         0.00
tipo_socio                   0.00
nome_socio                   0.01
documento_socio              0.06
codigo_qualificacao_socio    0.00
dtype: float64
