Extração dos dados do Enade

In [1]:
# Importar bibliotecas necessárias
import pandas as pd
import numpy as np
import zipfile
import requests
import ssl
import certifi
import urllib3
from io import BytesIO
import os
import glob

In [None]:
# Create the directory that will hold the extracted ENADE content;
# exist_ok=True makes this cell safe to re-run when the directory already exists
os.makedirs('./enade2021', exist_ok=True)

In [None]:
# URL of the ENADE microdata archive published by INEP.
# NOTE(review): the original pointed at the 2019 archive, while the target
# directory and the recorded output (NU_ANO == 2021) refer to 2021 — confirm.
url = 'https://download.inep.gov.br/microdados/microdados_enade_2021.zip'

# Download the archive into memory. The timeout prevents the cell from hanging
# forever, and raise_for_status() stops us from trying to unzip an HTTP error page.
response = requests.get(url, timeout=300)
response.raise_for_status()
filebytes = BytesIO(response.content)

# Extract the archive contents; the context manager closes the archive afterwards
with zipfile.ZipFile(filebytes) as myzip:
    myzip.extractall("./enade2021")

Verificando os dados baixados

In [19]:
# Glob pattern matching the extracted ENADE data files (';'-separated text files)
file_pattern = './enade2021/2.DADOS/*.txt'

# Maximum number of rows kept from each individual file
sample_size = 1000  # Adjust the sample size as needed

# Number of rows in the final displayed/exported sample
final_sample_size = 30

# glob returns matches in arbitrary order; sort them so the concatenation order,
# and therefore the seeded sampling below, is reproducible across runs and machines
file_paths = sorted(glob.glob(file_pattern))
if not file_paths:
    raise FileNotFoundError(
        f"No files match {file_pattern!r}; run the download/extract cell first"
    )

# Read each file into its own DataFrame, reporting (and skipping) files that
# are empty or cannot be parsed
enade = []
for file_path in file_paths:
    try:
        df = pd.read_csv(file_path, sep = ";", decimal = ".", low_memory = False)
    except pd.errors.EmptyDataError:
        print(f"Skipping empty file: {file_path}")
    except pd.errors.ParserError:
        print(f"Skipping file with parsing error: {file_path}")
    else:
        enade.append(df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list,
# so fail here with a message that names the actual problem
if not enade:
    raise ValueError(f"None of the files matching {file_pattern!r} could be read")

# Keep at most `sample_size` rows per file; smaller files are kept whole
sampled_data = [
    df.sample(n=sample_size, random_state=1) if len(df) > sample_size else df
    for df in enade
]

# Stack the per-file samples into a single DataFrame with a fresh 0..n-1 index
combined_data = pd.concat(sampled_data, ignore_index=True)

# Keep only the rows where CO_IES is present
combined_data_clean = combined_data[combined_data['CO_IES'].notna()]

# Draw the final sample; cap n at the number of available rows because
# DataFrame.sample raises when n exceeds the population (without replacement)
enade_amostra = combined_data_clean.sample(
    n=min(final_sample_size, len(combined_data_clean)),
    random_state=1,
)

# Display the sample
print(enade_amostra)

     NU_ANO  CO_CURSO   CO_IES  CO_CATEGAD  CO_ORGACAD  CO_GRUPO  \
507    2021    110000    143.0         5.0     10028.0    2001.0   
818    2021   1160847   3368.0         5.0     10020.0    4301.0   
452    2021   1442114  18165.0         2.0     10028.0    2001.0   
368    2021     12916    574.0         1.0     10028.0    1602.0   
242    2021   1119382    163.0         4.0     10028.0    4006.0   
929    2021   1137223   2111.0         5.0     10020.0    2001.0   
262    2021    150203    438.0         5.0     10022.0    2001.0   
810    2021      3416     57.0         2.0     10028.0    2001.0   
318    2021     98892   1491.0         4.0     10020.0    2001.0   
49     2021     34704   1365.0         5.0     10020.0     904.0   
446    2021   1075632  13488.0         5.0     10022.0    2001.0   
142    2021     89012    167.0         5.0     10028.0    4006.0   
968    2021   1174338  23410.0         2.0     10028.0    1602.0   
345    2021   1127191    583.0         1.0     1

In [None]:
# Print a concise summary of the sampled DataFrame (columns, non-null counts, dtypes)
enade_amostra.info()

In [None]:
# Show the column-name -> dtype mapping of the sampled DataFrame as a plain dict
enade_amostra.dtypes.to_dict()

In [20]:
# Export the sampled DataFrame to a CSV file; index=False leaves the row index out
enade_amostra.to_csv('./enade2021/2.DADOS/enade.csv', index=False)