# Download Data

In [1]:
import sys
import subprocess
import os
import shutil
import time
import zipfile
import pandas as pd
import sqlite3

urls = [
    'https://download.inep.gov.br/informacoes_estatisticas/indicadores_educacionais/2019/remuneracao_media_docentes/remuneracao_docentes_brasil_2019.zip',
    'https://download.inep.gov.br/informacoes_estatisticas/indicadores_educacionais/2019/remuneracao_media_docentes/remuneracao_docentes_uf_2019.zip',
    'https://download.inep.gov.br/informacoes_estatisticas/indicadores_educacionais/2019/remuneracao_media_docentes/remuneracao_docentes_municipios_2019.zip',
    'https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2019/divulgacao_brasil_ideb_2019.zip',
    'https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2019/divulgacao_regioes_ufs_ideb_2019.zip',
    'https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2019/divulgacao_anos_iniciais_municipios_2019.zip',
    'https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2019/divulgacao_anos_finais_municipios_2019.zip',
    'https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2019/divulgacao_ensino_medio_municipios_2019.zip',
    'https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2019/divulgacao_anos_iniciais_escolas_2019.zip',
    'https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2019/divulgacao_anos_finais_escolas_2019.zip',
    'https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2019/divulgacao_ensino_medio_escolas_2019.zip'
]

download_dir = './tmp/'
data_dir = './data/'
extract_dir = './data/raw/'
database_dir = './data/database.db'


if os.path.exists(download_dir):
    shutil.rmtree(download_dir)

os.makedirs(download_dir, exist_ok=True)

if os.path.exists(data_dir):
    shutil.rmtree(data_dir)

os.makedirs(data_dir, exist_ok=True)

if os.path.exists(extract_dir):
    shutil.rmtree(extract_dir)

os.makedirs(extract_dir, exist_ok=True)

# Verifica o sistema operacional
if 'google.colab' in sys.modules:
    # Se estiver no Google Colab, usa o comando !wget
    for url in urls[0:]:
        !wget -P tmp/ --no-check-certificate {url}
elif sys.platform == 'win32':
    # Se estiver no Jupyter Notebook (Windows), usa o comando curl
    for url in urls:
        subprocess.run(['curl', '-O', url])
    for url in urls:
      filename = url.split("/")[-1]  # Obtém o nome do arquivo da URL
      shutil.move(filename, os.path.join("tmp", filename))

else: # Linux
    for url in urls:
      subprocess.run(['wget', '-P', f"tmp/{url.split('/')[-1]}", "--no-check-certificate", url])

arquivos_tmp = os.listdir("tmp")

for arquivo_tmp in arquivos_tmp:
    caminho_zip = os.path.join("tmp", arquivo_tmp)

    # Verifique se o arquivo é um arquivo ZIP
    if zipfile.is_zipfile(caminho_zip):

        with zipfile.ZipFile(caminho_zip, 'r') as zip_ref:
            found = False
            for file_name in zip_ref.namelist():
                if file_name.endswith(".xlsx"):
                    # Extrair o arquivo para um nome de arquivo específico no diretório de extração
                    nome_destino = os.path.basename(file_name)  # obtém apenas o nome do arquivo
                    destino_arquivo = os.path.join(extract_dir, nome_destino)
                    with zip_ref.open(file_name) as arquivo_zip, open(destino_arquivo, 'wb') as arquivo_extraido:
                        arquivo_extraido.write(arquivo_zip.read())
                    print(f'Arquivo {file_name} extraído para {extract_dir}')
                    found = True

            if not found:
                print(f'Erro: Nenhum arquivo com extensão .xlsx encontrado no arquivo ZIP {caminho_zip}')

        # Remove o arquivo ZIP após a extração
        os.remove(caminho_zip)
        print(f'Arquivo ZIP {caminho_zip} removido')
    else:
        print(f'O arquivo {arquivo_tmp} não é um arquivo ZIP válido')



Arquivo divulgacao_anos_finais_escolas_2019/divulgacao_anos_finais_escolas_2019.xlsx extraído para ./data/raw/
Arquivo ZIP tmp\divulgacao_anos_finais_escolas_2019.zip removido
Arquivo divulgacao_anos_finais_municipios_2019.xlsx extraído para ./data/raw/
Arquivo ZIP tmp\divulgacao_anos_finais_municipios_2019.zip removido
Arquivo divulgacao_anos_iniciais_escolas_2019/divulgacao_anos_iniciais_escolas_2019.xlsx extraído para ./data/raw/
Arquivo ZIP tmp\divulgacao_anos_iniciais_escolas_2019.zip removido
Arquivo divulgacao_anos_iniciais_municipios_2019.xlsx extraído para ./data/raw/
Arquivo ZIP tmp\divulgacao_anos_iniciais_municipios_2019.zip removido
Arquivo divulgacao_brasil_ideb_2019.xlsx extraído para ./data/raw/
Arquivo ZIP tmp\divulgacao_brasil_ideb_2019.zip removido
Arquivo divulgacao_ensino_medio_escolas_2019/divulgacao_ensino_medio_escolas_2019.xlsx extraído para ./data/raw/
Arquivo ZIP tmp\divulgacao_ensino_medio_escolas_2019.zip removido
Arquivo divulgacao_ensino_medio_municipios_

# Database creation

In [2]:
# Definição de colunas

region_column = 'Região'
state_column = 'UF'
city_id_column = 'Código do município'
city_name_column = 'Nome do município'
adm_dependence_column = 'Dependência administrativa'
education_column = 'Escolaridade do docente'
teacher_number_column = 'Número de docentes Censo Escolar'
rais_teacher_percentage_column = '% de docentes localizados na RAIS'
first_quartile_column = '1º quartil'
median_column ='Mediana'
average_column = 'Média'
third_quartile_column = '3º quartil',
standard_deviation_column = 'Desvio padrão'
weekly_workload_column ='Carga horária média semanal'
standardized_pay_40_hours_column ='Remuneração média padronizada para 40h semanais em R$'

state_or_region = 'UF ou região'

school_id_column = 'Código da escola'
school_name_column = 'Nome da escola'
school_type_column = 'Rede'

first_to_fifth_grade_results_column = 'Resultados primeiro ao quinto ano'
first_grade_results_column = 'Resultados primeiro ano'
second_grade_results_column = 'Resultados segundo ano'
third_grade_results_column = 'Resultados terceiro ano'
fourth_grade_results_column = 'Resultados quarto ano'
fifth_grade_results_column = 'Resultados quinto ano'

sixth_to_nineth_grade_results_column = 'Resultados sexto ao nono ano'
sixth_grade_results_column = 'Resultados sexto ano'
seventh_grade_results_column = 'Resultados setimo ano'
eighth_grade_results_column = 'Resultados oitavo ano'
ninth_grade_results_column = 'Resultados nono ano'

first_to_third_high_grade_results_column = 'Resultados primeiro ao terceiro ano ensino medio'
first_high_grade_results_column = 'Resultados primeiro ano ensino medio'
second_high_grade_results_column = 'Resultados segundo ano ensino medio'
third_high_grade_results_column = 'Resultados terceiro ano ensino medio'

yield_indicator_column = 'Indicador de rendimento (P)'
math_results = 'Resultados matematica'
portuguese_results_column = 'Resultados portugues'
standardized_average_column = 'Media padronizada (N)'
ideb_column = 'IDEB (N x P)'


# Verificando os arquivos extraídos
extracted_files = os.listdir(extract_dir)
extracted_files = sorted(extracted_files)

# Conectar ao banco de dados SQLite (isso criará o arquivo exemplo.db se não existir)
conn = sqlite3.connect(database_dir)


columns = [

    [
        adm_dependence_column,
        education_column, teacher_number_column, rais_teacher_percentage_column, first_quartile_column,
        median_column, average_column, third_quartile_column, standard_deviation_column, weekly_workload_column,
        standardized_pay_40_hours_column
    ], # Brasil columns

    [
        region_column, state_column, city_id_column, city_name_column, adm_dependence_column,
        education_column, teacher_number_column, rais_teacher_percentage_column, first_quartile_column,
        median_column, average_column, third_quartile_column, standard_deviation_column, weekly_workload_column,
        standardized_pay_40_hours_column
    ], # Municipios columns

    [
        region_column, state_column, adm_dependence_column,
        education_column, teacher_number_column, rais_teacher_percentage_column, first_quartile_column,
        median_column, average_column, third_quartile_column, standard_deviation_column, weekly_workload_column,
        standardized_pay_40_hours_column
    ], # UFs columns

    [
        state_column, city_id_column, city_name_column,
        school_id_column, school_name_column, school_type_column, sixth_to_nineth_grade_results_column, sixth_grade_results_column,
        seventh_grade_results_column, eighth_grade_results_column, ninth_grade_results_column, yield_indicator_column,
        math_results, portuguese_results_column, standardized_average_column, ideb_column
    ],
    [
        state_column, city_id_column, city_name_column,
        school_type_column, sixth_to_nineth_grade_results_column, sixth_grade_results_column,
        seventh_grade_results_column, eighth_grade_results_column, ninth_grade_results_column, yield_indicator_column,
        math_results, portuguese_results_column, standardized_average_column, ideb_column
    ],

    [
        state_column, city_id_column, city_name_column, school_id_column, school_name_column,
        school_type_column, first_to_fifth_grade_results_column, first_grade_results_column, second_grade_results_column,
        third_grade_results_column, fourth_grade_results_column, fifth_grade_results_column, yield_indicator_column,
        math_results, portuguese_results_column, standardized_average_column, ideb_column
    ],
    [
        state_column, city_id_column, city_name_column,
        school_type_column, first_to_fifth_grade_results_column, first_grade_results_column, second_grade_results_column,
        third_grade_results_column, fourth_grade_results_column, fifth_grade_results_column, yield_indicator_column,
        math_results, portuguese_results_column, standardized_average_column, ideb_column
    ],
    [
        school_type_column, first_to_fifth_grade_results_column, first_grade_results_column, second_grade_results_column,
        third_grade_results_column, fourth_grade_results_column, fifth_grade_results_column, yield_indicator_column,
        math_results, portuguese_results_column, standardized_average_column, ideb_column
    ],
    [
        state_column, city_id_column, city_name_column,
        school_id_column, school_name_column, school_type_column,
        first_to_third_high_grade_results_column, first_high_grade_results_column,
        second_high_grade_results_column, third_high_grade_results_column, yield_indicator_column,
        math_results, portuguese_results_column, standardized_average_column, ideb_column
    ],
    [
        state_column, city_id_column, city_name_column, school_type_column,
        first_to_third_high_grade_results_column, first_high_grade_results_column,
        second_high_grade_results_column, third_high_grade_results_column, yield_indicator_column,
        math_results, portuguese_results_column, standardized_average_column, ideb_column
    ],
    [
        state_or_region, school_type_column,
        first_to_fifth_grade_results_column, first_grade_results_column, second_grade_results_column,
        third_grade_results_column, fourth_grade_results_column, fifth_grade_results_column, yield_indicator_column,
        math_results, portuguese_results_column, standardized_average_column, ideb_column
    ]
]

dtype = [

    {
        adm_dependence_column: 'string',
        education_column: 'string',
        teacher_number_column: 'int32',
        rais_teacher_percentage_column: 'float32',
        first_quartile_column: 'float32',
        median_column: 'float32',
        average_column: 'float32',
        third_quartile_column: 'float32',
        standard_deviation_column: 'float32' ,
        weekly_workload_column: 'float32',
        standardized_pay_40_hours_column: 'float32',
    },

    {

        region_column: 'string',
        state_column: 'string',
        city_id_column: 'int32',
        city_name_column: 'string',
        adm_dependence_column: 'string',
        education_column: 'string',
        teacher_number_column: 'int32',
        rais_teacher_percentage_column: 'float32',
        first_quartile_column: 'float32',
        median_column: 'float32',
        average_column: 'float32',
        third_quartile_column: 'float32',
        standard_deviation_column: 'float32' ,
        weekly_workload_column: 'float32',
        standardized_pay_40_hours_column: 'float32',
    },

    {
        region_column: 'string',
        state_column: 'string',
        adm_dependence_column: 'string',
        education_column: 'string',
        teacher_number_column: 'int32',
        rais_teacher_percentage_column: 'float32',
        first_quartile_column: 'float32',
        median_column: 'float32',
        average_column: 'float32',
        third_quartile_column: 'float32',
        standard_deviation_column: 'float32' ,
        weekly_workload_column: 'float32',
        standardized_pay_40_hours_column: 'float32',

    },

    {

        state_column: 'string',
        city_id_column: 'int32',
        city_name_column: 'string',
        school_id_column: 'int32',
        school_name_column: 'string',
        school_type_column: 'string',
        sixth_to_nineth_grade_results_column: 'float32',
        sixth_grade_results_column: 'float32',
        seventh_grade_results_column: 'float32',
        eighth_grade_results_column: 'float32',
        ninth_grade_results_column: 'float32',
        yield_indicator_column: 'float32',
        math_results: 'float32',
        portuguese_results_column: 'float32',
        standardized_average_column: 'float32',
        ideb_column: 'float32'

    },

    {

        state_column: 'string',
        city_id_column: 'int32',
        city_name_column: 'string',
        school_type_column: 'string',
        sixth_to_nineth_grade_results_column: 'float32',
        sixth_grade_results_column: 'float32',
        seventh_grade_results_column: 'float32',
        eighth_grade_results_column: 'float32',
        ninth_grade_results_column: 'float32',
        yield_indicator_column: 'float32',
        math_results: 'float32',
        portuguese_results_column: 'float32',
        standardized_average_column: 'float32',
        ideb_column: 'float32'

    },

    {

        state_column: 'string',
        city_id_column: 'int32',
        city_name_column: 'string',
        school_id_column: 'int32',
        school_name_column: 'string',
        school_type_column: 'string',
        first_to_fifth_grade_results_column: 'float32',
        first_grade_results_column: 'float32',
        second_grade_results_column: 'float32',
        third_grade_results_column: 'float32',
        fourth_grade_results_column: 'float32',
        fifth_grade_results_column: 'float32',
        yield_indicator_column: 'float32',
        math_results: 'float32',
        portuguese_results_column: 'float32',
        standardized_average_column: 'float32',
        ideb_column: 'float32'

    },

    {

        state_column: 'string',
        city_id_column: 'int32',
        city_name_column: 'string',
        school_type_column: 'string',
        first_to_fifth_grade_results_column: 'float32',
        first_grade_results_column: 'float32',
        second_grade_results_column: 'float32',
        third_grade_results_column: 'float32',
        fourth_grade_results_column: 'float32',
        fifth_grade_results_column: 'float32',
        yield_indicator_column: 'float32',
        math_results: 'float32',
        portuguese_results_column: 'float32',
        standardized_average_column: 'float32',
        ideb_column: 'float32'

    },

    {
        school_type_column: 'string',
        first_to_fifth_grade_results_column: 'float32',
        first_grade_results_column: 'float32',
        second_grade_results_column: 'float32',
        third_grade_results_column: 'float32',
        fourth_grade_results_column: 'float32',
        fifth_grade_results_column: 'float32',
        yield_indicator_column: 'float32',
        math_results: 'float32',
        portuguese_results_column: 'float32',
        standardized_average_column: 'float32',
        ideb_column: 'float32'

    },

    {

        state_column: 'string',
        city_id_column: 'int32',
        city_name_column: 'string',
        school_id_column: 'int32',
        school_name_column: 'string',
        school_type_column: 'string',
        first_to_third_high_grade_results_column: 'float32',
        first_high_grade_results_column: 'float32',
        second_high_grade_results_column: 'float32',
        third_high_grade_results_column: 'float32',
        yield_indicator_column: 'float32',
        math_results: 'float32',
        portuguese_results_column: 'float32',
        standardized_average_column: 'float32',
        ideb_column: 'float32'

    },

    {

        state_column: 'string',
        city_id_column: 'int32',
        city_name_column: 'string',
        school_type_column: 'string',
        first_to_third_high_grade_results_column: 'float32',
        first_high_grade_results_column: 'float32',
        second_high_grade_results_column: 'float32',
        third_high_grade_results_column: 'float32',
        yield_indicator_column: 'float32',
        math_results: 'float32',
        portuguese_results_column: 'float32',
        standardized_average_column: 'float32',
        ideb_column: 'float32'

    },

    {

        state_or_region: 'string',
        school_type_column: 'string',
        first_to_third_high_grade_results_column: 'float32',
        first_high_grade_results_column: 'float32',
        second_high_grade_results_column: 'float32',
        third_high_grade_results_column: 'float32',
        yield_indicator_column: 'float32',
        math_results: 'float32',
        portuguese_results_column: 'float32',
        standardized_average_column: 'float32',
        ideb_column: 'float32'

    },


]

table_names = [
    'wage_brasil',
    'wage_per_city',
    'wage_per_state',
    'divulgacao_anos_finais_escolas_2019',
    'divulgacao_anos_finais_municipios_2019',
    'divulgacao_anos_iniciais_escolas_2019',
    'divulgacao_anos_iniciais_municipios_2019',
    'divulgacao_brasil_ideb_2019',
    'divulgacao_ensino_medio_escolas_2019',
    'divulgacao_ensino_medio_municipios_2019',
    'divulgacao_regioes_ufs_ideb'
]


intervalos = [
    [('A', 'B')], [('A', 'A')], [('A', 'A')],[('G', 'AV'), ('BC', 'BW'), ('CA', 'CG'), ('CI', 'CP')],
    [('E', 'AT'), ('BA', 'BU'), ('BY', 'CE'), ('CG', 'CN')],
    [('G', 'BC'), ('BK', 'CE'), ('CI', 'CO'), ('CQ', 'CX')],
    [('E', 'BA'), ('BI', 'CC'), ('CG', 'CM'), ('CO', 'CV')],
    [('A', 'A'), ('C', 'AY'), ('BG', 'CA'), ('CE', 'CK'), ('CM', 'CT')],
    [('G', 'L'), ('Q', 'Q'), ('S', 'U'), ('Y', 'Y'), ('AA', 'AB')],
    [('E', 'J'), ('O', 'O'), ('Q', 'S'), ('W', 'W'), ('Y', 'Z')],
    [('C', 'AY'), ('BG', 'CA'), ('CE', 'CK'), ('CM', 'CT')]
]

# Função para converter letra para número de coluna no Excel
def letra_para_numero(letra):
    numero = 0
    for i, l in enumerate(reversed(letra)):
        numero += (ord(l) - 64) * (26 ** i)
    return numero - 1

# Array para armazenar os índices das colunas
drop_columns = []

# Percorrendo os intervalos e adicionando os índices das colunas ao array

for intervalo in intervalos:
    temp = []
    for inicio, fim in intervalo:
        indice_inicio = letra_para_numero(inicio)
        indice_fim = letra_para_numero(fim)
        temp.extend(range(indice_inicio, indice_fim + 1))
    drop_columns.append(temp)

skip_rows = [[9, 3], [9, 3], [9, 3], [10, 3], [10, 3], [10, 3], [10, 3], [10, 3], [10, 3], [10, 3], [10, 6]]

indesejados = ['a', 'c', 'd', '-']

for i in range(0, 11) :
    dataframe = pd.read_excel(f'{extract_dir}{extracted_files[i]}', skiprows=skip_rows[i][0], header=None, skipfooter=skip_rows[i][1], dtype=dtype[i])

    colunas_para_dropar = dataframe.columns[drop_columns[i]]
    dataframe = dataframe.drop(colunas_para_dropar, axis=1)

    dataframe.columns = columns[i]

    dataframe_filtered = dataframe.dropna()

    mascara_indesejados = dataframe.isin(indesejados).any(axis=1)
    dataframe_filtered = dataframe_filtered[~mascara_indesejados]

    if (i == 7) or (i == 10):
        dataframe_filtered.loc[dataframe_filtered[school_type_column].str.contains(r'Privada \(\d\)', regex=True), school_type_column] = 'Privada'
        dataframe_filtered.loc[dataframe_filtered[school_type_column].str.contains(r'Estadual \(\d\)', regex=True), school_type_column] = 'Estadual'
        dataframe_filtered.loc[dataframe_filtered[school_type_column].str.contains(r'Pública \(\d\)', regex=True), school_type_column] = 'Pública'
        dataframe_filtered.loc[dataframe_filtered[school_type_column].str.contains(r'Municipal \(\d\)', regex=True), school_type_column] = 'Municipal'
        dataframe_filtered.loc[dataframe_filtered[school_type_column].str.contains(r'Total \(\d\)\(\d\)', regex=True), school_type_column] = 'Total'
        dataframe_filtered.loc[dataframe_filtered[school_type_column].str.contains(r'Total \(\d\)', regex=True), school_type_column] = 'Total'

    dataframe_filtered.to_sql(table_names[i], conn, index=False, if_exists='replace')

print('Import completed')
conn.close()

Import completed


# Data Analysis

In [4]:
import sqlite3
import pandas as pd
from prettytable import PrettyTable

def display_df(df, query):
    table = PrettyTable()
    table.field_names = df.columns.tolist()
    for row in df.itertuples(index=False, name=None):
        table.add_row(row)
    print(f"Results for query: {query}\n")
    print(table)
    print("\n")

# Path to your .db file
database_dir = './data/database.db'

# Establish a connection to the SQLite database
conn = sqlite3.connect(database_dir)

# Example query to fetch data from a table
query = "SELECT * FROM wage_brasil"

# Execute the query and load the results into a pandas DataFrame
df = pd.read_sql_query(query, conn)

display_df(df, query)

# Close the connection
conn.close()


Results for query: SELECT * FROM wage_brasil

+----------------------------+-------------------------+----------------------------------+-----------------------------------+------------+----------+----------+-----------------+---------------+-----------------------------+-------------------------------------------------------+
| Dependência administrativa | Escolaridade do docente | Número de docentes Censo Escolar | % de docentes localizados na RAIS | 1º quartil | Mediana  |  Média   | ('3º quartil',) | Desvio padrão | Carga horária média semanal | Remuneração média padronizada para 40h semanais em R$ |
+----------------------------+-------------------------+----------------------------------+-----------------------------------+------------+----------+----------+-----------------+---------------+-----------------------------+-------------------------------------------------------+
|          Federal           |          Total          |              36913               |              