In [None]:
import os
import shutil
import zipfile
import sys
import requests
import concurrent.futures
import urllib3
import sqlite3
import pandas as pd
from prettytable import PrettyTable
import seaborn as sb

# Scratch directory that receives the downloaded ZIP archives.
download_dir = './tmp/'
# Output directory; the raw spreadsheets and the database file both live under it.
data_dir = './data/'
# Directory the archives are moved to and the .xlsx files are extracted into.
extract_dir = './data/raw/'
# Path of the SQLite database file (a file path, despite the `_dir` suffix).
database_dir = './data/database.db'

# Download Data

In [None]:
# Silence the InsecureRequestWarning urllib3 emits for HTTPS requests made without certificate verification
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# INEP 2019 archives: the three teacher-remuneration files first, then the IDEB result files
urls = [
  'https://download.inep.gov.br/informacoes_estatisticas/indicadores_educacionais/2019/remuneracao_media_docentes/remuneracao_docentes_brasil_2019.zip',
  'https://download.inep.gov.br/informacoes_estatisticas/indicadores_educacionais/2019/remuneracao_media_docentes/remuneracao_docentes_uf_2019.zip',
  'https://download.inep.gov.br/informacoes_estatisticas/indicadores_educacionais/2019/remuneracao_media_docentes/remuneracao_docentes_municipios_2019.zip',
  'https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2019/divulgacao_brasil_ideb_2019.zip',
  'https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2019/divulgacao_regioes_ufs_ideb_2019.zip',
  'https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2019/divulgacao_anos_iniciais_municipios_2019.zip',
  'https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2019/divulgacao_anos_finais_municipios_2019.zip',
  'https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2019/divulgacao_ensino_medio_municipios_2019.zip',
  'https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2019/divulgacao_anos_iniciais_escolas_2019.zip',
  'https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2019/divulgacao_anos_finais_escolas_2019.zip',
  'https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2019/divulgacao_ensino_medio_escolas_2019.zip'
]

# Start every run from empty working directories: wipe each one if present, then recreate it.
# The order matters because extract_dir lives inside data_dir.
for directory in (download_dir, data_dir, extract_dir):
  if os.path.exists(directory):
    shutil.rmtree(directory)
  os.makedirs(directory, exist_ok=True)

def download_file(url):
    """Download ``url`` into ``download_dir`` and return the path of the saved file.

    On Google Colab the transfer is delegated to the ``wget`` executable;
    everywhere else the response is streamed to disk with ``requests``.

    Args:
        url: Address of the file to fetch. Its basename becomes the local file name.

    Returns:
        The local path of the downloaded file (same value on both branches).

    Raises:
        subprocess.CalledProcessError: ``wget`` exited with a non-zero status (Colab branch).
        requests.HTTPError: the server answered with a 4xx/5xx status (requests branch).
    """
    filename = os.path.join(download_dir, os.path.basename(url))
    if 'google.colab' in sys.modules:
        # Local import: only this branch needs it.
        import subprocess

        # `wget -P <dir>` stores the file under its URL basename, i.e. at `filename`.
        # check=True turns a failed transfer into an exception the caller can report.
        subprocess.run(
            ['wget', '-P', download_dir, '--no-check-certificate', url],
            check=True,
        )
    else:
        # NOTE(review): verify=False disables TLS certificate validation; kept from the
        # original code, confirm it is still required for this host.
        with requests.get(url, stream=True, verify=False, timeout=60) as r:
            # Fail loudly instead of saving an HTML error page under a .zip name.
            r.raise_for_status()
            # Let urllib3 undo any Content-Encoding before the bytes reach the file.
            r.raw.decode_content = True
            with open(filename, 'wb') as f:
                shutil.copyfileobj(r.raw, f)
    return filename

if __name__ == "__main__":
  # Fetch all archives concurrently; a failed download is reported and does not stop the others.
  with concurrent.futures.ThreadPoolExecutor() as pool:
    pending = {}
    for source_url in urls:
      pending[pool.submit(download_file, source_url)] = source_url
    for finished in concurrent.futures.as_completed(pending):
      url = pending[finished]
      try:
        filename = finished.result()
        print(f"Downloaded {url} to {filename}")
      except Exception as e:
        print(f"Failed to download {url}: {e}")

  # Relocate everything that was downloaded into the extraction directory.
  for entry in os.listdir(download_dir):
    shutil.move(os.path.join(download_dir, entry), os.path.join(extract_dir, entry))

  # Unpack the .xlsx members of each archive next to it, then delete the archive.
  for entry in os.listdir(extract_dir):
    if not entry.endswith(".zip"):
      print(f'The file {entry} is not a valid ZIP file')
      continue

    zip_path = os.path.join(extract_dir, entry)
    with zipfile.ZipFile(zip_path, 'r') as archive:
      spreadsheets = [member for member in archive.namelist() if member.endswith(".xlsx")]
      for member in spreadsheets:
        # Flatten any folder structure inside the archive: only the base name is kept.
        dest_path = os.path.join(extract_dir, os.path.basename(member))
        with archive.open(member) as source, open(dest_path, 'wb') as target:
          target.write(source.read())
        print(f'File {member} extracted to {extract_dir}')

      if not spreadsheets:
        print(f'Error: No .xlsx files found in the ZIP file {zip_path}')

    os.remove(zip_path)
    print(f'ZIP file {zip_path} removed')


# Database creation

In [None]:
# Column-name definitions.
# Each constant is the header given to one spreadsheet column once it is loaded, and
# therefore the column name in the SQLite tables (the view query refers to several
# of them by these exact strings).

# Teacher-remuneration spreadsheets
region_column = 'Região'
state_column = 'UF'
city_id_column = 'Código do município'
city_name_column = 'Nome do município'
adm_dependence_column = 'Dependência administrativa'
education_column = 'Escolaridade do docente'
teacher_number_column = 'Número de docentes Censo Escolar'
rais_teacher_percentage_column = '% de docentes localizados na RAIS'
first_quartile_column = '1º quartil'
median_column = 'Mediana'
average_column = 'Média'
# Plain string: a trailing comma here used to turn this name into a 1-tuple.
third_quartile_column = '3º quartil'
standard_deviation_column = 'Desvio padrão'
weekly_workload_column = 'Carga horária média semanal'
standardized_pay_40_hours_column = 'Remuneração média padronizada para 40h semanais em R$'

state_or_region = 'UF ou região'

# School identification
school_id_column = 'Código da escola'
school_name_column = 'Nome da escola'
school_type_column = 'Rede'

# First to fifth grade
first_to_fifth_grade_results_column = 'Resultados primeiro ao quinto ano'
first_grade_results_column = 'Resultados primeiro ano'
second_grade_results_column = 'Resultados segundo ano'
third_grade_results_column = 'Resultados terceiro ano'
fourth_grade_results_column = 'Resultados quarto ano'
fifth_grade_results_column = 'Resultados quinto ano'

# Sixth to ninth grade
sixth_to_nineth_grade_results_column = 'Resultados sexto ao nono ano'
sixth_grade_results_column = 'Resultados sexto ano'
seventh_grade_results_column = 'Resultados setimo ano'
eighth_grade_results_column = 'Resultados oitavo ano'
ninth_grade_results_column = 'Resultados nono ano'

# High school
first_to_third_high_grade_results_column = 'Resultados primeiro ao terceiro ano ensino medio'
first_high_grade_results_column = 'Resultados primeiro ano ensino medio'
second_high_grade_results_column = 'Resultados segundo ano ensino medio'
third_high_grade_results_column = 'Resultados terceiro ano ensino medio'

# Indicators shared by all IDEB spreadsheets
yield_indicator_column = 'Indicador de rendimento (P)'
math_results = 'Resultados matematica'
portuguese_results_column = 'Resultados portugues'
standardized_average_column = 'Media padronizada (N)'
ideb_column = 'IDEB (N x P)'


# Extracted files in alphabetical order; the import loop below pairs them with tables by position
extracted_files = sorted(os.listdir(extract_dir))

# Open the SQLite database (sqlite3 creates the file at database_dir if it does not exist yet)
conn = sqlite3.connect(database_dir)


# Reusable column groups; every per-spreadsheet layout below is assembled from them.
_wage_columns = [
    adm_dependence_column, education_column, teacher_number_column,
    rais_teacher_percentage_column, first_quartile_column, median_column,
    average_column, third_quartile_column, standard_deviation_column,
    weekly_workload_column, standardized_pay_40_hours_column,
]
_city_columns = [state_column, city_id_column, city_name_column]
_school_columns = [school_id_column, school_name_column]
_indicator_columns = [
    yield_indicator_column, math_results, portuguese_results_column,
    standardized_average_column, ideb_column,
]
_initial_years_columns = [
    school_type_column, first_to_fifth_grade_results_column, first_grade_results_column,
    second_grade_results_column, third_grade_results_column, fourth_grade_results_column,
    fifth_grade_results_column, *_indicator_columns,
]
_final_years_columns = [
    school_type_column, sixth_to_nineth_grade_results_column, sixth_grade_results_column,
    seventh_grade_results_column, eighth_grade_results_column, ninth_grade_results_column,
    *_indicator_columns,
]
_high_school_columns = [
    school_type_column, first_to_third_high_grade_results_column,
    first_high_grade_results_column, second_high_grade_results_column,
    third_high_grade_results_column, *_indicator_columns,
]

# Headers assigned to each spreadsheet after the unwanted columns are dropped.
# Position i goes with table_names[i]; each entry is a fresh list.
columns = [
    [*_wage_columns],                                             # Brasil columns
    [region_column, *_city_columns, *_wage_columns],              # Municipios columns
    [region_column, state_column, *_wage_columns],                # UFs columns
    [*_city_columns, *_school_columns, *_final_years_columns],    # divulgacao_anos_finais_escolas_2019
    [*_city_columns, *_final_years_columns],                      # divulgacao_anos_finais_municipios_2019
    [*_city_columns, *_school_columns, *_initial_years_columns],  # divulgacao_anos_iniciais_escolas_2019
    [*_city_columns, *_initial_years_columns],                    # divulgacao_anos_iniciais_municipios_2019
    [*_initial_years_columns],                                    # divulgacao_brasil_ideb_2019
    [*_city_columns, *_school_columns, *_high_school_columns],    # divulgacao_ensino_medio_escolas_2019
    [*_city_columns, *_high_school_columns],                      # divulgacao_ensino_medio_municipios_2019
    [state_or_region, *_initial_years_columns],                   # divulgacao_regioes_ufs_ideb
]

# Reusable dtype groups; key order inside each group follows the spreadsheet column order.
_wage_dtype = {
    adm_dependence_column: 'string',
    education_column: 'string',
    teacher_number_column: 'int32',
    **dict.fromkeys(
        [
            rais_teacher_percentage_column, first_quartile_column, median_column,
            average_column, third_quartile_column, standard_deviation_column,
            weekly_workload_column, standardized_pay_40_hours_column,
        ],
        'float32',
    ),
}
_city_dtype = {
    state_column: 'string',
    city_id_column: 'int32',
    city_name_column: 'string',
}
_school_dtype = {
    school_id_column: 'int32',
    school_name_column: 'string',
}
_indicator_dtype = dict.fromkeys(
    [
        yield_indicator_column, math_results, portuguese_results_column,
        standardized_average_column, ideb_column,
    ],
    'float32',
)
_initial_years_dtype = {
    school_type_column: 'string',
    **dict.fromkeys(
        [
            first_to_fifth_grade_results_column, first_grade_results_column,
            second_grade_results_column, third_grade_results_column,
            fourth_grade_results_column, fifth_grade_results_column,
        ],
        'float32',
    ),
    **_indicator_dtype,
}
_final_years_dtype = {
    school_type_column: 'string',
    **dict.fromkeys(
        [
            sixth_to_nineth_grade_results_column, sixth_grade_results_column,
            seventh_grade_results_column, eighth_grade_results_column,
            ninth_grade_results_column,
        ],
        'float32',
    ),
    **_indicator_dtype,
}
_high_school_dtype = {
    school_type_column: 'string',
    **dict.fromkeys(
        [
            first_to_third_high_grade_results_column, first_high_grade_results_column,
            second_high_grade_results_column, third_high_grade_results_column,
        ],
        'float32',
    ),
    **_indicator_dtype,
}

# dtype mapping handed to pd.read_excel for each spreadsheet, in the same order as `columns`.
# NOTE(review): read_excel is called with header=None, so these name-keyed entries presumably
# never match the integer column labels it produces — confirm whether they take any effect.
dtype = [
    {**_wage_dtype},
    {region_column: 'string', **_city_dtype, **_wage_dtype},
    {region_column: 'string', state_column: 'string', **_wage_dtype},
    {**_city_dtype, **_school_dtype, **_final_years_dtype},
    {**_city_dtype, **_final_years_dtype},
    {**_city_dtype, **_school_dtype, **_initial_years_dtype},
    {**_city_dtype, **_initial_years_dtype},
    {**_initial_years_dtype},
    {**_city_dtype, **_school_dtype, **_high_school_dtype},
    {**_city_dtype, **_high_school_dtype},
    # NOTE(review): keyed by the high-school columns although columns[10] lists the
    # first-to-fifth-grade ones — confirm which is intended.
    {state_or_region: 'string', **_high_school_dtype},
]

# Destination SQLite table for each spreadsheet. The import loop pairs entry i with
# extracted_files[i], so the order must follow the sorted listing of the extracted files.
table_names = [
    'wage_brasil',
    'wage_per_city',
    'wage_per_state',
    'divulgacao_anos_finais_escolas_2019',
    'divulgacao_anos_finais_municipios_2019',
    'divulgacao_anos_iniciais_escolas_2019',
    'divulgacao_anos_iniciais_municipios_2019',
    'divulgacao_brasil_ideb_2019',
    'divulgacao_ensino_medio_escolas_2019',
    'divulgacao_ensino_medio_municipios_2019',
    'divulgacao_regioes_ufs_ideb'
]


# Excel column-letter ranges (both ends inclusive) to drop from each spreadsheet,
# one list of (first, last) pairs per spreadsheet, in the same order as table_names.
# The first line holds the entries for the three wage spreadsheets.
intervalos = [
    [('A', 'B')], [('A', 'A')], [('A', 'A')],[('G', 'AV'), ('BC', 'BW'), ('CA', 'CG'), ('CI', 'CP')],
    [('E', 'AT'), ('BA', 'BU'), ('BY', 'CE'), ('CG', 'CN')],
    [('G', 'BC'), ('BK', 'CE'), ('CI', 'CO'), ('CQ', 'CX')],
    [('E', 'BA'), ('BI', 'CC'), ('CG', 'CM'), ('CO', 'CV')],
    [('A', 'A'), ('C', 'AY'), ('BG', 'CA'), ('CE', 'CK'), ('CM', 'CT')],
    [('G', 'L'), ('Q', 'Q'), ('S', 'U'), ('Y', 'Y'), ('AA', 'AB')],
    [('E', 'J'), ('O', 'O'), ('Q', 'S'), ('W', 'W'), ('Y', 'Z')],
    [('C', 'AY'), ('BG', 'CA'), ('CE', 'CK'), ('CM', 'CT')]
]

def letra_para_numero(letra):
    """Convert an Excel column label to a zero-based column index.

    The label is read as a base-26 number whose digits are A=1 ... Z=26,
    so 'A' -> 0, 'Z' -> 25, 'AA' -> 26. Uppercase letters are expected.

    Args:
        letra: Column label such as 'A' or 'CX'.

    Returns:
        The zero-based position of that column.
    """
    posicao = 0
    # Left-to-right accumulation: shift what was read so far by one base-26 digit, add the next.
    for caractere in letra:
        posicao = posicao * 26 + (ord(caractere) - 64)
    return posicao - 1

# Zero-based positions of the columns to drop, one list per spreadsheet:
# every (first, last) letter pair in `intervalos` is expanded into the full inclusive range.
drop_columns = [
    [
        indice
        for inicio, fim in intervalo
        for indice in range(letra_para_numero(inicio), letra_para_numero(fim) + 1)
    ]
    for intervalo in intervalos
]

# [rows skipped at the top, rows skipped at the bottom] when reading each spreadsheet
skip_rows = [
    [9, 3], [9, 3], [9, 3],
    [10, 3], [10, 3], [10, 3], [10, 3], [10, 3], [10, 3], [10, 3],
    [10, 6],
]

# Cell values that mark a row as unusable; any row containing one is discarded on import
indesejados = ['a', 'c', 'd', '-']

# Load every extracted spreadsheet into its SQLite table.
# Spreadsheet i (alphabetical order) is read with skip_rows[i], stripped of drop_columns[i],
# renamed with columns[i] and written to table_names[i].

# In the two aggregate IDEB tables the network label carries footnote markers such as
# "Privada (1)"; any label matching a pattern is replaced by the bare name.
# The patterns are applied in this order.
network_label_patterns = [
    ('Privada', r'Privada \(\d\)'),
    ('Estadual', r'Estadual \(\d\)'),
    ('Pública', r'Pública \(\d\)'),
    ('Municipal', r'Municipal \(\d\)'),
    ('Total', r'Total \(\d\)\(\d\)'),
    ('Total', r'Total \(\d\)'),
]

try:
    # A failed download only prints a message, so make a short file list an explicit error
    # instead of an IndexError in the middle of the loop.
    if len(extracted_files) < len(table_names):
        raise FileNotFoundError(
            f'Expected {len(table_names)} spreadsheets in {extract_dir}, found {len(extracted_files)}'
        )

    for i, table_name in enumerate(table_names):
        dataframe = pd.read_excel(
            os.path.join(extract_dir, extracted_files[i]),
            skiprows=skip_rows[i][0],
            header=None,
            skipfooter=skip_rows[i][1],
            dtype=dtype[i],
        )

        colunas_para_dropar = dataframe.columns[drop_columns[i]]
        dataframe = dataframe.drop(colunas_para_dropar, axis=1)

        dataframe.columns = columns[i]

        # Keep only complete rows that contain none of the placeholder markers.
        # The mask is built on the already-filtered frame so its index matches, and
        # .copy() makes the result an independent frame that is safe to assign into.
        dataframe_filtered = dataframe.dropna()
        mascara_indesejados = dataframe_filtered.isin(indesejados).any(axis=1)
        dataframe_filtered = dataframe_filtered[~mascara_indesejados].copy()

        # Positions 7 and 10 are divulgacao_brasil_ideb_2019 and divulgacao_regioes_ufs_ideb.
        if i in (7, 10):
            for label, pattern in network_label_patterns:
                matches = dataframe_filtered[school_type_column].str.contains(pattern, regex=True)
                dataframe_filtered.loc[matches, school_type_column] = label

        dataframe_filtered.to_sql(table_name, conn, index=False, if_exists='replace')

    print('Import completed')
finally:
    # Release the database file even when a spreadsheet fails to load.
    conn.close()

# Data Analysis

In [None]:
def display_df(df, query):
    """Print all rows of ``df`` as a PrettyTable under a header naming ``query``.

    Args:
        df: DataFrame to render; its column labels become the table header.
        query: Text shown in the "Results for query" heading.
    """
    rendered = PrettyTable()
    rendered.field_names = df.columns.tolist()
    for values in df.itertuples(index=False, name=None):
        rendered.add_row(values)
    # One print call: heading, blank line, table, then two blank lines.
    print(f"Results for query: {query}\n", rendered, "\n", sep="\n")

# Open the SQLite database filled by the import step
conn = sqlite3.connect(database_dir)

# View combining, per municipality and school network, the IDEB results of the three
# municipal tables with teacher wage figures from wage_per_city / wage_per_state.
# NOTE(review): the wage_per_city join only filters on "Escolaridade do docente" = 'Total';
# if that table holds one row per "Dependência administrativa", each IDEB row is presumably
# repeated once per dependence — confirm this is intended.
query = """
CREATE VIEW IDEB_related_to_wages AS
	SELECT 
	    daim.UF,
	    daim."Código do município",
	    daim."Nome do município",
	    daim.Rede,
	    daim."Resultados primeiro ao quinto ano",
	    daim."Resultados matematica" AS math_grades_1st_to_5st,
	    daim."Resultados portugues" AS portuguese_grades_1st_to_5st,
	    daim."Indicador de rendimento (P)" AS performance_ratio_1st_to_5st,
	    daim."Media padronizada (N)" AS standardized_average_1st_to_5st,
	    CAST(daim."IDEB (N x P)" AS FLOAT) AS IDEB_1st_to_5st,
	    dafm."Resultados sexto ao nono ano",
	    dafm."Resultados matematica" AS math_grades_6st_to_9st,
	    dafm."Resultados portugues" AS portuguese_grades_6st_to_9st,
	    dafm."Indicador de rendimento (P)" AS performance_ratio_6st_to_9st,
	    dafm."Media padronizada (N)" AS standardized_average_6st_to_9st,
	    CAST(dafm."IDEB (N x P)" AS FLOAT) AS IDEB_6st_to_9st,
	    demm."Resultados primeiro ao terceiro ano ensino medio",
	    demm."Resultados matematica" AS math_grades_high_school,
	    demm."Resultados portugues" AS portuguese_grades_high_school,
	    demm."Indicador de rendimento (P)" AS performance_ratio_high_school,
	    demm."Media padronizada (N)" AS standardized_average_high_school,
	    CAST(demm."IDEB (N x P)" AS FLOAT) AS IDEB_high_school ,
	    COALESCE(wps."Número de docentes Censo Escolar", wpc."Número de docentes Censo Escolar") AS teachers_quantity,
	    COALESCE(wps.Mediana, wpc.Mediana) AS wages_median,
	    COALESCE(wps.Média, wpc.Média) AS wages_average,
	    COALESCE(wps."Desvio padrão", wpc."Desvio padrão") AS wages_standard_deviation,
	    COALESCE(wps."Carga horária média semanal", wpc."Carga horária média semanal") AS teachers_weekly_working_hours_average,
	    CASE 
        	WHEN daim.Rede = 'Pública' 
        		THEN 
        			((COALESCE (CAST(wps."Remuneração média padronizada para 40h semanais em R$" AS FLOAT), 0) * COALESCE (CAST(wps."Número de docentes Censo Escolar" AS FLOAT), 0)) + 
        			(COALESCE (CAST(wpc."Remuneração média padronizada para 40h semanais em R$" AS FLOAT), 0) * COALESCE (CAST(wpc."Número de docentes Censo Escolar" AS FLOAT), 0))) /
        			(COALESCE (CAST(wps."Número de docentes Censo Escolar" AS FLOAT), 0) + COALESCE (CAST(wpc."Número de docentes Censo Escolar" AS FLOAT), 0))
    	    	ELSE CAST(COALESCE(wps."Remuneração média padronizada para 40h semanais em R$", wpc."Remuneração média padronizada para 40h semanais em R$") AS FLOAT)
	    END AS wages_for_40h_working_hours
		FROM divulgacao_anos_iniciais_municipios_2019 daim
		LEFT JOIN divulgacao_anos_finais_municipios_2019 dafm 
			ON daim."Código do município" = dafm."Código do município" AND daim.Rede = dafm.Rede
		LEFT JOIN divulgacao_ensino_medio_municipios_2019 demm
			ON demm."Código do município" = daim."Código do município" AND demm.Rede = daim.Rede
		LEFT JOIN wage_per_city wpc 
			ON wpc."Código do município" = daim."Código do município" AND wpc."Escolaridade do docente" = 'Total'
		LEFT JOIN wage_per_state wps
			ON daim.Rede = 'Estadual' AND wps.UF = daim.UF AND wps."Escolaridade do docente" = 'Total' AND wps."Dependência administrativa" = 'Estadual' 
		WHERE COALESCE(wps."Remuneração média padronizada para 40h semanais em R$", wpc."Remuneração média padronizada para 40h semanais em R$") IS NOT NULL 
"""

try:
    # Remove a view left by an earlier run: CREATE VIEW fails if the name already exists,
    # which made this cell impossible to execute twice against the same database.
    conn.execute("DROP VIEW IF EXISTS IDEB_related_to_wages")
    conn.execute(query)

    querySelectView = "SELECT * FROM IDEB_related_to_wages"
    # Load the whole view into a pandas DataFrame and print it
    df = pd.read_sql_query(querySelectView, conn)

    display_df(df, querySelectView)
finally:
    # Close the connection even if creating or reading the view fails
    conn.close()


## High School Analysis

### National High School Analysis

In [None]:
def getNationalData(nTiles=1, tilesToUse=None):
    """Return IDEB and wage figures for every non-federal row that has a high-school IDEB.

    Rows of the ``IDEB_related_to_wages`` view with a non-null ``IDEB_high_school`` are
    ranked by ``wages_for_40h_working_hours`` and split into ``nTiles`` groups with
    ``NTILE``. The 'Federal' network is removed after that split.

    Args:
        nTiles: Number of wage quantiles to split the rows into (default 1, i.e. no split).
        tilesToUse: Quantile numbers to keep, as ints or numeric strings.
            ``None`` or an empty sequence keeps every quantile.

    Returns:
        DataFrame with the columns "Nome do município", "UF", "Rede", the three IDEB
        columns, ``teachers_weekly_working_hours_average`` and ``wages_for_40h_working_hours``.

    Raises:
        ValueError: ``nTiles`` or an entry of ``tilesToUse`` is not an integer value.
    """
    # int() validates the inputs, so only a plain integer ever reaches the SQL text,
    # and callers may pass quantile numbers either as ints or as strings.
    n_tiles = int(nTiles)
    selected_tiles = [int(tile) for tile in (tilesToUse or [])]

    tiles_condition = ""
    if selected_tiles:
        # Quantile numbers are bound as parameters rather than spliced into the query.
        placeholders = ", ".join("?" * len(selected_tiles))
        tiles_condition = f"AND interval_position IN ({placeholders})"

    query = f"""
    WITH data AS (
        SELECT 
            *,
            NTILE({n_tiles}) OVER(ORDER BY wages_for_40h_working_hours) AS interval_position
        FROM (
            SELECT * from IDEB_related_to_wages irtw
            WHERE irtw.IDEB_high_school IS NOT NULL 
            )
    )
    SELECT 
        "Nome do município",
        "UF",
        Rede,
        IDEB_1st_to_5st,
        IDEB_6st_to_9st,
        IDEB_high_school,
        teachers_weekly_working_hours_average,
        wages_for_40h_working_hours
    FROM data
    WHERE Rede != 'Federal' {tiles_condition}
    """

    conn = sqlite3.connect(database_dir)
    try:
        return pd.read_sql_query(query, conn, params=selected_tiles or None)
    finally:
        # Always release the database file, including when the query fails.
        conn.close()

In [None]:
# Chart type first; the IDEB stage is only asked for when a scatter plot was chosen
plotToUse = int(input("Tipo de grafico [0 - Grafico de Dispersao, 1 - Mapa de calor]: "))

if plotToUse == 0:
    ideb = int(input("IDEB selecionado [1 - Ensino fundamental I; 2 - Ensino Fundamental 2; 3 - Ensino Medio]: "))
getBySubDivisions = input("Precisa de divisoes (quantis)? [apenas enter para pular] ")

# Optional quantile selection: keep reading quantile numbers until the user enters 0
nTiles, tilesToUse = 1, []
if getBySubDivisions:
    nTiles = int(input("Numero de divisoes (Quantis): "))
    while True:
        inputAnswer = int(input("Utilizar o quantil numero [0 para finalizar selecao de quantis]: "))
        if inputAnswer == 0:
            break
        tilesToUse.append(str(inputAnswer))

# Answer 1..3 from the IDEB prompt maps to the column at index answer-1
idebOptions = ["IDEB_1st_to_5st", "IDEB_6st_to_9st", "IDEB_high_school"]

# Only the numeric columns are kept for plotting
nationalFrame = getNationalData(nTiles, tilesToUse)
data = nationalFrame.drop(columns=['Nome do município', 'UF', 'Rede'])

if plotToUse == 1:
    # Correlation matrix, drawn as a heatmap and printed
    corrData = data
    corr = corrData.corr()
    sb.heatmap(corr)

    print(corr)
elif plotToUse == 0:
    data.plot.scatter(x="wages_for_40h_working_hours", y=idebOptions[ideb-1])

### Regional High School Analysis

In [None]:
def getDataframeByUFs(UFs, nTiles=1, tilesToUse=None):
    """Return IDEB and wage figures for the non-federal rows of the given states.

    Rows of the ``IDEB_related_to_wages`` view with a non-null ``IDEB_high_school`` are
    ranked by ``wages_for_40h_working_hours`` and split into ``nTiles`` groups with
    ``NTILE``. The split is computed over all states; the ``UFs`` and 'Federal' filters
    are applied afterwards.

    Args:
        UFs: Iterable of state codes to keep (e.g. ``['SP', 'RJ']``).
        nTiles: Number of wage quantiles to split the rows into (default 1, i.e. no split).
        tilesToUse: Quantile numbers to keep, as ints or numeric strings.
            ``None`` or an empty sequence keeps every quantile.

    Returns:
        DataFrame with the columns "Nome do município", "UF", "Rede", the three IDEB
        columns, ``teachers_weekly_working_hours_average`` and ``wages_for_40h_working_hours``.

    Raises:
        ValueError: ``nTiles`` or an entry of ``tilesToUse`` is not an integer value.
    """
    # int() validates the inputs, so only a plain integer ever reaches the SQL text,
    # and callers may pass quantile numbers either as ints or as strings.
    n_tiles = int(nTiles)
    selected_tiles = [int(tile) for tile in (tilesToUse or [])]
    selected_ufs = list(UFs)

    # State codes and quantile numbers are bound as parameters, never spliced into the query.
    uf_placeholders = ", ".join("?" * len(selected_ufs))
    tiles_condition = ""
    if selected_tiles:
        tile_placeholders = ", ".join("?" * len(selected_tiles))
        tiles_condition = f"AND interval_position IN ({tile_placeholders})"

    query = f"""
    WITH data AS (
        SELECT 
            *,
            NTILE({n_tiles}) OVER(ORDER BY wages_for_40h_working_hours) AS interval_position
        FROM (
            SELECT * from IDEB_related_to_wages irtw
            WHERE irtw.IDEB_high_school IS NOT NULL 
            )
    )
    SELECT 
        "Nome do município",
        "UF",
        Rede,
        IDEB_1st_to_5st,
        IDEB_6st_to_9st,
        IDEB_high_school,
        teachers_weekly_working_hours_average,
        wages_for_40h_working_hours
    FROM data
    WHERE Rede != 'Federal' 
    AND UF IN ({uf_placeholders}) {tiles_condition}
    """

    # Parameter order follows placeholder order: state codes first, then quantile numbers.
    params = selected_ufs + selected_tiles

    conn = sqlite3.connect(database_dir)
    try:
        return pd.read_sql_query(query, conn, params=params or None)
    finally:
        # Always release the database file, including when the query fails.
        conn.close()

#### Custom Plot (Gráfico de Dispersão ou Mapa de calor)

In [None]:
# Chart type and region first; the IDEB stage is only asked for when a scatter plot was chosen
plotToUse = int(input("Tipo de grafico [0 - Grafico de Dispersao, 1 - Mapa de calor]: "))

regionAnswer = input("Regiao a ser analisada [Norte, Nordeste, Centro-Oeste, Sudeste ou Sul]: ")
region = regionAnswer.lower()
if plotToUse == 0:
    ideb = int(input("IDEB selecionado [1 - Ensino fundamental I; 2 - Ensino Fundamental 2; 3 - Ensino Medio]: "))
getBySubDivisions = input("Precisa de divisoes (quantis)? [apenas enter para pular] ")

# Optional quantile selection: keep reading quantile numbers until the user enters 0
nTiles, tilesToUse = 1, []
if getBySubDivisions:
    nTiles = int(input("Numero de divisoes (Quantis): "))
    while True:
        inputAnswer = int(input("Utilizar o quantil numero [0 para finalizar selecao de quantis]: "))
        if inputAnswer == 0:
            break
        tilesToUse.append(str(inputAnswer))

# State codes of each region, keyed by the lower-cased region name
regions = {
    "norte": ['AC', 'AP', 'AM', 'PA', 'RO', 'RR', 'TO'],
    "nordeste": ["AL", "BA", "CE", "MA", "PB", "PE", "PI", "RN", "SE"],
    "centro-oeste": ["DF", "GO", "MT", "MS"],
    "sudeste": ["MG", "ES", "SP", "RJ"],
    "sul": ["PR", "SC", "RS"]
}

# Answer 1..3 from the IDEB prompt maps to the column at index answer-1
idebOptions = ["IDEB_1st_to_5st", "IDEB_6st_to_9st", "IDEB_high_school"]

# Only the numeric columns are kept for plotting
regionFrame = getDataframeByUFs(regions[region], nTiles, tilesToUse)
data = regionFrame.drop(columns=['Nome do município', 'UF', 'Rede'])

if plotToUse == 1:
    # Correlation matrix, drawn as a heatmap and printed
    corrData = data
    corr = corrData.corr()
    sb.heatmap(corr)

    print(corr)
elif plotToUse == 0:
    data.plot.scatter(x="wages_for_40h_working_hours", y=idebOptions[ideb-1])

#### Norte

In [None]:
# Correlation matrix of the numeric columns for the North-region states
north_ufs = ['AC', 'AP', 'AM', 'PA', 'RO', 'RR', 'TO']
label_columns = ['Nome do município', 'UF', 'Rede']
northData = getDataframeByUFs(north_ufs).drop(columns=label_columns)
corr = northData.corr()
sb.heatmap(corr)

print(corr)

#### Nordeste

In [None]:
# Correlation matrix of the numeric columns for the Northeast-region states
northeast_ufs = ["AL", "BA", "CE", "MA", "PB", "PE", "PI", "RN", "SE"]
label_columns = ['Nome do município', 'UF', 'Rede']
northeastData = getDataframeByUFs(northeast_ufs).drop(columns=label_columns)
corr = northeastData.corr()
sb.heatmap(corr)

print(corr)

#### Centro-Oeste

In [None]:
# Correlation matrix of the numeric columns for the Center-West-region states
midwest_ufs = ["DF", "GO", "MT", "MS"]
label_columns = ['Nome do município', 'UF', 'Rede']
midWest = getDataframeByUFs(midwest_ufs).drop(columns=label_columns)
corr = midWest.corr()
sb.heatmap(corr)

print(corr)

#### Sudeste

In [None]:
# Correlation matrix of the numeric columns for the Southeast-region states
southeast_ufs = ["MG", "ES", "SP", "RJ"]
label_columns = ['Nome do município', 'UF', 'Rede']
southeast = getDataframeByUFs(southeast_ufs).drop(columns=label_columns)
corr = southeast.corr()
sb.heatmap(corr)

print(corr)

#### Sul

In [None]:
# Correlation matrix of the numeric columns for the South-region states
south_ufs = ["PR", "SC", "RS"]
label_columns = ['Nome do município', 'UF', 'Rede']
south = getDataframeByUFs(south_ufs).drop(columns=label_columns)
corr = south.corr()
sb.heatmap(corr)

print(corr)

### By State Analysis

In [None]:
def getDataframeByUF(UF, nTiles=1, tilesToUse=None):
    """Load the IDEB/wage rows of a single state (UF) from the SQLite database.

    Rows of ``IDEB_related_to_wages`` with a non-null ``IDEB_high_school`` are
    split into ``nTiles`` buckets ordered by ``wages_for_40h_working_hours``
    (SQL ``NTILE``). The bucketing runs over every such row of the table,
    *before* the UF filter is applied, so bucket numbers are relative to the
    whole table and not to the selected state.

    Args:
        UF: State code compared for equality with the ``UF`` column
            (e.g. ``"SP"``). Passed to SQLite as a bound parameter.
        nTiles: Number of wage buckets; coerced with ``int``.
        tilesToUse: Optional iterable of bucket numbers (ints or numeric
            strings) to keep. ``None`` or an empty iterable keeps all buckets.

    Returns:
        pandas.DataFrame with the columns ``Nome do município``, ``UF``,
        ``Rede``, the three IDEB columns,
        ``teachers_weekly_working_hours_average`` and
        ``wages_for_40h_working_hours``; rows whose ``Rede`` is ``'Federal'``
        are excluded.

    Raises:
        ValueError, TypeError: if ``nTiles`` or an entry of ``tilesToUse``
            cannot be converted with ``int``.
    """
    if tilesToUse is None:
        tilesToUse = []

    # nTiles is spliced into the SQL text, so force it to a plain integer
    # first; the remaining user-controlled values travel as bound parameters.
    tile_count = int(nTiles)
    selected_tiles = [int(tile) for tile in tilesToUse]

    tiles_condition = ""
    if selected_tiles:
        placeholders = ", ".join(["?"] * len(selected_tiles))
        tiles_condition = f"AND interval_position IN ({placeholders})"

    query = f"""
    WITH data AS (
        SELECT
            *,
            NTILE({tile_count}) OVER(ORDER BY wages_for_40h_working_hours) AS interval_position
        FROM (
            SELECT * from IDEB_related_to_wages irtw
            WHERE irtw.IDEB_high_school IS NOT NULL
            )
    )
    SELECT
        "Nome do município",
        "UF",
        Rede,
        IDEB_1st_to_5st,
        IDEB_6st_to_9st,
        IDEB_high_school,
        teachers_weekly_working_hours_average,
        wages_for_40h_working_hours
    FROM data
    WHERE Rede != 'Federal'
    AND UF = ? {tiles_condition}
    """

    conn = sqlite3.connect(database_dir)
    try:
        # Placeholder order in the query: UF first, then the tile numbers.
        data = pd.read_sql_query(query, conn, params=[UF, *selected_tiles])
    finally:
        # Release the database handle even when the query fails.
        conn.close()

    return data

#### Custom Plot (Gráfico de Dispersão ou Mapa de calor)

In [None]:
# Interactive single-state plot. The prompts pick the chart type, the state,
# the IDEB column (scatter plot only) and, optionally, which wage quantiles
# to keep.
idebOptions = ["IDEB_1st_to_5st", "IDEB_6st_to_9st", "IDEB_high_school"]

plotToUse = int(input("Tipo de grafico [0 - Grafico de Dispersao, 1 - Mapa de calor]: "))

# The query matches UF by exact equality and the notebook uses upper-case
# two-letter codes everywhere, so normalise what the user typed.
state = input("Estado a ser analisado: ").strip().upper()
if plotToUse == 0:
    ideb = int(input("IDEB selecionado [1 - Ensino fundamental I; 2 - Ensino Fundamental 2; 3 - Ensino Medio]: "))
    # Without this check, 0 or a negative number would silently wrap around
    # through negative list indexing and plot an unintended column.
    if not 1 <= ideb <= len(idebOptions):
        raise ValueError(f"IDEB invalido: {ideb} (esperado de 1 a {len(idebOptions)})")

getBySubDivisions = input("Precisa de divisoes (quantis)? [apenas enter para pular] ")

nTiles = 1
tilesToUse = []
if getBySubDivisions:
    nTiles = int(input("Numero de divisoes (Quantis): "))
    # Keep asking for quantile numbers until the user enters the 0 sentinel.
    inputAnswer = int(input("Utilizar o quantil numero [0 para finalizar selecao de quantis]: "))
    while inputAnswer != 0:
        tilesToUse.append(str(inputAnswer))
        inputAnswer = int(input("Utilizar o quantil numero [0 para finalizar selecao de quantis]: "))

# Drop the label columns so only the indicator columns remain for plotting.
data = getDataframeByUF(state, nTiles, tilesToUse).drop(columns=['Nome do município', 'UF', 'Rede'])

if data.empty:
    # An unknown state code or an empty quantile selection yields no rows;
    # say so instead of drawing a blank chart.
    print(f"Nenhum dado encontrado para o estado '{state}' com os filtros informados.")
elif plotToUse == 0:
    data.plot.scatter(x="wages_for_40h_working_hours", y=idebOptions[ideb-1])
elif plotToUse == 1:
    # Correlations
    corrData = data
    corr = corrData.corr()
    sb.heatmap(corr)

    print(corr)

#### Acre

In [None]:
# Acre (AC): correlation matrix of the columns left after dropping the
# labels, drawn as a heatmap and printed as a table.
acre = (
    getDataframeByUF("AC")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = acre.corr()
sb.heatmap(corr)

print(corr)

#### Alagoas

In [None]:
# Alagoas (AL): correlation matrix of the columns left after dropping the
# labels, drawn as a heatmap and printed as a table.
alagoas = (
    getDataframeByUF("AL")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = alagoas.corr()
sb.heatmap(corr)

print(corr)

#### Amapá

In [None]:
# Amapá (AP): correlation matrix of the columns left after dropping the
# labels, drawn as a heatmap and printed as a table.
amapa = (
    getDataframeByUF("AP")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = amapa.corr()
sb.heatmap(corr)

print(corr)

#### Amazonas

In [None]:
# Amazonas (AM): correlation matrix of the columns left after dropping the
# labels, drawn as a heatmap and printed as a table.
amazonas = (
    getDataframeByUF("AM")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = amazonas.corr()
sb.heatmap(corr)

print(corr)

#### Bahia

In [None]:
# Bahia (BA): correlation matrix of the columns left after dropping the
# labels, drawn as a heatmap and printed as a table.
bahia = (
    getDataframeByUF("BA")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = bahia.corr()
sb.heatmap(corr)

print(corr)

#### Ceará

In [None]:
# Ceará (CE): correlation matrix of the columns left after dropping the
# labels, drawn as a heatmap and printed as a table.
ceara = (
    getDataframeByUF("CE")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = ceara.corr()
sb.heatmap(corr)

print(corr)

#### Distrito Federal

In [None]:
# Distrito Federal (DF): correlation matrix of the columns left after
# dropping the labels, drawn as a heatmap and printed as a table.
# Here `df` names the federative unit, not a generic DataFrame.
df = (
    getDataframeByUF("DF")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = df.corr()
sb.heatmap(corr)

print(corr)

#### Espírito Santo

In [None]:
# Espírito Santo (ES): correlation matrix of the columns left after dropping
# the labels, drawn as a heatmap and printed as a table.
espirito_santo = (
    getDataframeByUF("ES")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = espirito_santo.corr()
sb.heatmap(corr)

print(corr)

#### Goiás

In [None]:
# Goiás (GO): correlation matrix of the columns left after dropping the
# labels, drawn as a heatmap and printed as a table.
goias = (
    getDataframeByUF("GO")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = goias.corr()
sb.heatmap(corr)

print(corr)

#### Maranhão

In [None]:
# Maranhão (MA): correlation matrix of the columns left after dropping the
# labels, drawn as a heatmap and printed as a table.
maranhao = (
    getDataframeByUF("MA")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = maranhao.corr()
sb.heatmap(corr)

print(corr)

#### Mato Grosso

In [None]:
# Mato Grosso (MT): correlation matrix of the columns left after dropping
# the labels, drawn as a heatmap and printed as a table.
mato_grosso = (
    getDataframeByUF("MT")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = mato_grosso.corr()
sb.heatmap(corr)

print(corr)

#### Mato Grosso do Sul

In [None]:
# Mato Grosso do Sul (MS): correlation matrix of the columns left after
# dropping the labels, drawn as a heatmap and printed as a table.
mato_grosso_sul = (
    getDataframeByUF("MS")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = mato_grosso_sul.corr()
sb.heatmap(corr)

print(corr)

#### Minas Gerais

In [None]:
# Minas Gerais (MG): correlation matrix of the columns left after dropping
# the labels, drawn as a heatmap and printed as a table.
minas_gerais = (
    getDataframeByUF("MG")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = minas_gerais.corr()
sb.heatmap(corr)

print(corr)

#### Pará

In [None]:
# Pará (PA): correlation matrix of the columns left after dropping the
# labels, drawn as a heatmap and printed as a table.
para = (
    getDataframeByUF("PA")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = para.corr()
sb.heatmap(corr)

print(corr)

#### Paraíba

In [None]:
# Paraíba (PB): correlation matrix of the columns left after dropping the
# labels, drawn as a heatmap and printed as a table.
paraiba = (
    getDataframeByUF("PB")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = paraiba.corr()
sb.heatmap(corr)

print(corr)

#### Paraná

In [None]:
# Paraná (PR): correlation matrix of the columns left after dropping the
# labels, drawn as a heatmap and printed as a table.
parana = (
    getDataframeByUF("PR")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = parana.corr()
sb.heatmap(corr)

print(corr)

#### Pernambuco

In [None]:
# Pernambuco (PE): correlation matrix of the columns left after dropping the
# labels, drawn as a heatmap and printed as a table.
pernambuco = (
    getDataframeByUF("PE")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = pernambuco.corr()
sb.heatmap(corr)

print(corr)

#### Piauí

In [None]:
# Piauí (PI): correlation matrix of the columns left after dropping the
# labels, drawn as a heatmap and printed as a table.
piaui = (
    getDataframeByUF("PI")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = piaui.corr()
sb.heatmap(corr)

print(corr)

#### Rio de Janeiro

In [None]:
# Rio de Janeiro (RJ): correlation matrix of the columns left after dropping
# the labels, drawn as a heatmap and printed as a table.
rio_janeiro = (
    getDataframeByUF("RJ")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = rio_janeiro.corr()
sb.heatmap(corr)

print(corr)

#### Rio Grande do Norte

In [None]:
# Rio Grande do Norte (RN): correlation matrix of the columns left after
# dropping the labels, drawn as a heatmap and printed as a table.
rio_grande_norte = (
    getDataframeByUF("RN")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = rio_grande_norte.corr()
sb.heatmap(corr)

print(corr)

#### Rio Grande do Sul

In [None]:
# Rio Grande do Sul (RS): correlation matrix of the columns left after
# dropping the labels, drawn as a heatmap and printed as a table.
rio_grande_sul = (
    getDataframeByUF("RS")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = rio_grande_sul.corr()
sb.heatmap(corr)

print(corr)

#### Rondônia

In [None]:
# Rondônia (RO): correlation matrix of the columns left after dropping the
# labels, drawn as a heatmap and printed as a table.
rondonia = (
    getDataframeByUF("RO")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = rondonia.corr()
sb.heatmap(corr)

print(corr)

#### Roraima

In [None]:
# Roraima (RR): correlation matrix of the columns left after dropping the
# labels, drawn as a heatmap and printed as a table.
roraima = (
    getDataframeByUF("RR")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = roraima.corr()
sb.heatmap(corr)

print(corr)

#### Santa Catarina

In [None]:
# Santa Catarina (SC): correlation matrix of the columns left after dropping
# the labels, drawn as a heatmap and printed as a table.
santa_catarina = (
    getDataframeByUF("SC")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = santa_catarina.corr()
sb.heatmap(corr)

print(corr)

#### São Paulo

In [None]:
# São Paulo (SP): correlation matrix of the columns left after dropping the
# labels, drawn as a heatmap and printed as a table.
sao_paulo = (
    getDataframeByUF("SP")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = sao_paulo.corr()
sb.heatmap(corr)

print(corr)

#### Sergipe

In [None]:
# Sergipe (SE): correlation matrix of the columns left after dropping the
# labels, drawn as a heatmap and printed as a table.
sergipe = (
    getDataframeByUF("SE")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = sergipe.corr()
sb.heatmap(corr)

print(corr)

#### Tocantins

In [None]:
# Tocantins (TO): correlation matrix of the columns left after dropping the
# labels, drawn as a heatmap and printed as a table.
tocantins = (
    getDataframeByUF("TO")
    .drop(columns=["Nome do município", "UF", "Rede"])
)
corr = tocantins.corr()
sb.heatmap(corr)

print(corr)