In [None]:
import os
import shutil
import zipfile
import sys
import requests
import concurrent.futures
import urllib3
import sqlite3
import pandas as pd
from prettytable import PrettyTable
import seaborn as sb

# Working locations shared by every cell of the notebook.
download_dir = "./tmp/"               # where the ZIP archives are downloaded
data_dir = "./data/"                  # root for everything derived from them
extract_dir = "./data/raw/"           # where the spreadsheets are unpacked
database_dir = "./data/database.db"   # path of the SQLite database file

# Download Data

In [None]:
# Silence the SSL-certificate warnings (downloads are made without verification)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Archives to fetch: three teacher-remuneration files followed by eight IDEB files.
urls = [
  'https://download.inep.gov.br/informacoes_estatisticas/indicadores_educacionais/2019/remuneracao_media_docentes/remuneracao_docentes_brasil_2019.zip',
  'https://download.inep.gov.br/informacoes_estatisticas/indicadores_educacionais/2019/remuneracao_media_docentes/remuneracao_docentes_uf_2019.zip',
  'https://download.inep.gov.br/informacoes_estatisticas/indicadores_educacionais/2019/remuneracao_media_docentes/remuneracao_docentes_municipios_2019.zip',
  'https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2019/divulgacao_brasil_ideb_2019.zip',
  'https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2019/divulgacao_regioes_ufs_ideb_2019.zip',
  'https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2019/divulgacao_anos_iniciais_municipios_2019.zip',
  'https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2019/divulgacao_anos_finais_municipios_2019.zip',
  'https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2019/divulgacao_ensino_medio_municipios_2019.zip',
  'https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2019/divulgacao_anos_iniciais_escolas_2019.zip',
  'https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2019/divulgacao_anos_finais_escolas_2019.zip',
  'https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2019/divulgacao_ensino_medio_escolas_2019.zip'
]

# Start from a clean slate: wipe and recreate each working directory, in this
# order (extract_dir lives inside data_dir, so it is recreated last).
for working_dir in (download_dir, data_dir, extract_dir):
  if os.path.exists(working_dir):
    shutil.rmtree(working_dir)
  os.makedirs(working_dir, exist_ok=True)

def download_file(url):
    """Download *url* into ``download_dir`` and return the local file path.

    Inside Google Colab the transfer is delegated to ``wget``; elsewhere the
    file is streamed with ``requests``. Certificate verification is disabled
    in both cases.

    Raises:
        subprocess.CalledProcessError: if ``wget`` exits with an error.
        requests.HTTPError: if the server answers with an error status.
    """
    filename = os.path.join(download_dir, os.path.basename(url))
    if 'google.colab' in sys.modules:
        # Plain-Python equivalent of the notebook shell escape; check=True
        # turns a failed transfer into an exception the caller can report.
        import subprocess
        subprocess.run(
            ['wget', '-P', download_dir, '--no-check-certificate', url],
            check=True,
        )
    else:
        with requests.get(url, stream=True, verify=False) as r:
            # Without this an HTML error page would be saved as a ".zip".
            r.raise_for_status()
            with open(filename, 'wb') as f:
                shutil.copyfileobj(r.raw, f)
    # Returned on both paths so the caller never prints "None".
    return filename

if __name__ == "__main__":
    # Fetch every archive in parallel. A failed download is reported and the
    # remaining ones carry on.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_url = {executor.submit(download_file, url): url for url in urls}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                filename = future.result()
                print(f"Downloaded {url} to {filename}")
            except Exception as e:
                print(f"Failed to download {url}: {e}")

    # Stage the downloads next to where the spreadsheets will be unpacked.
    for filename in os.listdir(download_dir):
        shutil.move(os.path.join(download_dir, filename), os.path.join(extract_dir, filename))

    # Unpack every .xlsx member flat into extract_dir, then discard the archive.
    for filename in os.listdir(extract_dir):
        if not filename.endswith(".zip"):
            print(f'The file {filename} is not a valid ZIP file')
            continue

        zip_path = os.path.join(extract_dir, filename)
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                found = False
                for file_name in zip_ref.namelist():
                    if not file_name.endswith(".xlsx"):
                        continue
                    dest_path = os.path.join(extract_dir, os.path.basename(file_name))
                    with zip_ref.open(file_name) as zip_file, open(dest_path, 'wb') as extracted_file:
                        # Stream the member instead of loading it fully in memory.
                        shutil.copyfileobj(zip_file, extracted_file)
                    print(f'File {file_name} extracted to {extract_dir}')
                    found = True

                if not found:
                    print(f'Error: No .xlsx files found in the ZIP file {zip_path}')
        except zipfile.BadZipFile as e:
            # A truncated or non-ZIP download must not abort the other archives.
            print(f'Error: {zip_path} could not be read as a ZIP file: {e}')

        # The archive is removed either way so extract_dir ends up holding
        # only extracted spreadsheets.
        os.remove(zip_path)
        print(f'ZIP file {zip_path} removed')


# Database creation

In [None]:
# Names given to the spreadsheet columns once they are loaded into SQLite.

# Teacher-remuneration tables
region_column = 'region'
state_column = 'UF'
city_id_column = 'city_id'
city_name_column = 'city_name'
adm_dependence_column = 'adm_dependence'
education_column = 'teacher_education'
teacher_number_column = 'teacher_quantity'
rais_teacher_percentage_column = 'rais_teacher_percentage'
first_quartile_column = 'first_quartile'
median_column = 'median'
average_column = 'average'
# A trailing comma here used to turn this name into a one-element tuple.
third_quartile_column = 'third_quartile'
standard_deviation_column = 'standard_deviation'
weekly_workload_column = 'weekly_workload'
standardized_pay_40_hours_column = 'standardized_pay_40_hours'

state_or_region = 'state_or_region'

# School identification
school_id_column = 'school_id'
school_name_column = 'school_name'
school_type_column = 'school_type'

# Early years (1st to 5th grade)
first_to_fifth_grade_results_column = '1st_to_5th_grade_results'
first_grade_results_column = '1st_grade_results'
second_grade_results_column = '2nd_grade_results'
third_grade_results_column = '3rd_grade_results'
fourth_grade_results_column = '4th_grade_results'
fifth_grade_results_column = '5th_grade_results'

# Final years (6th to 9th grade)
sixth_to_nineth_grade_results_column = '6th_to_9th_grade_results'
sixth_grade_results_column = '6th_grade_results'
seventh_grade_results_column = '7th_grade_results'
eighth_grade_results_column = '8th_grade_results'
ninth_grade_results_column = '9th_grade_results'

# High school (1st to 3rd grade). The per-grade names repeat the early-years
# ones; the two sets are never used in the same table.
first_to_third_high_grade_results_column = '1st_to_3rd_grade_results'
first_high_grade_results_column = '1st_grade_results'
second_high_grade_results_column = '2nd_grade_results'
third_high_grade_results_column = '3rd_grade_results'

yield_indicator_column = 'yield_indicator' # P
math_results_column = 'math_results'
portuguese_results_column = 'portuguese_results'
standardized_average_column = 'standardized_average' # N
ideb_column = 'IDEB_N_x_P'


# List the extracted files in alphabetical order, so that position i lines up
# with position i of the per-table settings defined below.
extracted_files = sorted(os.listdir(extract_dir))

# Open the SQLite database (the file at database_dir is created if missing)
conn = sqlite3.connect(database_dir)


# Reusable column groups; each table layout below is a concatenation of these.
_wage_stats = [
    adm_dependence_column, education_column, teacher_number_column,
    rais_teacher_percentage_column, first_quartile_column, median_column,
    average_column, third_quartile_column, standard_deviation_column,
    weekly_workload_column, standardized_pay_40_hours_column,
]
_city = [state_column, city_id_column, city_name_column]
_school = [school_id_column, school_name_column]
_kind = [school_type_column]
_early_years = [
    first_to_fifth_grade_results_column, first_grade_results_column,
    second_grade_results_column, third_grade_results_column,
    fourth_grade_results_column, fifth_grade_results_column,
]
_final_years = [
    sixth_to_nineth_grade_results_column, sixth_grade_results_column,
    seventh_grade_results_column, eighth_grade_results_column,
    ninth_grade_results_column,
]
_high_school = [
    first_to_third_high_grade_results_column, first_high_grade_results_column,
    second_high_grade_results_column, third_high_grade_results_column,
]
_scores = [
    yield_indicator_column, math_results_column, portuguese_results_column,
    standardized_average_column, ideb_column,
]

# Column names assigned to each imported table, in import order.
columns = [
    list(_wage_stats),                                                          # Brasil columns
    [region_column, state_column, city_id_column, city_name_column] + _wage_stats,  # Municipios columns
    [region_column, state_column] + _wage_stats,                                # UFs columns
    _city + _school + _kind + _final_years + _scores,
    _city + _kind + _final_years + _scores,
    _city + _school + _kind + _early_years + _scores,
    _city + _kind + _early_years + _scores,
    _kind + _early_years + _scores,
    _city + _school + _kind + _high_school + _scores,
    _city + _kind + _high_school + _scores,
    [state_or_region] + _kind + _early_years + _scores,
]

# Key columns of each imported table, in import order.
_school_key = [school_id_column]
_city_key = [city_id_column, school_type_column]

primary_keys = [
    [adm_dependence_column, education_column],
    [city_id_column, adm_dependence_column, education_column],
    [state_column, adm_dependence_column, education_column],
    list(_school_key),
    list(_city_key),
    list(_school_key),
    list(_city_key),
    [school_type_column],
    list(_school_key),
    list(_city_key),
    [state_or_region, school_type_column],
]

# Column types per table, derived from `columns` so the two can never drift
# apart. The hand-written version listed high-school columns for the last
# table, whose layout in `columns` is the early-years one.
# NOTE(review): the import loop reads with header=None, so the labels are
# positional integers at read time; confirm whether these name-keyed dtypes
# are ever applied by pandas.
_string_columns = {
    region_column, state_column, city_name_column, adm_dependence_column,
    education_column, state_or_region, school_name_column, school_type_column,
}
_integer_columns = {city_id_column, school_id_column, teacher_number_column}


def _column_dtype(name):
    """Return the pandas dtype string used for the column called *name*."""
    if name in _string_columns:
        return 'string'
    if name in _integer_columns:
        return 'int32'
    # Every remaining column is a measurement.
    return 'float32'


dtype = [
    {name: _column_dtype(name) for name in table_columns}
    for table_columns in columns
]

# SQLite table created for each imported file, in import order.
table_names = [
    "wage_brasil",                                # 0
    "wage_per_city",                              # 1
    "wage_per_state",                             # 2
    "primary_schools_final_years",                # 3
    "cities_primary_schools_final_years",         # 4
    "primary_schools_early_years",                # 5
    "cities_primary_schools_early_years",         # 6
    "brasil_primary_schools_early_years",         # 7
    "high_schools",                               # 8
    "cities_high_schools",                        # 9
    "states_region_primary_schools_early_years",  # 10
]


# Per table (import order): inclusive ranges of Excel column letters that are
# dropped from the sheet before the remaining columns are renamed.
intervals = [
    [("A", "B")],
    [("A", "A")],
    [("A", "A")],
    [("G", "AV"), ("BC", "BW"), ("CA", "CG"), ("CI", "CP")],
    [("E", "AT"), ("BA", "BU"), ("BY", "CE"), ("CG", "CN")],
    [("G", "BC"), ("BK", "CE"), ("CI", "CO"), ("CQ", "CX")],
    [("E", "BA"), ("BI", "CC"), ("CG", "CM"), ("CO", "CV")],
    [("A", "A"), ("C", "AY"), ("BG", "CA"), ("CE", "CK"), ("CM", "CT")],
    [("G", "L"), ("Q", "Q"), ("S", "U"), ("Y", "Y"), ("AA", "AB")],
    [("E", "J"), ("O", "O"), ("Q", "S"), ("W", "W"), ("Y", "Z")],
    [("C", "AY"), ("BG", "CA"), ("CE", "CK"), ("CM", "CT")],
]

# Convert an Excel column letter into a zero-based column index
def char_to_number(letter):
    """Return the zero-based index of the Excel column labelled *letter*.

    'A' -> 0, 'Z' -> 25, 'AA' -> 26. Labels are case-insensitive and
    surrounding whitespace is ignored.

    Raises:
        ValueError: if the label is empty or holds anything other than A-Z.
    """
    label = letter.strip().upper()
    if not label or not all('A' <= ch <= 'Z' for ch in label):
        # An empty label used to yield -1, which silently selects the LAST column.
        raise ValueError(f"Invalid Excel column label: {letter!r}")

    number = 0
    for ch in label:
        # Base-26 with digits 1..26 (there is no zero digit in column labels).
        number = number * 26 + (ord(ch) - ord('A') + 1)
    return number - 1

# Positional indices of the columns to drop, one list per table: every
# (first, last) letter range in `intervals` is expanded to its inclusive run
# of zero-based column positions.
drop_columns = [
    [
        position
        for first, last in letter_ranges
        for position in range(char_to_number(first), char_to_number(last) + 1)
    ]
    for letter_ranges in intervals
]

# [rows skipped at the top, rows skipped at the bottom] for each table:
# the three wage sheets, seven IDEB sheets, and the final IDEB sheet whose
# footer is longer.
skip_rows = (
    [[9, 3] for _ in range(3)]
    + [[10, 3] for _ in range(7)]
    + [[10, 6]]
)

# A row containing any of these cell values is discarded during the import.
unwanted = ["a", "c", "d", "-"]

# Load every extracted spreadsheet into its own SQLite table. The connection
# is closed even when one of the imports fails.
try:
    for i, table_name in enumerate(table_names):
        dataframe = pd.read_excel(
            os.path.join(extract_dir, extracted_files[i]),
            skiprows=skip_rows[i][0],
            header=None,
            skipfooter=skip_rows[i][1],
            dtype=dtype[i],
        )

        # Remove the columns that are not imported, then name the remaining ones.
        columns_to_drop = dataframe.columns[drop_columns[i]]
        dataframe = dataframe.drop(columns_to_drop, axis=1)
        dataframe.columns = columns[i]

        # Keep only complete rows free of placeholder markers. The mask is
        # built on the already-filtered frame so its index matches the rows
        # it selects, and .copy() gives an independent frame to edit below.
        dataframe_filtered = dataframe.dropna()
        mask_unwanted = dataframe_filtered.isin(unwanted).any(axis=1)
        dataframe_filtered = dataframe_filtered[~mask_unwanted].copy()

        if i in (7, 10):
            # Strip footnote markers such as "Total (1)(2)" from the school type.
            for label in ('Privada', 'Estadual', 'Pública', 'Municipal', 'Total'):
                has_note = dataframe_filtered[school_type_column].str.contains(
                    rf'{label} \(\d\)', regex=True
                )
                dataframe_filtered.loc[has_note, school_type_column] = label

        if i in (0, 2):
            # Remove the commas from the standardized pay values.
            dataframe_filtered[standardized_pay_40_hours_column] = (
                dataframe_filtered[standardized_pay_40_hours_column]
                .astype(str)
                .str.replace(',', '', regex=False)
            )

        dataframe_filtered.to_sql(table_name, conn, index=False, if_exists='replace')

        # Only re-indexes the in-memory frame; the SQLite table written above
        # is not affected by this call.
        dataframe_filtered.set_index(primary_keys[i], inplace=True)

        print(f'{table_name} table created')

    print('Import completed')
finally:
    conn.close()

# Data Analysis

In [None]:
def display_df(df, query):
    """Print *df* as a text table, preceded by the *query* that produced it."""
    table = PrettyTable(df.columns.tolist())
    for record in df.itertuples(index=False, name=None):
        table.add_row(record)
    # Header, blank line, table, then two blank lines.
    print(f"Results for query: {query}\n", table, "\n", sep="\n")

# Establish a connection to the SQLite database
conn = sqlite3.connect(database_dir)

# View joining, per city and school type, the IDEB results of the three school
# stages with the teacher wage figures (state-level wages for 'Estadual' rows,
# city-level wages otherwise).
query = """
CREATE VIEW IDEB_related_to_wages AS
	SELECT 
	    cpsey.UF,
	    cpsey.city_id,
	    cpsey.city_name,
	    cpsey.school_type,
	    cpsey."1st_to_5th_grade_results",
	    cpsey.math_results AS math_grades_1st_to_5st,
	    cpsey.portuguese_results AS portuguese_grades_1st_to_5st,
	    cpsey.yield_indicator AS performance_ratio_1st_to_5st,
	    cpsey.standardized_average AS standardized_average_1st_to_5st,
	    CAST(cpsey.IDEB_N_x_P AS FLOAT) AS IDEB_1st_to_5st,
	    cpsfy."6th_to_9th_grade_results",
	    cpsfy.math_results AS math_grades_6st_to_9st,
	    cpsfy.portuguese_results AS portuguese_grades_6st_to_9st,
	    cpsfy.yield_indicator AS performance_ratio_6st_to_9st,
	    cpsfy.standardized_average AS standardized_average_6st_to_9st,
	    CAST(cpsfy.IDEB_N_x_P AS FLOAT) AS IDEB_6st_to_9st,
	    chs."1st_to_3rd_grade_results",
	    chs.math_results AS math_grades_high_school,
	    chs.portuguese_results AS portuguese_grades_high_school,
	    chs.yield_indicator AS performance_ratio_high_school,
	    chs.standardized_average AS standardized_average_high_school,
	    CAST(chs.IDEB_N_x_P AS FLOAT) AS IDEB_high_school ,
	    COALESCE(wps.teacher_quantity, wpc.teacher_quantity) AS teachers_quantity,
	    COALESCE(wps.median, wpc.median) AS wages_median,
	    COALESCE(wps.average, wpc.average) AS wages_average,
	    COALESCE(wps.standard_deviation, wpc.standard_deviation) AS wages_standard_deviation,
	    CAST(COALESCE(wps.weekly_workload, wpc.weekly_workload) AS FLOAT) AS teachers_weekly_working_hours_average,
	    CASE 
        	WHEN cpsey.school_type = 'Pública' 
        		THEN 
        			((COALESCE (CAST(wps.standardized_pay_40_hours AS FLOAT), 0) * COALESCE (CAST(wps.teacher_quantity AS FLOAT), 0)) + 
        			(COALESCE (CAST(wpc.standardized_pay_40_hours AS FLOAT), 0) * COALESCE (CAST(wpc.teacher_quantity AS FLOAT), 0))) /
        			(COALESCE (CAST(wps.teacher_quantity AS FLOAT), 0) + COALESCE (CAST(wpc.teacher_quantity AS FLOAT), 0))
    	    	ELSE CAST(COALESCE(wps.standardized_pay_40_hours, wpc.standardized_pay_40_hours) AS FLOAT)
	    END AS "wages_for_40h_working_hours"
		FROM cities_primary_schools_early_years cpsey
		LEFT JOIN cities_primary_schools_final_years cpsfy 
			ON cpsey.city_id = cpsfy.city_id AND cpsey.school_type = cpsfy.school_type
		LEFT JOIN cities_high_schools chs
			ON chs.city_id = cpsey.city_id AND chs.school_type = cpsey.school_type
		LEFT JOIN wage_per_city wpc 
			ON wpc.city_id = cpsey.city_id AND wpc.teacher_education = 'Total'
		LEFT JOIN wage_per_state wps
			ON cpsey.school_type = 'Estadual' AND wps.UF = cpsey.UF AND wps.teacher_education = 'Total' AND wps.adm_dependence = 'Estadual' 
		WHERE COALESCE(wps.standardized_pay_40_hours, wpc.standardized_pay_40_hours) IS NOT NULL 
"""

querySelectView = "SELECT * FROM IDEB_related_to_wages"

try:
    # Dropping first lets this cell be re-run without "view already exists".
    conn.execute("DROP VIEW IF EXISTS IDEB_related_to_wages")
    conn.execute(query)

    # Execute the query and load the results into a pandas DataFrame
    df = pd.read_sql_query(querySelectView, conn)

    display_df(df, querySelectView)
finally:
    # Close the connection, even if one of the statements above failed
    conn.close()


## High School Analysis

### National High School Analysis

In [None]:
def getNationalData(nTiles=1, tilesToUse=None):
    """Return the 'Pública' rows of IDEB_related_to_wages for the whole country.

    Rows with a high-school IDEB are ranked by ``wages_for_40h_working_hours``
    and split into ``nTiles`` buckets with ``NTILE``; the buckets are computed
    before the school-type filter is applied.

    Args:
        nTiles: number of wage buckets (quantiles) to compute.
        tilesToUse: bucket numbers to keep (ints or numeric strings); ``None``
            or an empty sequence keeps every bucket.

    Returns:
        A DataFrame with the city, UF, school type, the three IDEB values,
        the weekly working hours and the 40h wage.

    Raises:
        ValueError: if ``nTiles`` or a bucket number is not an integer.
    """
    # Converted to int up front and bound as parameters, so nothing coming
    # from user input is ever pasted into the SQL text.
    tiles = [int(tile) for tile in (tilesToUse or [])]

    tiles_condition = ""
    if tiles:
        placeholders = ", ".join("?" for _ in tiles)
        tiles_condition = f"AND interval_position IN ({placeholders})"

    query = f"""
    WITH data AS (
        SELECT 
            *,
            NTILE({int(nTiles)}) OVER(ORDER BY wages_for_40h_working_hours) AS interval_position
        FROM (
            SELECT * from IDEB_related_to_wages irtw
            WHERE irtw.IDEB_high_school IS NOT NULL 
            )
    )
    SELECT 
        city_name,
        UF,
        school_type,
        IDEB_1st_to_5st,
        IDEB_6st_to_9st,
        IDEB_high_school,
        teachers_weekly_working_hours_average,
        wages_for_40h_working_hours
    FROM data
    WHERE school_type = 'Pública' {tiles_condition}
    """

    conn = sqlite3.connect(database_dir)
    try:
        return pd.read_sql_query(query, conn, params=tiles)
    finally:
        conn.close()

In [None]:
# Interactive national plot: ask for the chart type, the IDEB stage (scatter
# only) and, optionally, which wage quantiles to keep.
plotToUse = int(input("Tipo de grafico [0 - Grafico de Dispersao, 1 - Mapa de calor]: "))

if plotToUse == 0:
    ideb = int(input("IDEB selecionado [1 - Ensino fundamental I; 2 - Ensino Fundamental 2; 3 - Ensino Medio]: "))
getBySubDivisions = input("Precisa de divisoes (quantis)? [apenas enter para pular] ")


def _ask_quantile():
    """Prompt for one quantile number; 0 ends the selection."""
    return int(input("Utilizar o quantil numero [0 para finalizar selecao de quantis]: "))


nTiles = 1
tilesToUse = []
if getBySubDivisions:
    nTiles = int(input("Numero de divisoes (Quantis): "))
    # Keep prompting until the user answers 0.
    tilesToUse = [str(answer) for answer in iter(_ask_quantile, 0)]

idebOptions = ["IDEB_1st_to_5st", "IDEB_6st_to_9st", "IDEB_high_school"]

# Only the numeric columns are kept for plotting.
data = getNationalData(nTiles, tilesToUse).drop(columns=['city_name', 'UF', 'school_type'])

if plotToUse == 0:
    # Scatter plot of the chosen IDEB against the 40h wage
    data.plot.scatter(x="wages_for_40h_working_hours", y=idebOptions[ideb-1])
elif plotToUse == 1:
    # Correlation matrix shown as a heatmap and printed
    corrData = data
    corr = corrData.corr()
    sb.heatmap(corr)

    print(corr)

### Regional High School Analysis

In [None]:
def getDataframeByUFs(UFs, nTiles=1, tilesToUse=None):
    """Return the 'Pública' rows of IDEB_related_to_wages for the given states.

    Rows with a high-school IDEB are ranked by ``wages_for_40h_working_hours``
    and split into ``nTiles`` buckets with ``NTILE``; the buckets are computed
    before the school-type and UF filters are applied.

    Args:
        UFs: iterable of state codes (values of the ``UF`` column) to keep.
        nTiles: number of wage buckets (quantiles) to compute.
        tilesToUse: bucket numbers to keep (ints or numeric strings); ``None``
            or an empty sequence keeps every bucket.

    Returns:
        A DataFrame with the city, UF, school type, the three IDEB values,
        the weekly working hours and the 40h wage.

    Raises:
        ValueError: if ``nTiles`` or a bucket number is not an integer.
    """
    # State codes and bucket numbers are bound as parameters rather than
    # pasted into the SQL text.
    ufs = list(UFs)
    tiles = [int(tile) for tile in (tilesToUse or [])]

    uf_placeholders = ", ".join("?" for _ in ufs)
    tiles_condition = ""
    if tiles:
        tile_placeholders = ", ".join("?" for _ in tiles)
        tiles_condition = f"AND interval_position IN ({tile_placeholders})"

    query = f"""
    WITH data AS (
        SELECT 
            *,
            NTILE({int(nTiles)}) OVER(ORDER BY wages_for_40h_working_hours) AS interval_position
        FROM (
            SELECT * from IDEB_related_to_wages irtw
            WHERE irtw.IDEB_high_school IS NOT NULL 
            )
    )
    SELECT 
        "city_name",
        "UF",
        school_type,
        IDEB_1st_to_5st,
        IDEB_6st_to_9st,
        IDEB_high_school,
        teachers_weekly_working_hours_average,
        wages_for_40h_working_hours
    FROM data
    WHERE school_type = 'Pública' 
    AND UF IN ({uf_placeholders}) {tiles_condition}
    """

    conn = sqlite3.connect(database_dir)
    try:
        # Parameter order follows the placeholder order: UFs first, then tiles.
        return pd.read_sql_query(query, conn, params=ufs + tiles)
    finally:
        conn.close()

#### Custom Plot (Scatter Plot or Heatmap)

In [None]:
# Interactive regional plot: ask for the chart type, the region, the IDEB
# stage (scatter only) and, optionally, which wage quantiles to keep.

# State codes belonging to each region, keyed by the lower-cased region name.
regions = {
    "norte": ['AC', 'AP', 'AM', 'PA', 'RO', 'RR', 'TO'],
    "nordeste": ["AL", "BA", "CE", "MA", "PB", "PE", "PI", "RN", "SE"],
    "centro-oeste": ["DF", "GO", "MT", "MS"],
    "sudeste": ["MG", "ES", "SP", "RJ"],
    "sul": ["PR", "SC", "RS"]
}

plotToUse = int(input("Tipo de grafico [0 - Grafico de Dispersao, 1 - Mapa de calor]: "))

# Surrounding spaces and letter case are ignored; an unknown region is
# rejected right away instead of failing with a KeyError after every other
# question has been answered.
region = (input("Regiao a ser analisada [Norte, Nordeste, Centro-Oeste, Sudeste ou Sul]: ")).strip().lower()
if region not in regions:
    raise ValueError(f"Regiao desconhecida: {region!r}. Opcoes: {', '.join(regions)}")

if plotToUse == 0:
    ideb = int(input("IDEB selecionado [1 - Ensino fundamental I; 2 - Ensino Fundamental 2; 3 - Ensino Medio]: "))
getBySubDivisions = input("Precisa de divisoes (quantis)? [apenas enter para pular] ")

nTiles = 1
tilesToUse = []
if getBySubDivisions:
    nTiles = int(input("Numero de divisoes (Quantis): "))
    # Collect quantile numbers until the user answers 0.
    inputAnswer = int(input("Utilizar o quantil numero [0 para finalizar selecao de quantis]: "))
    while inputAnswer != 0:
        tilesToUse.append(str(inputAnswer))
        inputAnswer = int(input("Utilizar o quantil numero [0 para finalizar selecao de quantis]: "))

idebOptions = ["IDEB_1st_to_5st", "IDEB_6st_to_9st", "IDEB_high_school"]

# Only the numeric columns are kept for plotting.
data = getDataframeByUFs(regions[region], nTiles, tilesToUse).drop(columns=['city_name', 'UF', 'school_type'])

if plotToUse == 0:
    # Scatter plot of the chosen IDEB against the 40h wage
    data.plot.scatter(x="wages_for_40h_working_hours", y=idebOptions[ideb-1])
elif plotToUse == 1:
    # Correlation matrix shown as a heatmap and printed
    corrData = data
    corr = corrData.corr()
    sb.heatmap(corr)

    print(corr)

#### Norte

In [None]:
# Correlation matrix (heatmap + printout) for the states of the Norte region
identifier_columns = ['city_name', 'UF', 'school_type']
northData = getDataframeByUFs(
    ['AC', 'AP', 'AM', 'PA', 'RO', 'RR', 'TO']
).drop(columns=identifier_columns)
corr = northData.corr()
sb.heatmap(corr)

print(corr)

#### Nordeste

In [None]:
# Correlation matrix (heatmap + printout) for the states of the Nordeste region
identifier_columns = ['city_name', 'UF', 'school_type']
northeastData = getDataframeByUFs(
    ["AL", "BA", "CE", "MA", "PB", "PE", "PI", "RN", "SE"]
).drop(columns=identifier_columns)
corr = northeastData.corr()
sb.heatmap(corr)

print(corr)

#### Centro-Oeste

In [None]:
# Correlation matrix (heatmap + printout) for the states of the Centro-Oeste region
identifier_columns = ['city_name', 'UF', 'school_type']
midWest = getDataframeByUFs(
    ["DF", "GO", "MT", "MS"]
).drop(columns=identifier_columns)
corr = midWest.corr()
sb.heatmap(corr)

print(corr)

#### Sudeste

In [None]:
# Correlation matrix (heatmap + printout) for the states of the Sudeste region
identifier_columns = ['city_name', 'UF', 'school_type']
southeast = getDataframeByUFs(
    ["MG", "ES", "SP", "RJ"]
).drop(columns=identifier_columns)
corr = southeast.corr()
sb.heatmap(corr)

print(corr)

#### Sul

In [None]:
# Correlation matrix (heatmap + printout) for the states of the Sul region
identifier_columns = ['city_name', 'UF', 'school_type']
south = getDataframeByUFs(
    ["PR", "SC", "RS"]
).drop(columns=identifier_columns)
corr = south.corr()
sb.heatmap(corr)

print(corr)

### By State Analysis

In [None]:
def getDataframeByUF(UF, nTiles=1, tilesToUse=None):
    """Load the public-school IDEB / wage rows of a single state.

    Rows of ``IDEB_related_to_wages`` that have a high-school IDEB are ranked
    into ``nTiles`` buckets by ``wages_for_40h_working_hours``. The ranking is
    computed over every such row in the table (all states and school types)
    *before* the state / school-type filter is applied, so the bucket numbers
    refer to table-wide wage quantiles, not to quantiles within the state.

    Args:
        UF: Two-letter state code to keep (e.g. ``"AC"``). It is bound as a
            query parameter, so its content is never interpreted as SQL.
        nTiles: Number of wage buckets for ``NTILE``; must be a positive
            integer (or something ``int()`` accepts).
        tilesToUse: Optional iterable of bucket numbers (ints or numeric
            strings) to keep. ``None`` or an empty iterable keeps every
            bucket.

    Returns:
        A ``pandas.DataFrame`` with the columns ``city_name``, ``UF``,
        ``school_type``, ``IDEB_1st_to_5st``, ``IDEB_6st_to_9st``,
        ``IDEB_high_school``, ``teachers_weekly_working_hours_average`` and
        ``wages_for_40h_working_hours``.

    Raises:
        ValueError: If ``nTiles`` is smaller than 1, or if ``nTiles`` or any
            entry of ``tilesToUse`` cannot be converted to an integer.
    """
    # NTILE's argument is interpolated into the SQL text, so coerce it to a
    # plain int first: nothing but digits can reach the statement.
    n_tiles = int(nTiles)
    if n_tiles < 1:
        raise ValueError(f"nTiles must be a positive integer, got {nTiles!r}")

    # Bucket numbers are bound as parameters; int() also accepts the numeric
    # strings that the interactive cells collect from input().
    tiles = [int(tile) for tile in (tilesToUse or [])]

    tiles_condition = ""
    if tiles:
        placeholders = ", ".join("?" for _ in tiles)
        tiles_condition = f"AND interval_position IN ({placeholders})"

    query = f"""
    WITH data AS (
        SELECT 
            *,
            NTILE({n_tiles}) OVER(ORDER BY wages_for_40h_working_hours) AS interval_position
        FROM (
            SELECT * from IDEB_related_to_wages irtw
            WHERE irtw.IDEB_high_school IS NOT NULL 
            )
    )
    SELECT 
        "city_name",
        "UF",
        school_type,
        IDEB_1st_to_5st,
        IDEB_6st_to_9st,
        IDEB_high_school,
        teachers_weekly_working_hours_average,
        wages_for_40h_working_hours
    FROM data
    WHERE school_type = 'Pública' 
    AND UF = ? {tiles_condition}
    """

    conn = sqlite3.connect(database_dir)
    try:
        return pd.read_sql_query(query, conn, params=[UF, *tiles])
    finally:
        # Release the connection even when the query raises.
        conn.close()

#### Custom Plot (Gráfico de Dispersão ou Mapa de calor)

In [None]:
# Interactive cell: choose a plot type and a state, optionally restrict the
# data to some wage quantiles, then draw the scatter plot or the heatmap.
plotToUse = int(input("Tipo de grafico [0 - Grafico de Dispersao, 1 - Mapa de calor]: "))

# The per-state cells of this notebook query upper-case codes ("AC", "SP", ...),
# so normalise what was typed; otherwise "sp" or " SP" gives an empty result.
state = input("Estado a ser analisado: ").strip().upper()

# Column plotted on the y axis of the scatter plot; position i holds menu option i + 1.
idebOptions = ["IDEB_1st_to_5st", "IDEB_6st_to_9st", "IDEB_high_school"]

if plotToUse == 0:
    ideb = int(input("IDEB selecionado [1 - Ensino fundamental I; 2 - Ensino Fundamental 2; 3 - Ensino Medio]: "))
    # Without this guard, 0 or a negative answer would wrap around through
    # Python's negative indexing and silently plot a different IDEB column.
    if not 1 <= ideb <= len(idebOptions):
        raise ValueError(f"IDEB invalido: {ideb}. Escolha um valor entre 1 e {len(idebOptions)}.")

getBySubDivisions = input("Precisa de divisoes (quantis)? [apenas enter para pular] ")

# Defaults mean "one bucket, keep everything", i.e. no quantile filtering.
nTiles = 1
tilesToUse = []
if getBySubDivisions:
    nTiles = int(input("Numero de divisoes (Quantis): "))
    # Collect quantile numbers until the user answers 0.
    inputAnswer = int(input("Utilizar o quantil numero [0 para finalizar selecao de quantis]: "))
    while inputAnswer != 0:
        tilesToUse.append(str(inputAnswer))
        inputAnswer = int(input("Utilizar o quantil numero [0 para finalizar selecao de quantis]: "))

# Only the numeric indicator columns are plotted / correlated.
data = getDataframeByUF(state, nTiles, tilesToUse).drop(columns=['city_name', 'UF', 'school_type'])

if plotToUse == 0:
    data.plot.scatter(x="wages_for_40h_working_hours", y=idebOptions[ideb - 1])
elif plotToUse == 1:
    # Correlations
    corrData = data
    corr = corrData.corr()
    sb.heatmap(corr)

    print(corr)

#### Acre

In [None]:
# Acre (AC): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
acre = (
    getDataframeByUF("AC")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = acre.corr()
sb.heatmap(data=corr)

print(corr)

#### Alagoas

In [None]:
# Alagoas (AL): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
alagoas = (
    getDataframeByUF("AL")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = alagoas.corr()
sb.heatmap(data=corr)

print(corr)

#### Amapá

In [None]:
# Amapá (AP): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
amapa = (
    getDataframeByUF("AP")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = amapa.corr()
sb.heatmap(data=corr)

print(corr)

#### Amazonas

In [None]:
# Amazonas (AM): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
amazonas = (
    getDataframeByUF("AM")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = amazonas.corr()
sb.heatmap(data=corr)

print(corr)

#### Bahia

In [None]:
# Bahia (BA): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
bahia = (
    getDataframeByUF("BA")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = bahia.corr()
sb.heatmap(data=corr)

print(corr)

#### Ceará

In [None]:
# Ceará (CE): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
ceara = (
    getDataframeByUF("CE")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = ceara.corr()
sb.heatmap(data=corr)

print(corr)

#### Distrito Federal

In [None]:
# Distrito Federal (DF): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
df = (
    getDataframeByUF("DF")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = df.corr()
sb.heatmap(data=corr)

print(corr)

#### Espírito Santo

In [None]:
# Espírito Santo (ES): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
espirito_santo = (
    getDataframeByUF("ES")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = espirito_santo.corr()
sb.heatmap(data=corr)

print(corr)

#### Goiás

In [None]:
# Goiás (GO): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
goias = (
    getDataframeByUF("GO")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = goias.corr()
sb.heatmap(data=corr)

print(corr)

#### Maranhão

In [None]:
# Maranhão (MA): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
maranhao = (
    getDataframeByUF("MA")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = maranhao.corr()
sb.heatmap(data=corr)

print(corr)

#### Mato Grosso

In [None]:
# Mato Grosso (MT): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
mato_grosso = (
    getDataframeByUF("MT")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = mato_grosso.corr()
sb.heatmap(data=corr)

print(corr)

#### Mato Grosso do Sul

In [None]:
# Mato Grosso do Sul (MS): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
mato_grosso_sul = (
    getDataframeByUF("MS")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = mato_grosso_sul.corr()
sb.heatmap(data=corr)

print(corr)

#### Minas Gerais

In [None]:
# Minas Gerais (MG): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
minas_gerais = (
    getDataframeByUF("MG")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = minas_gerais.corr()
sb.heatmap(data=corr)

print(corr)

#### Pará

In [None]:
# Pará (PA): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
para = (
    getDataframeByUF("PA")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = para.corr()
sb.heatmap(data=corr)

print(corr)

#### Paraíba

In [None]:
# Paraíba (PB): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
paraiba = (
    getDataframeByUF("PB")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = paraiba.corr()
sb.heatmap(data=corr)

print(corr)

#### Paraná

In [None]:
# Paraná (PR): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
parana = (
    getDataframeByUF("PR")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = parana.corr()
sb.heatmap(data=corr)

print(corr)

#### Pernambuco

In [None]:
# Pernambuco (PE): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
pernambuco = (
    getDataframeByUF("PE")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = pernambuco.corr()
sb.heatmap(data=corr)

print(corr)

#### Piauí

In [None]:
# Piauí (PI): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
piaui = (
    getDataframeByUF("PI")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = piaui.corr()
sb.heatmap(data=corr)

print(corr)

#### Rio de Janeiro

In [None]:
# Rio de Janeiro (RJ): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
rio_janeiro = (
    getDataframeByUF("RJ")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = rio_janeiro.corr()
sb.heatmap(data=corr)

print(corr)

#### Rio Grande do Norte

In [None]:
# Rio Grande do Norte (RN): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
rio_grande_norte = (
    getDataframeByUF("RN")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = rio_grande_norte.corr()
sb.heatmap(data=corr)

print(corr)

#### Rio Grande do Sul

In [None]:
# Rio Grande do Sul (RS): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
rio_grande_sul = (
    getDataframeByUF("RS")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = rio_grande_sul.corr()
sb.heatmap(data=corr)

print(corr)

#### Rondônia

In [None]:
# Rondônia (RO): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
rondonia = (
    getDataframeByUF("RO")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = rondonia.corr()
sb.heatmap(data=corr)

print(corr)

#### Roraima

In [None]:
# Roraima (RR): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
roraima = (
    getDataframeByUF("RR")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = roraima.corr()
sb.heatmap(data=corr)

print(corr)

#### Santa Catarina

In [None]:
# Santa Catarina (SC): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
santa_catarina = (
    getDataframeByUF("SC")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = santa_catarina.corr()
sb.heatmap(data=corr)

print(corr)

#### São Paulo

In [None]:
# São Paulo (SP): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
sao_paulo = (
    getDataframeByUF("SP")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = sao_paulo.corr()
sb.heatmap(data=corr)

print(corr)

#### Sergipe

In [None]:
# Sergipe (SE): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
sergipe = (
    getDataframeByUF("SE")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = sergipe.corr()
sb.heatmap(data=corr)

print(corr)

#### Tocantins

In [None]:
# Tocantins (TO): drop the descriptive columns, correlate what is left,
# then draw the matrix as a heatmap and print it.
tocantins = (
    getDataframeByUF("TO")
    .drop(['city_name', 'UF', 'school_type'], axis=1)
)
corr = tocantins.corr()
sb.heatmap(data=corr)

print(corr)