In [1]:
import basedosdados as bd
import os
import pandas as pd
from pathlib import Path

# Resolve the per-machine project folders from the OS login name.
# The Windows-style paths are keyed on USERNAME, the macOS-style ones on USER.
if os.environ.get('USERNAME') == "natha":

    dropbox_path = "C:/Users/natha/Dropbox/MiDES-data-paper-replication"
    github_path = "C:/Users/natha/OneDrive/Documentos/Github/data-paper"

elif os.environ.get('USER') == "nathaliasales":

    dropbox_path = "/Users/nathaliasales/Documents/MiDES-data-paper-replication"
    github_path = "/Users/nathaliasales/Documents/Github/data-paper"

elif os.environ.get('USERNAME') == "lucas":

    # NOTE(review): placeholders. With empty strings the joins below produce
    # paths relative to the current working directory; fill in before use.
    dropbox_path = ""
    github_path = ""

elif os.environ.get('USER') == "rdahis":

    dropbox_path = "/Users/rdahis/Dropbox/Academic/Papers/MiDES-data-paper-replication/"
    github_path  = "/Users/rdahis/Dropbox/Academic/Papers/MiDES-data-paper-repository"

else:
    # Without this branch an unknown user would only fail further down with an
    # opaque NameError on `dropbox_path`.
    raise RuntimeError(
        "Unrecognised user (checked the USERNAME and USER environment variables): "
        "add a branch above that defines dropbox_path and github_path for this machine."
    )

# Folders derived from the replication directory: raw inputs and generated outputs.
input_path = os.path.join(dropbox_path, 'Data/Raw')
fig_output = os.path.join(dropbox_path, 'Output/Figures')
tab_output = os.path.join(dropbox_path, 'Output/Tables')

In [None]:

# Cross section of municipalities

def run_query_and_save_results():
    """Download the 2015 municipality cross-section and store it as a CSV file.

    The query starts from the population table (``t1``, year 2015) and
    left-joins, per municipality:

    * GDP for the same year (``t2``), expressed per capita;
    * the ADH indicators for 2010 (``t3``);
    * four revenue aggregates for 2015 from the SICONFI budget-revenue table
      (``t4_*``), one sub-query per account code, each expressed per capita.

    The result is written to ``municipios.csv`` inside ``input_path`` without
    the index, with missing values as empty strings and floats rounded to two
    decimals.

    Relies on two module-level names that are not set here: ``project_id_bq``
    (the BigQuery billing project) and ``input_path`` (the output folder).
    Both must exist before this function is called.
    """
    sql = '''
        SELECT
            t1.id_municipio,
            t1.sigla_uf,
            t1.populacao AS populacao_2015,
            t2.pib / t1.populacao AS pib_per_capita_2015,
            t3.mortalidade_5 AS mortalidade_infantil_5_anos_2010,
            t3.taxa_agua_encanada AS percentual_agua_encanada_2010,
            t3.taxa_coleta_lixo AS percentual_coleta_lixo_2010,
            t3.taxa_energia_eletrica AS percentual_energia_eletrica_2010,
            t4_total.valor / t1.populacao AS receitas_totais_per_capita_2015,
            t4_correntes.valor / t1.populacao AS receitas_correntes_per_capita_2015,
            t4_impostos.valor / t1.populacao AS receitas_impostos_locais_per_capita_2015,
            t4_capital.valor / t1.populacao AS receitas_capital_per_capita_2015,
        FROM
            basedosdados.br_ibge_populacao.municipio AS t1
        LEFT JOIN
            basedosdados.br_ibge_pib.municipio AS t2
            ON t1.id_municipio = t2.id_municipio AND t1.ano = t2.ano
        LEFT JOIN
            basedosdados.mundo_onu_adh.municipio AS t3
            ON t1.id_municipio = t3.id_municipio AND t3.ano = 2010 -- Dados do ADH estão disponíveis apenas até 2010
        LEFT JOIN
            (
                SELECT
                    id_municipio,
                    ano,
                    SUM(valor) AS valor
                FROM
                    basedosdados.br_me_siconfi.municipio_receitas_orcamentarias
                WHERE
                    ano = 2015
                    AND estagio_bd = 'Receitas Brutas Realizadas'
                    AND id_conta_bd = '1.0.0.0.0.00.00.00' -- 'Receitas Orçamentárias'
                GROUP BY
                    id_municipio, ano
            ) AS t4_total
            ON t1.id_municipio = t4_total.id_municipio AND t1.ano = t4_total.ano
        LEFT JOIN
            (
                SELECT
                    id_municipio,
                    ano,
                    SUM(valor) AS valor
                FROM
                    basedosdados.br_me_siconfi.municipio_receitas_orcamentarias
                WHERE
                    ano = 2015
                    AND estagio_bd = 'Receitas Brutas Realizadas'
                    AND id_conta_bd = '1.1.0.0.0.00.00.00' -- 'Receitas Correntes'
                GROUP BY
                    id_municipio, ano
            ) AS t4_correntes
            ON t1.id_municipio = t4_correntes.id_municipio AND t1.ano = t4_correntes.ano
        LEFT JOIN
            (
                SELECT
                    id_municipio,
                    ano,
                    SUM(valor) AS valor
                FROM
                    basedosdados.br_me_siconfi.municipio_receitas_orcamentarias
                WHERE
                    ano = 2015
                    AND estagio_bd = 'Receitas Brutas Realizadas'
                    AND id_conta_bd = '1.1.1.0.0.00.00.00' -- Receitas Tributárias
                GROUP BY
                    id_municipio, ano
            ) AS t4_impostos
            ON t1.id_municipio = t4_impostos.id_municipio AND t1.ano = t4_impostos.ano
        LEFT JOIN
            (
                SELECT
                    id_municipio,
                    ano,
                    SUM(valor) AS valor
                FROM
                    basedosdados.br_me_siconfi.municipio_receitas_orcamentarias
                WHERE
                    ano = 2015
                    AND estagio_bd = 'Receitas Brutas Realizadas'
                    AND id_conta_bd = '1.2.0.0.0.00.00.00' -- 'Receitas de Capital'
                GROUP BY
                    id_municipio, ano
            ) AS t4_capital
            ON t1.id_municipio = t4_capital.id_municipio AND t1.ano = t4_capital.ano
        WHERE
            t1.ano = 2015
        ORDER BY
            t1.id_municipio
    '''

    # Fetch first, then resolve the destination, so a failed query writes nothing.
    cross_section = bd.read_sql(sql, billing_project_id=project_id_bq)

    destination = os.path.join(input_path, 'municipios.csv')
    cross_section.to_csv(
        destination,
        index=False,
        na_rep='',
        float_format='%.2f',
    )

# Download switch. It is compared as a string: only the exact value 'True'
# triggers the BigQuery download; any other value skips it.
run_query = 'True'

if __name__ == '__main__' and run_query == 'True':
    run_query_and_save_results()

In [None]:
# Column of the municipality CSV that holds the state abbreviation.
STATE_COL  = "sigla_uf"
# States treated as "in-sample" in the descriptive tables.
IN_STATES  = {"RS","PR","SP","MG","CE","PB","PE"}

# CSV column -> row label used in the LaTeX tables.
# The percent signs are LaTeX-escaped; the backslash is written as "\\" because
# "\%" is not a valid Python escape sequence and triggers a syntax warning.
LABELS = {
    "populacao_2015": "Population (2015)",
    "pib_per_capita_2015": "GDP per capita (2015)",
    "mortalidade_infantil_5_anos_2010": "Child mortality 5- (2010)",
    "percentual_agua_encanada_2010": "Piped water (\\%) (2010)",
    "percentual_coleta_lixo_2010": "Trash collection (\\%) (2010)",
    "percentual_energia_eletrica_2010": "Electricity access (\\%) (2010)",
    "receitas_totais_per_capita_2015": "Total revenues p.c. (2015)",
    "receitas_correntes_per_capita_2015": "Current revenues p.c. (2015)",
    "receitas_impostos_locais_per_capita_2015": "Local tax revenues p.c. (2015)",
    "receitas_capital_per_capita_2015": "Capital revenues p.c. (2015)",
}

# Variables to summarise, in table row order. Derived from LABELS (dicts keep
# insertion order) so every variable is guaranteed to have a label.
VARS = list(LABELS)

# ---------- Load & prepare ----------
municipios_csv = os.path.join(input_path, 'municipios.csv')
df = pd.read_csv(municipios_csv)

# Normalise the state codes (upper-case, stripped) and flag the municipalities
# that belong to one of the sampled states.
state_codes = df[STATE_COL].astype(str).str.upper().str.strip()
in_sample = state_codes.isin(IN_STATES)

# ---------- Compute means ----------
# One row per variable: label, mean over in-sample municipalities, mean over the rest.
summary = []
for col in VARS:
    if col not in df.columns:
        raise ValueError(f"Column '{col}' not found in CSV.")
    # Non-numeric entries become NaN and are skipped by the means below.
    series = pd.to_numeric(df[col], errors="coerce")
    summary.append(
        [
            LABELS[col],
            series[in_sample].mean(skipna=True),
            series[~in_sample].mean(skipna=True),
        ]
    )

summary_df = pd.DataFrame(summary, columns=["Variable", "In-sample", "Outside-sample"])

# ---------- LaTeX table ----------
def to_latex_booktabs(d, caption, label, float_format="%.2f"):
    """Render a DataFrame as a booktabs ``tabular`` environment.

    The first column is left-aligned and printed as-is; every other column is
    right-aligned and formatted with thousands separators and two decimals,
    with missing values rendered as empty cells. The input frame is not
    modified.

    Only the ``tabular`` environment is produced (no surrounding ``table``
    float), so ``caption`` and ``label`` are accepted but currently unused.
    ``float_format`` is likewise accepted but unused: the number format is
    fixed inside the function.

    Returns the LaTeX source as a single newline-joined string.
    """
    def _cell(value):
        # Missing values become blank cells instead of the text "nan".
        return "" if not pd.notna(value) else f"{value:,.2f}"

    formatted = d.copy()
    for name in formatted.columns[1:]:
        formatted[name] = formatted[name].map(_cell)

    alignment = "l" + "r" * (len(formatted.columns) - 1)
    data_rows = [
        " & ".join(str(cell) for cell in record.values) + " \\\\"
        for _, record in formatted.iterrows()
    ]

    return "\n".join(
        [
            "\\begin{tabular}{" + alignment + "}",
            "\\toprule",
            " & ".join(formatted.columns) + " \\\\",
            "\\midrule",
            *data_rows,
            "\\bottomrule",
            "\\end{tabular}",
        ]
    )

# Render the in-sample vs outside-sample comparison as LaTeX.
latex_code = to_latex_booktabs(
    summary_df,
    caption="Municipality averages: in-sample (RS, PR, SP, MG, CE, PB, PE) vs outside-sample",
    label="tab:in_out_sample",
)

# Save to file (optional)
out_path = os.path.join(tab_output, "descriptive_statistics_municipalities.tex")
Path(out_path).write_text(latex_code, encoding='utf-8')

# Print for copy-paste
print(latex_code)

\begin{tabular}{lrr}
\toprule
Variable & In-sample & Outside-sample \\
\midrule
Population (2015) & 36804.56 & 36591.19 \\
GDP per capita (2015) & 21616.17 & 17346.67 \\
Child mortality 5- (2010) & 19.18 & 24.25 \\
Piped water (\%) (2010) & 87.80 & 83.05 \\
Trash collection (\%) (2010) & 96.67 & 91.01 \\
Electricity access (\%) (2010) & 99.21 & 94.86 \\
Total revenues p.c. (2015) & 3300.55 & 2942.78 \\
Current revenues p.c. (2015) & 3108.13 & 2807.96 \\
Local tax revenues p.c. (2015) & 223.98 & 185.34 \\
Capital revenues p.c. (2015) & 143.12 & 119.15 \\
\bottomrule
\end{tabular}


In [3]:
# ---------- Table with one column per state ----------

# Alphabetical order, computed once, so the value columns and the header agree.
ordered_states = sorted(IN_STATES)

# One row per variable: its label, the mean within each sampled state, and
# finally the pooled mean over every municipality outside the sample.
summary_by_state = []
for col in VARS:
    if col not in df.columns:
        raise ValueError(f"Column '{col}' not found in CSV.")
    series = pd.to_numeric(df[col], errors="coerce")

    row = [LABELS[col]]
    row.extend(
        series[state_codes == state].mean(skipna=True)
        for state in ordered_states
    )
    row.append(series[~in_sample].mean(skipna=True))

    summary_by_state.append(row)

# Create DataFrame with state columns
state_columns = ["Variable", *ordered_states, "Outside-sample"]
summary_by_state_df = pd.DataFrame(summary_by_state, columns=state_columns)

# ---------- LaTeX table ----------
latex_code_by_state = to_latex_booktabs(
    summary_by_state_df,
    caption="Municipality averages by state",
    label="tab:by_state",
)

# Save to file
out_path_by_state = os.path.join(
    tab_output, "descriptive_statistics_municipalities_by_state.tex"
)
Path(out_path_by_state).write_text(latex_code_by_state, encoding='utf-8')

# Print for copy-paste
print(latex_code_by_state)


\begin{tabular}{lrrrrrrrr}
\toprule
Variable & CE & MG & PB & PE & PR & RS & SP & Outside-sample \\
\midrule
Population (2015) & 48393.80 & 24465.53 & 17812.57 & 50514.45 & 27977.49 & 22631.73 & 68831.76 & 36591.19 \\
GDP per capita (2015) & 8844.05 & 16425.13 & 8892.95 & 10162.70 & 26046.96 & 30934.94 & 29887.32 & 17346.67 \\
Child mortality 5- (2010) & 26.54 & 19.05 & 28.72 & 28.31 & 16.37 & 14.49 & 16.70 & 24.25 \\
Piped water (\%) (2010) & 76.11 & 88.38 & 66.66 & 69.94 & 92.91 & 91.82 & 96.56 & 83.05 \\
Trash collection (\%) (2010) & 89.90 & 95.56 & 94.69 & 92.95 & 98.50 & 98.19 & 99.53 & 91.01 \\
Electricity access (\%) (2010) & 98.64 & 98.74 & 99.11 & 99.06 & 99.26 & 99.55 & 99.76 & 94.86 \\
Total revenues p.c. (2015) & 2271.10 & 2859.31 & 2715.13 & 2139.26 & 3547.15 & 4277.53 & 3796.92 & 2942.78 \\
Current revenues p.c. (2015) & 2170.23 & 2735.92 & 2586.18 & 2019.02 & 3320.44 & 3940.09 & 3578.81 & 2807.96 \\
Local tax revenues p.c. (2015) & 102.73 & 158.00 & 67.63 & 104.63 & 247