In [None]:
# Filesystem locations, all supplied through environment variables
input_path = os.getenv('input_path')  # folder where query-result CSVs are written and read back
fig_output = os.getenv('fig_output')  # folder where the figures are saved
tab_output = os.getenv('tab_output')  # presumably the folder for tables; not used in this part of the file

# BigQuery settings
project_id_bq = os.getenv('project_id_bq')  # handed to bd.read_sql as billing_project_id
run_query = os.getenv('run_query')  # queries only run when this is the string 'True'

## Commitment | Year - Municipality

In [None]:
# Total of commitments by year and municipality compared to SICONFI

def run_query_and_save_results():
    """Compare commitment totals per year and municipality with SICONFI and save them as CSV.

    The BigQuery query sums ``valor_final`` from ``world_wb_mides.empenho`` by year and
    municipality (2003 onwards), sums the SICONFI 'Despesas Empenhadas' values of the
    'Despesas Orçamentárias' account for the seven listed states, and keeps only the
    year/municipality pairs that have a SICONFI total.

    A ``proportion`` column is then added: the difference (ours minus SICONFI) as a
    percentage of our own total. The result is written to
    ``commitment_municipality_year.csv`` inside ``input_path``.

    Returns:
        None. The only output is the CSV file.
    """

    query = '''
    WITH commitment AS (
      SELECT
        ano,
        id_municipio,
        SUM(valor_final) AS total_commitment
      FROM basedosdados.world_wb_mides.empenho
      WHERE ano >= 2003
      GROUP BY 1,2
    ),
      siconfi AS (
      SELECT
      ano,
      sigla_uf,
      id_municipio,
      SUM(valor) AS total_siconfi
      FROM `basedosdados.br_me_siconfi.municipio_despesas_orcamentarias`
      WHERE sigla_uf IN ('CE', 'MG', 'PB', 'PE', 'PR', 'RS', 'SP') AND ano >= 2003 AND estagio_bd = 'Despesas Empenhadas' AND conta_bd = 'Despesas Orçamentárias'
      GROUP BY 1,2,3
      ORDER BY 1
    )

    SELECT
      e.ano,
      sigla_uf,
      e.id_municipio,
      total_commitment,
      total_siconfi,
      ROUND(total_commitment - total_siconfi,2) AS variation
    FROM commitment e
    LEFT JOIN siconfi s ON e.ano=s.ano AND e.id_municipio=s.id_municipio
    WHERE total_siconfi IS NOT NULL

    '''

    # One local name is used throughout. The original built the frame as 'commitmet_siconfi'
    # but saved 'commitment_siconfi', which is not defined inside this function.
    commitment_siconfi = bd.read_sql(query, billing_project_id=project_id_bq)
    # 'proportion' is the commitment-minus-SICONFI difference as a percentage of the total commitment
    commitment_siconfi['proportion'] = 100*commitment_siconfi['variation']/commitment_siconfi['total_commitment']
    # Drop rows whose percentage is -inf (division by a zero total commitment)
    commitment_siconfi = commitment_siconfi.query('proportion != -inf')
    # Keep RS data only from 2010 onwards
    commitment_siconfi = commitment_siconfi.drop(commitment_siconfi[(commitment_siconfi['sigla_uf'] == 'RS') & (commitment_siconfi['ano'] < 2010)].index)

    commitment_siconfi.to_csv(os.path.join(input_path,'commitment_municipality_year.csv'), index=False, na_rep='', float_format='%.2f')

# Hit BigQuery only when executed directly and the run_query setting is the string 'True'
if __name__ == '__main__' and run_query == 'True':
    run_query_and_save_results()

In [None]:
# Total of commitments by year, municipality and function compared to SICONFI

def run_query_and_save_results():
    """Compare commitment totals per year, municipality and function with SICONFI; save a CSV.

    The BigQuery query sums ``valor_final`` from ``world_wb_mides.empenho`` by year, state,
    municipality and ``funcao``. On the SICONFI side ('Despesas Empenhadas' rows of
    ``municipio_despesas_funcao`` for the seven listed states) each account name in
    ``conta_bd`` is translated to the code string that is matched against ``funcao``.
    Only combinations that have a SICONFI total are kept.

    A ``proportion`` column (difference as a percentage of our own total) is added and the
    result is written to ``commitment_function_municipality_year.csv`` inside ``input_path``.

    Returns:
        None. The only output is the CSV file.
    """

    sql = '''
    WITH commitment AS (
      SELECT
        ano,
        sigla_uf,
        id_municipio,
        funcao,
        SUM (valor_final) AS total_commitment
      FROM basedosdados.world_wb_mides.empenho
      GROUP BY 1, 2, 3, 4
    ),
     siconfi AS (
      SELECT
        ano,
        sigla_uf,
        id_municipio,
        CASE WHEN (UPPER(conta_bd) = 'LEGISLATIVA')             THEN '1'
             WHEN (UPPER(conta_bd) = 'JUDICIÁRIA')              THEN '2'
             WHEN (UPPER(conta_bd) = 'ESSENCIAL À JUSTIÇA')     THEN '3'
             WHEN (UPPER(conta_bd) = 'ADMINISTRAÇÃO')           THEN '4'
             WHEN (UPPER(conta_bd) = 'DEFESA NACIONAL')         THEN '5'
             WHEN (UPPER(conta_bd) = 'SEGURANÇA PÚBLICA')       THEN '6'
             WHEN (UPPER(conta_bd) = 'RELAÇÕES EXTERIORES')     THEN '7'
             WHEN (UPPER(conta_bd) = 'ASSISTÊNCIA SOCIAL')      THEN '8'
             WHEN (UPPER(conta_bd) = 'PREVIDÊNCIA SOCIAL')      THEN '9'
             WHEN (UPPER(conta_bd) = 'SAÚDE')                   THEN '10'
             WHEN (UPPER(conta_bd) = 'TRABALHO')                THEN '11'
             WHEN (UPPER(conta_bd) = 'EDUCAÇÃO')                THEN '12'
             WHEN (UPPER(conta_bd) = 'CULTURA')                 THEN '13'
             WHEN (UPPER(conta_bd) = 'DIREITOS DA CIDADANIA')   THEN '14'
             WHEN (UPPER(conta_bd) = 'URBANISMO')               THEN '15'
             WHEN (UPPER(conta_bd) = 'HABITAÇÃO')               THEN '16'
             WHEN (UPPER(conta_bd) = 'SANEAMENTO')              THEN '17'
             WHEN (UPPER(conta_bd) = 'GESTÃO AMBIENTAL')        THEN '18'
             WHEN (UPPER(conta_bd) = 'CIÊNCIA E TECNOLOGIA')    THEN '19'
             WHEN (UPPER(conta_bd) = 'AGRICULTURA')             THEN '20'
             WHEN (UPPER(conta_bd) = 'ORGANIZAÇÃO AGRÁRIA')     THEN '21'
             WHEN (UPPER(conta_bd) = 'INDÚSTRIA')               THEN '22'
             WHEN (UPPER(conta_bd) = 'COMÉRCIO E SERVIÇOS')     THEN '23'
             WHEN (UPPER(conta_bd) = 'COMUNICAÇÕES')            THEN '24'
             WHEN (UPPER(conta_bd) = 'ENERGIA')                 THEN '25'
             WHEN (UPPER(conta_bd) = 'TRANSPORTE')              THEN '26'
             WHEN (UPPER(conta_bd) = 'DESPORTO E LAZER')        THEN '27'
             WHEN (UPPER(conta_bd) = 'ENCARGOS ESPECIAIS')      THEN '28'
             WHEN (UPPER(conta_bd) = 'RESERVA DE CONTINGÊNCIA') THEN '99'
        END AS funcao,
        SUM (valor) AS total_siconfi
      FROM `basedosdados.br_me_siconfi.municipio_despesas_funcao`
      WHERE sigla_uf in ('MG', 'PB', 'RS', 'SP', 'PR','CE', 'PE') AND estagio_bd = 'Despesas Empenhadas'
      GROUP BY 1,2,3,4
      ORDER BY 1
    )

    SELECT
      e.ano,
      e.sigla_uf,
      e.id_municipio,
      e.funcao,
      total_commitment,
      total_siconfi,
      ROUND((total_commitment - total_siconfi),2) AS variation
    FROM commitment e
    LEFT JOIN siconfi s ON e.ano=s.ano AND e.sigla_uf=s.sigla_uf AND e.id_municipio=s.id_municipio AND e.funcao=s.funcao
    WHERE total_siconfi IS NOT NULL
    ORDER BY variation DESC
    '''

    by_function = bd.read_sql(sql, billing_project_id=project_id_bq)

    # Percentage gap: (ours - SICONFI) relative to our own total
    scaled_variation = 100 * by_function['variation']
    by_function['proportion'] = scaled_variation / by_function['total_commitment']

    # Discard rows whose percentage came out as -inf
    by_function = by_function.query('proportion != -inf')

    # RS rows before 2010 are left out
    rs_before_2010 = (by_function['sigla_uf'] == 'RS') & (by_function['ano'] < 2010)
    by_function = by_function.drop(by_function[rs_before_2010].index)

    csv_path = os.path.join(input_path, 'commitment_function_municipality_year.csv')
    by_function.to_csv(csv_path, index=False, na_rep='', float_format='%.2f')

# The query is executed only in a direct run with run_query set to the string 'True'
if __name__ == '__main__' and run_query == 'True':
    run_query_and_save_results()

In [None]:
# Commitment totals by year and municipality, validated against SICONFI.
# One histogram per state shows the percentage gap between our totals and SICONFI, measured
# relative to our own totals: positive values mean our data sits above the National Treasury
# figure (overestimate), negative values mean it sits below (underestimate).

commitment_siconfi = pd.read_csv(os.path.join(input_path, 'commitment_municipality_year.csv'))
# Cap the gap at +/-25 so all outliers pile up in the outermost bins ('<-25' and '25>')
commitment_siconfi['proportion'] = commitment_siconfi['proportion'].clip(lower=-25, upper=25)

graph = commitment_siconfi.copy()

lista_labels = ['-25', '-20', '-15', '-10', '-5', '0', '5', '10', '15', '20', '25']
tick_text = ['<-25', '-20', '-15', '-10', '-5', '0', '5', '10', '15', '20', '25>']

plt.figure(figsize=(15, 9))

# Panels fill a 4x2 grid row by row; the eighth slot stays empty
for slot, state in enumerate(['CE', 'MG', 'PB', 'PE', 'PR', 'RS', 'SP'], start=1):
    ax = plt.subplot(4, 2, slot)
    ax.set_title(state, fontsize=14)
    ax.hist(graph.query(f'sigla_uf == "{state}"')['proportion'],
            bins=20, facecolor='#1a476f', edgecolor='#0d3446', linewidth=0.5)
    if slot % 2 == 1:
        # odd slots form the left-hand column
        ax.set_ylabel('Frequency', fontsize=10)
    if slot >= 6:
        # slots 6 and 7 are the lowest panel of each column
        ax.set_xlabel('% difference from SICONFI data', fontsize=10)
    ax.set_xticks([int(label) for label in lista_labels])
    ax.set_xticklabels(tick_text)

plt.subplots_adjust(left=0.125, bottom=0.1, right=0.9, top=0.9, wspace=0.2, hspace=0.35)
plt.savefig(os.path.join(fig_output, 'validation_siconfi_commitment.pdf'), bbox_inches='tight')

In [None]:
# Commitment totals by year, municipality and function, validated against SICONFI.
# One histogram per state shows the percentage gap between our totals and SICONFI, measured
# relative to our own totals: positive values mean our data sits above the National Treasury
# figure (overestimate), negative values mean it sits below (underestimate).

commitment_siconfi_function = pd.read_csv(os.path.join(input_path, 'commitment_function_municipality_year.csv'))
# Cap the gap at +/-25 so all outliers pile up in the outermost bins ('<-25' and '25>')
commitment_siconfi_function['proportion'] = commitment_siconfi_function['proportion'].clip(lower=-25, upper=25)

graph = commitment_siconfi_function.copy()

lista_labels = ['-25', '-20', '-15', '-10', '-5', '0', '5', '10', '15', '20', '25']
tick_text = ['<-25', '-20', '-15', '-10', '-5', '0', '5', '10', '15', '20', '25>']

plt.figure(figsize=(15, 9))

# Panels fill a 4x2 grid row by row; the eighth slot stays empty
for slot, state in enumerate(['CE', 'MG', 'PB', 'PE', 'PR', 'RS', 'SP'], start=1):
    ax = plt.subplot(4, 2, slot)
    ax.set_title(state, fontsize=14)
    ax.hist(graph.query(f'sigla_uf == "{state}"')['proportion'],
            bins=20, facecolor='#1a476f', edgecolor='#0d3446', linewidth=0.5)
    if slot % 2 == 1:
        # odd slots form the left-hand column
        ax.set_ylabel('Frequency')
    if slot >= 6:
        # slots 6 and 7 are the lowest panel of each column
        ax.set_xlabel('% difference from SICONFI data')
    ax.set_xticks([int(label) for label in lista_labels])
    ax.set_xticklabels(tick_text)

plt.subplots_adjust(left=0.125, bottom=0.1, right=0.9, top=0.9, wspace=0.2, hspace=0.35)
plt.savefig(os.path.join(fig_output, 'validation_siconfi_commitment_function.pdf'), bbox_inches='tight')

## Verification | Year - Municipality

In [None]:
# Total of verifications by year and municipality compared to SICONFI

def run_query_and_save_results():
    """Compare verification totals per year and municipality with SICONFI and save them as CSV.

    The BigQuery query sums ``valor_final`` from ``world_wb_mides.liquidacao`` by year and
    municipality (2003 onwards), sums the SICONFI 'Despesas Liquidadas' values of the
    'Despesas Orçamentárias' account for the seven listed states, and keeps only the
    year/municipality pairs that have a SICONFI total.

    A ``proportion`` column (difference as a percentage of our own verification total) is
    added and the result is written to ``verification_municipality_year.csv`` inside
    ``input_path``.

    Returns:
        None. The only output is the CSV file.
    """

    sql = '''
    WITH verification AS (
      SELECT
        ano,
        id_municipio,
        SUM(valor_final) AS total_verification
      FROM basedosdados.world_wb_mides.liquidacao
      WHERE ano >= 2003
      GROUP BY 1,2
    ),
      siconfi AS (
      SELECT
      ano,
      sigla_uf,
      id_municipio,
      SUM(valor) AS total_siconfi
      FROM `basedosdados.br_me_siconfi.municipio_despesas_orcamentarias`
      WHERE sigla_uf IN ('CE', 'MG', 'PB', 'PE', 'PR', 'RS', 'SP') AND ano >= 2003 AND estagio_bd = 'Despesas Liquidadas' AND conta_bd = 'Despesas Orçamentárias'
      GROUP BY 1,2,3
      ORDER BY 1
    )

    SELECT
      e.ano,
      sigla_uf,
      e.id_municipio,
      total_verification,
      total_siconfi,
      ROUND(total_verification - total_siconfi,2) AS variation
    FROM verification e
    LEFT JOIN siconfi s ON e.ano=s.ano AND e.id_municipio=s.id_municipio
    WHERE total_siconfi IS NOT NULL
    '''

    verified = bd.read_sql(sql, billing_project_id=project_id_bq)

    # Percentage gap: (ours - SICONFI) relative to our own verification total
    scaled_variation = 100 * verified['variation']
    verified['proportion'] = scaled_variation / verified['total_verification']

    # Discard rows whose percentage came out as -inf
    verified = verified.query('proportion != -inf')

    # RS rows before 2010 are left out
    rs_before_2010 = (verified['sigla_uf'] == 'RS') & (verified['ano'] < 2010)
    verified = verified.drop(verified[rs_before_2010].index)

    csv_path = os.path.join(input_path, 'verification_municipality_year.csv')
    verified.to_csv(csv_path, index=False, na_rep='', float_format='%.2f')

# Skip the BigQuery call unless run directly with run_query equal to the string 'True'
if __name__ == '__main__' and run_query == 'True':
    run_query_and_save_results()

In [None]:
# Total of verification by year, municipality and function compared to SICONFI

def run_query_and_save_results():
    """Compare verification totals per year, municipality and function with SICONFI; save a CSV.

    In the BigQuery query, verification amounts (``world_wb_mides.liquidacao``) are summed
    per ``id_empenho_bd`` and attached to the commitments (``world_wb_mides.empenho``) that
    share that id, so each amount is attributed to the year, state, municipality and
    ``funcao`` of its commitment. Rows without ``id_empenho_bd`` are ignored on both sides.
    On the SICONFI side ('Despesas Liquidadas' rows of ``municipio_despesas_funcao``, 2003
    onwards, seven listed states) each account name in ``conta_bd`` is translated to the
    code string matched against ``funcao``. Only combinations with a SICONFI total are kept.

    A ``proportion`` column (difference as a percentage of our own verification total) is
    added and the result is written to ``verification_function_municipality_year.csv``
    inside ``input_path``.

    Returns:
        None. The only output is the CSV file.
    """

    sql = '''
    WITH verification AS (
      SELECT
        id_empenho_bd,
        SUM(valor_final) AS total_verification
      FROM basedosdados.world_wb_mides.liquidacao
      GROUP BY 1
      HAVING id_empenho_bd IS NOT NULL
    ),
      commitment AS (
      SELECT
        ano,
        sigla_uf,
        id_municipio,
        id_empenho_bd,
        funcao,
        SUM (valor_final) AS total_commitment
      FROM basedosdados.world_wb_mides.empenho
      GROUP BY 1,2,3,4,5
      HAVING id_empenho_bd IS NOT NULL
    ),
      verification_function AS (
        SELECT
          ano,
          sigla_uf,
          id_municipio,
          funcao,
          SUM(p.total_verification) AS total_verification
        FROM commitment c
        LEFT JOIN verification p ON c.id_empenho_bd = p.id_empenho_bd
        GROUP BY 1,2,3,4
    ),
     siconfi AS (
      SELECT
        ano,
        sigla_uf,
        id_municipio,
        CASE WHEN (UPPER(conta_bd) = 'LEGISLATIVA')             THEN '1'
             WHEN (UPPER(conta_bd) = 'JUDICIÁRIA')              THEN '2'
             WHEN (UPPER(conta_bd) = 'ESSENCIAL À JUSTIÇA')     THEN '3'
             WHEN (UPPER(conta_bd) = 'ADMINISTRAÇÃO')           THEN '4'
             WHEN (UPPER(conta_bd) = 'DEFESA NACIONAL')         THEN '5'
             WHEN (UPPER(conta_bd) = 'SEGURANÇA PÚBLICA')       THEN '6'
             WHEN (UPPER(conta_bd) = 'RELAÇÕES EXTERIORES')     THEN '7'
             WHEN (UPPER(conta_bd) = 'ASSISTÊNCIA SOCIAL')      THEN '8'
             WHEN (UPPER(conta_bd) = 'PREVIDÊNCIA SOCIAL')      THEN '9'
             WHEN (UPPER(conta_bd) = 'SAÚDE')                   THEN '10'
             WHEN (UPPER(conta_bd) = 'TRABALHO')                THEN '11'
             WHEN (UPPER(conta_bd) = 'EDUCAÇÃO')                THEN '12'
             WHEN (UPPER(conta_bd) = 'CULTURA')                 THEN '13'
             WHEN (UPPER(conta_bd) = 'DIREITOS DA CIDADANIA')   THEN '14'
             WHEN (UPPER(conta_bd) = 'URBANISMO')               THEN '15'
             WHEN (UPPER(conta_bd) = 'HABITAÇÃO')               THEN '16'
             WHEN (UPPER(conta_bd) = 'SANEAMENTO')              THEN '17'
             WHEN (UPPER(conta_bd) = 'GESTÃO AMBIENTAL')        THEN '18'
             WHEN (UPPER(conta_bd) = 'CIÊNCIA E TECNOLOGIA')    THEN '19'
             WHEN (UPPER(conta_bd) = 'AGRICULTURA')             THEN '20'
             WHEN (UPPER(conta_bd) = 'ORGANIZAÇÃO AGRÁRIA')     THEN '21'
             WHEN (UPPER(conta_bd) = 'INDÚSTRIA')               THEN '22'
             WHEN (UPPER(conta_bd) = 'COMÉRCIO E SERVIÇOS')     THEN '23'
             WHEN (UPPER(conta_bd) = 'COMUNICAÇÕES')            THEN '24'
             WHEN (UPPER(conta_bd) = 'ENERGIA')                 THEN '25'
             WHEN (UPPER(conta_bd) = 'TRANSPORTE')              THEN '26'
             WHEN (UPPER(conta_bd) = 'DESPORTO E LAZER')        THEN '27'
             WHEN (UPPER(conta_bd) = 'ENCARGOS ESPECIAIS')      THEN '28'
             WHEN (UPPER(conta_bd) = 'RESERVA DE CONTINGÊNCIA') THEN '99'
        END AS funcao,
        SUM (valor) AS total_siconfi
      FROM `basedosdados.br_me_siconfi.municipio_despesas_funcao`
      WHERE ano >= 2003 AND sigla_uf in ('MG', 'PB', 'RS', 'SP', 'PR','CE', 'PE') AND estagio_bd = 'Despesas Liquidadas'
      GROUP BY 1,2,3,4
      ORDER BY 1
    )

    SELECT
      e.ano,
      e.sigla_uf,
      e.id_municipio,
      e.funcao,
      total_verification,
      total_siconfi,
      ROUND((total_verification - total_siconfi),2) AS variation
    FROM verification_function e
    LEFT JOIN siconfi s ON e.ano=s.ano AND e.sigla_uf=s.sigla_uf AND e.id_municipio=s.id_municipio AND e.funcao=s.funcao
    WHERE total_siconfi IS NOT NULL
    ORDER BY variation DESC
    '''

    by_function = bd.read_sql(sql, billing_project_id=project_id_bq)

    # Percentage gap: (ours - SICONFI) relative to our own verification total
    scaled_variation = 100 * by_function['variation']
    by_function['proportion'] = scaled_variation / by_function['total_verification']

    # Discard rows whose percentage came out as -inf
    by_function = by_function.query('proportion != -inf')
    # NOTE(review): the year-municipality queries in this file also drop RS rows before 2010;
    # no such filter is applied here. Confirm whether that is intentional.

    csv_path = os.path.join(input_path, 'verification_function_municipality_year.csv')
    by_function.to_csv(csv_path, index=False, na_rep='', float_format='%.2f')

# Only a direct run with run_query set to the string 'True' triggers the query
if __name__ == '__main__' and run_query == 'True':
    run_query_and_save_results()

In [None]:
# Verification totals by year and municipality, validated against SICONFI.
# One histogram per state shows the percentage gap between our totals and SICONFI, measured
# relative to our own totals: positive values mean our data sits above the National Treasury
# figure (overestimate), negative values mean it sits below (underestimate).

verification_siconfi = pd.read_csv(os.path.join(input_path, 'verification_municipality_year.csv'))
# Cap the gap at +/-25 so all outliers pile up in the outermost bins ('<-25' and '25>')
verification_siconfi['proportion'] = verification_siconfi['proportion'].clip(lower=-25, upper=25)

graph = verification_siconfi.copy()

lista_labels = ['-25', '-20', '-15', '-10', '-5', '0', '5', '10', '15', '20', '25']
tick_text = ['<-25', '-20', '-15', '-10', '-5', '0', '5', '10', '15', '20', '25>']

plt.figure(figsize=(15, 9))

# Panels fill a 4x2 grid row by row; the eighth slot stays empty
for slot, state in enumerate(['CE', 'MG', 'PB', 'PE', 'PR', 'RS', 'SP'], start=1):
    ax = plt.subplot(4, 2, slot)
    ax.set_title(state, fontsize=14)
    ax.hist(graph.query(f'sigla_uf == "{state}"')['proportion'],
            bins=20, facecolor='#1a476f', edgecolor='#0d3446', linewidth=0.5)
    if slot % 2 == 1:
        # odd slots form the left-hand column
        ax.set_ylabel('Frequency')
    if slot >= 6:
        # slots 6 and 7 are the lowest panel of each column
        ax.set_xlabel('% difference from SICONFI data')
    ax.set_xticks([int(label) for label in lista_labels])
    ax.set_xticklabels(tick_text)

plt.subplots_adjust(left=0.125, bottom=0.1, right=0.9, top=0.9, wspace=0.2, hspace=0.35)
plt.savefig(os.path.join(fig_output, 'validation_siconfi_verification_v2.pdf'), bbox_inches='tight')

In [None]:
# Verification totals by year, municipality and function, validated against SICONFI.
# One histogram per state shows the percentage gap between our totals and SICONFI, measured
# relative to our own totals: positive values mean our data sits above the National Treasury
# figure (overestimate), negative values mean it sits below (underestimate).
# PE is not included here because its data does not have a primary key (id_empenho_bd).

verification_siconfi_function = pd.read_csv(os.path.join(input_path, 'verification_function_municipality_year.csv'))
# Cap the gap at +/-25 so all outliers pile up in the outermost bins ('<-25' and '25>')
verification_siconfi_function['proportion'] = np.where((verification_siconfi_function['proportion']) <= -25, -25, verification_siconfi_function['proportion'])
verification_siconfi_function['proportion'] = np.where((verification_siconfi_function['proportion']) >= 25, 25, verification_siconfi_function['proportion'])
graph = verification_siconfi_function.copy()

lista_labels = ['-25', '-20', '-15', '-10', '-5', '0', '5', '10', '15', '20', '25']
tick_text = ['<-25', '-20', '-15', '-10', '-5', '0', '5', '10', '15', '20', '25>']

plt.figure(figsize=(15, 9))

# Six panels fill the first three rows of the 4x2 grid, row by row
for slot, state in enumerate(['CE', 'MG', 'PB', 'PR', 'RS', 'SP'], start=1):
    ax = plt.subplot(4, 2, slot)
    ax.set_title(state, fontsize=14)
    ax.hist(graph.query(f'sigla_uf == "{state}"')['proportion'],
            bins=20, facecolor='#1a476f', edgecolor='#0d3446', linewidth=0.5)
    if slot % 2 == 1:
        # The y-label belongs on the left-hand column (odd slots: CE, PB, RS). With PE
        # removed, the original labelled PR and SP in the right column and skipped RS.
        ax.set_ylabel('Frequency')
    if slot >= 5:
        # slots 5 and 6 are the lowest panel of each column
        ax.set_xlabel('% difference from SICONFI data')
    ax.set_xticks([int(label) for label in lista_labels])
    ax.set_xticklabels(tick_text)

plt.subplots_adjust(left=0.125, bottom=0.1, right=0.9, top=0.9, wspace=0.2, hspace=0.35)
plt.savefig(os.path.join(fig_output, 'validation_siconfi_verification_function.pdf'), bbox_inches='tight')

## Payment | Year - Municipality

In [None]:
# Total of payment by year and municipality compared to SICONFI

def run_query_and_save_results():
    """Compare payment totals per year and municipality with SICONFI and save them as CSV.

    The BigQuery query sums ``valor_final`` and ``valor_liquido_recebido`` from
    ``world_wb_mides.pagamento`` by year and municipality (2003 onwards), sums the SICONFI
    'Despesas Pagas' values of the 'Despesas Orçamentárias' account for the seven listed
    states, and keeps the rows that have a SICONFI total.

    A ``proportion`` column (difference as a percentage of our own payment total) is added
    and the result is written to ``payment_municipality_year.csv`` inside ``input_path``.

    Returns:
        None. The only output is the CSV file.
    """

    # NOTE(review): this is the only comparison here that uses FULL OUTER JOIN. Together with
    # the 'total_siconfi IS NOT NULL' filter it keeps SICONFI rows that have no payment match,
    # and for those rows e.ano / e.id_municipio are NULL. Confirm this is intended.
    sql = '''
    WITH payment AS (
      SELECT
        ano,
        id_municipio,
        SUM(valor_final) AS total_payment,
        SUM(valor_liquido_recebido) AS total_net_payment,
      FROM basedosdados.world_wb_mides.pagamento
      WHERE ano >= 2003
      GROUP BY 1,2
    ),
      siconfi AS (
      SELECT
      ano,
      sigla_uf,
      id_municipio,
      SUM(valor) AS total_siconfi
      FROM `basedosdados.br_me_siconfi.municipio_despesas_orcamentarias`
      WHERE sigla_uf IN ('CE', 'MG', 'PB', 'PE', 'PR', 'RS', 'SP') AND ano >= 2003 AND estagio_bd = 'Despesas Pagas' AND conta_bd = 'Despesas Orçamentárias'
      GROUP BY 1,2,3
      ORDER BY 1
    )

    SELECT
      e.ano,
      sigla_uf,
      e.id_municipio,
      total_payment,
      total_net_payment,
      total_siconfi,
      ROUND(total_payment - total_siconfi,2) AS variation
    FROM payment e
    FULL OUTER JOIN siconfi s ON e.ano=s.ano AND e.id_municipio=s.id_municipio
    WHERE total_siconfi IS NOT NULL
    '''

    payments = bd.read_sql(sql, billing_project_id=project_id_bq)

    # Percentage gap: (ours - SICONFI) relative to our own payment total
    scaled_variation = 100 * payments['variation']
    payments['proportion'] = scaled_variation / payments['total_payment']

    # Discard rows whose percentage came out as -inf
    payments = payments.query('proportion != -inf')

    # RS rows before 2010 are left out
    rs_before_2010 = (payments['sigla_uf'] == 'RS') & (payments['ano'] < 2010)
    payments = payments.drop(payments[rs_before_2010].index)

    csv_path = os.path.join(input_path, 'payment_municipality_year.csv')
    payments.to_csv(csv_path, index=False, na_rep='', float_format='%.2f')

# Query BigQuery only when run directly and run_query is the string 'True'
if __name__ == '__main__' and run_query == 'True':
    run_query_and_save_results()

In [None]:
# Total of payment by year, municipality and function compared to SICONFI

def run_query_and_save_results():
    """Compare our payment totals with SICONFI by year, municipality and function.

    Payments carry no function of their own in this query, so they are summed per
    ``id_empenho_bd`` and attached to the matching commitment, which provides ``ano``,
    ``sigla_uf``, ``id_municipio`` and ``funcao``. The SICONFI function names are mapped to
    the same numeric codes with the CASE expression, and both totals are compared per
    year / state / municipality / function. Rows without ``id_empenho_bd`` are left out.

    The query is executed with ``bd.read_sql`` (billed to ``project_id_bq``) and the result
    is written to ``payment_function_municipality_year.csv`` inside ``input_path``.
    """

    query = '''
    WITH payment AS (
      SELECT
        id_empenho_bd,
        SUM(valor_final) AS total_payment
      FROM basedosdados.world_wb_mides.pagamento
      GROUP BY 1
      HAVING id_empenho_bd IS NOT NULL
    ),
      commitment AS (
      SELECT
        ano,
        sigla_uf,
        id_municipio,
        id_empenho_bd,
        funcao,
        SUM (valor_final) AS total_commitment
      FROM basedosdados.world_wb_mides.empenho
      GROUP BY 1,2,3,4,5
      HAVING id_empenho_bd IS NOT NULL
    ),
      payment_function AS (
        SELECT
          ano,
          sigla_uf,
          id_municipio,
          funcao,
          SUM(p.total_payment) AS total_payment
        FROM commitment c
        LEFT JOIN payment p ON c.id_empenho_bd = p.id_empenho_bd
        GROUP BY 1,2,3,4
    ),
     siconfi AS (
      SELECT
        ano,
        sigla_uf,
        id_municipio,
        CASE WHEN (UPPER(conta_bd) = 'LEGISLATIVA')             THEN '1'
             WHEN (UPPER(conta_bd) = 'JUDICIÁRIA')              THEN '2'
             WHEN (UPPER(conta_bd) = 'ESSENCIAL À JUSTIÇA')     THEN '3'
             WHEN (UPPER(conta_bd) = 'ADMINISTRAÇÃO')           THEN '4'
             WHEN (UPPER(conta_bd) = 'DEFESA NACIONAL')         THEN '5'
             WHEN (UPPER(conta_bd) = 'SEGURANÇA PÚBLICA')       THEN '6'
             WHEN (UPPER(conta_bd) = 'RELAÇÕES EXTERIORES')     THEN '7'
             WHEN (UPPER(conta_bd) = 'ASSISTÊNCIA SOCIAL')      THEN '8'
             WHEN (UPPER(conta_bd) = 'PREVIDÊNCIA SOCIAL')      THEN '9'
             WHEN (UPPER(conta_bd) = 'SAÚDE')                   THEN '10'
             WHEN (UPPER(conta_bd) = 'TRABALHO')                THEN '11'
             WHEN (UPPER(conta_bd) = 'EDUCAÇÃO')                THEN '12'
             WHEN (UPPER(conta_bd) = 'CULTURA')                 THEN '13'
             WHEN (UPPER(conta_bd) = 'DIREITOS DA CIDADANIA')   THEN '14'
             WHEN (UPPER(conta_bd) = 'URBANISMO')               THEN '15'
             WHEN (UPPER(conta_bd) = 'HABITAÇÃO')               THEN '16'
             WHEN (UPPER(conta_bd) = 'SANEAMENTO')              THEN '17'
             WHEN (UPPER(conta_bd) = 'GESTÃO AMBIENTAL')        THEN '18'
             WHEN (UPPER(conta_bd) = 'CIÊNCIA E TECNOLOGIA')    THEN '19'
             WHEN (UPPER(conta_bd) = 'AGRICULTURA')             THEN '20'
             WHEN (UPPER(conta_bd) = 'ORGANIZAÇÃO AGRÁRIA')     THEN '21'
             WHEN (UPPER(conta_bd) = 'INDÚSTRIA')               THEN '22'
             WHEN (UPPER(conta_bd) = 'COMÉRCIO E SERVIÇOS')     THEN '23'
             WHEN (UPPER(conta_bd) = 'COMUNICAÇÕES')            THEN '24'
             WHEN (UPPER(conta_bd) = 'ENERGIA')                 THEN '25'
             WHEN (UPPER(conta_bd) = 'TRANSPORTE')              THEN '26'
             WHEN (UPPER(conta_bd) = 'DESPORTO E LAZER')        THEN '27'
             WHEN (UPPER(conta_bd) = 'ENCARGOS ESPECIAIS')      THEN '28'
             WHEN (UPPER(conta_bd) = 'RESERVA DE CONTINGÊNCIA') THEN '99'
        END AS funcao,
        SUM (valor) AS total_siconfi
      FROM `basedosdados.br_me_siconfi.municipio_despesas_funcao`
      WHERE ano >= 2003 AND sigla_uf in ('MG', 'PB', 'RS', 'SP', 'PR','CE', 'PE') AND estagio_bd = 'Despesas Pagas'
      GROUP BY 1,2,3,4
      ORDER BY 1
    )

    SELECT
      e.ano,
      e.sigla_uf,
      e.id_municipio,
      e.funcao,
      total_payment,
      total_siconfi,
      ROUND((total_payment - total_siconfi),2) AS variation
    FROM payment_function e
    LEFT JOIN siconfi s ON e.ano=s.ano AND e.sigla_uf=s.sigla_uf AND e.id_municipio=s.id_municipio AND e.funcao=s.funcao
    WHERE total_siconfi IS NOT NULL
    ORDER BY variation DESC
    '''

    payment_siconfi_function = bd.read_sql(query, billing_project_id=project_id_bq)
    # 'proportion' is the payment-minus-SICONFI variation as a percentage of our total payment
    # (the frame name was misspelled here before, which raised NameError before the CSV was written)
    payment_siconfi_function['proportion'] = 100*payment_siconfi_function['variation']/payment_siconfi_function['total_payment']
    # Discard the rows whose proportion came out as -inf
    payment_siconfi_function = payment_siconfi_function.query('proportion != -inf')

    payment_siconfi_function.to_csv(os.path.join(input_path,'payment_function_municipality_year.csv'), index=False, na_rep='', float_format='%.2f')

# Hit BigQuery only when executed as a script and the run_query flag is the string 'True'
if __name__ == '__main__' and run_query == 'True':
    run_query_and_save_results()

In [None]:
# Payments by year and municipality
# One histogram per state of the 'proportion' column, i.e. the discrepancy between our database
# and the SICONFI data in proportion to our data: it shows whether our data overestimates or
# underestimates the results in relation to the National Treasury

payment_siconfi = pd.read_csv(os.path.join(input_path, 'payment_municipality_year.csv'))
# Everything at or beyond +/-25 is piled up on the outermost values
payment_siconfi['proportion'] = payment_siconfi['proportion'].clip(lower=-25, upper=25)
graph = payment_siconfi.copy()

# Tick positions and their captions, shared by every panel
lista_labels = ['-25', '-20', '-15', '-10', '-5', '0', '5', '10', '15', '20', '25']
tick_positions = [int(label) for label in lista_labels]
tick_captions = ['<-25', '-20', '-15', '-10', '-5', '0', '5', '10', '15', '20', '25>']

# (state, draw the y label, draw the x label), listed in subplot order
panels = [
    ('CE', True, False),
    ('MG', False, False),
    ('PB', True, False),
    ('PE', False, False),
    ('PR', True, False),
    ('RS', False, True),
    ('SP', True, True),
]

plt.figure(figsize=(15, 9))

for position, (uf, with_ylabel, with_xlabel) in enumerate(panels, start=1):
    ax = plt.subplot(4, 2, position)
    ax.set_title(uf, fontsize=14)
    state_rows = graph.query(f'sigla_uf == "{uf}"')
    ax.hist(state_rows['proportion'],
            bins=20, facecolor='#1a476f', edgecolor='#0d3446', linewidth=0.5)
    if with_ylabel:
        ax.set_ylabel('Frequency')
    if with_xlabel:
        ax.set_xlabel('% difference from SICONFI data')
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_captions)

plt.subplots_adjust(left=0.125, bottom=0.1, right=0.9, top=0.9, wspace=0.2, hspace=0.35)
plt.savefig(os.path.join(fig_output, 'validation_siconfi_payment_v2.pdf'), bbox_inches='tight')

In [None]:
# Payments by year, municipality and function
# One histogram per state of the 'proportion' column, i.e. the discrepancy between our database
# and the SICONFI data in proportion to our data: it shows whether our data overestimates or
# underestimates the results in relation to the National Treasury.
# PE is not included here because its data does not have a primary key (id_empenho_bd)

payment_siconfi_function = pd.read_csv(os.path.join(input_path, 'payment_function_municipality_year.csv'))
# Everything at or beyond +/-25 is piled up on the outermost values
payment_siconfi_function['proportion'] = payment_siconfi_function['proportion'].clip(lower=-25, upper=25)
graph = payment_siconfi_function.copy()

# Tick positions and their captions, shared by every panel
lista_labels = ['-25', '-20', '-15', '-10', '-5', '0', '5', '10', '15', '20', '25']
tick_positions = [int(label) for label in lista_labels]
tick_captions = ['<-25', '-20', '-15', '-10', '-5', '0', '5', '10', '15', '20', '25>']

states = ['CE', 'MG', 'PB', 'PR', 'RS', 'SP']
n_cols = 2

plt.figure(figsize=(15, 9))

for position, uf in enumerate(states, start=1):
    ax = plt.subplot(4, n_cols, position)
    ax.set_title(uf, fontsize=14)
    ax.hist(graph.loc[graph['sigla_uf'] == uf, 'proportion'],
            bins=20, facecolor='#1a476f', edgecolor='#0d3446', linewidth=0.5)
    # The y label goes on the left-hand column only. It used to follow the 7-state layout,
    # which put it on PR and SP (right column) and left RS (left column) without one
    if position % n_cols == 1:
        ax.set_ylabel('Frequency')
    # The x label goes on the last row of panels only
    if position > len(states) - n_cols:
        ax.set_xlabel('% difference from SICONFI data')
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_captions)

plt.subplots_adjust(left=0.125, bottom=0.1, right=0.9, top=0.9, wspace=0.2, hspace=0.35)
plt.savefig(os.path.join(fig_output, 'validation_siconfi_payment_function.pdf'), bbox_inches='tight')

In [None]:
# The case of Minas Gerais and Paraná
# Although our data is overestimated in relation to the SICONFI data, we show that the
# behaviour is similar between years

# Paraná (PR)
payment_siconfi = pd.read_csv(os.path.join(input_path, 'payment_municipality_year.csv'))
# .copy() gives an independent frame, so the 'ano' assignment below does not write into a
# filtered selection of payment_siconfi (which triggers pandas' SettingWithCopyWarning)
graph = payment_siconfi.query('sigla_uf == "PR" & proportion > -25 & proportion < 25').copy()
graph['ano'] = graph['ano'].astype(int)

# Tick positions and their captions, shared by every panel
lista_labels = ['-25', '-20', '-15', '-10', '-5', '0', '5', '10', '15', '20', '25']
tick_positions = [int(label) for label in lista_labels]
tick_captions = ['<-25', '-20', '-15', '-10', '-5', '0', '5', '10', '15', '20', '25>']

years = list(range(2013, 2021))  # 2013 to 2020, one panel per year
n_cols = 2

plt.figure(figsize=(15, 9))

for position, year in enumerate(years, start=1):
    ax = plt.subplot(4, n_cols, position)
    ax.set_title(str(year), fontsize=14)
    ax.hist(graph.loc[graph['ano'] == year, 'proportion'],
            bins=20, facecolor='#1a476f', edgecolor='#0d3446', linewidth=0.5)
    # y label on the left-hand column only
    if position % n_cols == 1:
        ax.set_ylabel('Frequency')
    # x label on the last row only
    if position > len(years) - n_cols:
        ax.set_xlabel('% difference from SICONFI data - PR')
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_captions)

plt.subplots_adjust(left=0.125, bottom=0.1, right=0.9, top=0.9, wspace=0.2, hspace=0.35)
plt.savefig(os.path.join(fig_output, 'validation_siconfi_payment_pr.pdf'), bbox_inches='tight')

In [None]:
# Minas Gerais (MG)
# Expects payment_siconfi to be loaded already (the Paraná cell above reads it from
# payment_municipality_year.csv)
# .copy() gives an independent frame, so the 'ano' assignment below does not write into a
# filtered selection of payment_siconfi (which triggers pandas' SettingWithCopyWarning)
graph = payment_siconfi.query('sigla_uf == "MG" & proportion > -25 & proportion < 25').copy()
graph['ano'] = graph['ano'].astype(int)

# Tick positions and their captions, shared by every panel. The 2017 panel used its own
# plt.xticks([-5, 0, 5, 10, 15, 20, 25]) before, so its x axis was not comparable with the
# other years; it now gets the same ticks as the rest
lista_labels = ['-25', '-20', '-15', '-10', '-5', '0', '5', '10', '15', '20', '25']
tick_positions = [int(label) for label in lista_labels]
tick_captions = ['<-25', '-20', '-15', '-10', '-5', '0', '5', '10', '15', '20', '25>']

years = list(range(2014, 2022))  # 2014 to 2021, one panel per year
n_cols = 2

plt.figure(figsize=(15, 9))

for position, year in enumerate(years, start=1):
    ax = plt.subplot(4, n_cols, position)
    ax.set_title(str(year), fontsize=14)
    ax.hist(graph.loc[graph['ano'] == year, 'proportion'],
            bins=20, facecolor='#1a476f', edgecolor='#0d3446', linewidth=0.5)
    # y label on the left-hand column only
    if position % n_cols == 1:
        ax.set_ylabel('Frequency')
    # x label on the last row only
    if position > len(years) - n_cols:
        ax.set_xlabel('% difference from SICONFI data - MG')
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_captions)

plt.subplots_adjust(left=0.125, bottom=0.1, right=0.9, top=0.9, wspace=0.2, hspace=0.35)
plt.savefig(os.path.join(fig_output, 'validation_siconfi_payment_mg.pdf'), bbox_inches='tight')