In [None]:
# Directories (taken from environment variables; each is None when unset)
input_path = os.getenv('input_path')
fig_output = os.getenv('fig_output')
tab_output = os.getenv('tab_output')

# Query configuration
project_id_bq = os.getenv('project_id_bq')  # passed to bd.read_sql as billing_project_id
run_query = os.getenv('run_query')  # the queries only run when this is the string 'True'

In [None]:
# Merge commitment and verification
# Necessary to get the percentage of unique commitments that are both in commitment (empenho) and verification (liquidacao)
# We use it to construct the variable 'Has verification information'

def run_query_and_save_results():
    """Query per-state commitment/verification coverage and save it as CSV.

    For every state (``sigla_uf``) the query returns:

    * ``distinct_commitments``: distinct non-null ``id_empenho_bd`` values;
    * ``number_municipalities``: distinct ``id_municipio`` values;
    * ``procurement_commitments``: distinct commitments whose
      ``elemento_despesa`` has '30', '32' or '52' at characters 5-6;
    * ``has_verification_information``: distinct commitment ids that also
      appear in the verification table, divided by ``distinct_commitments``.

    Rows for RS are only kept when ``ano > 2009``. The result is written to
    ``empenho_liquidacao.csv`` inside ``input_path``.
    """

    query = '''
    SELECT
        e.sigla_uf,
        COUNT(DISTINCT e.id_empenho_bd) as distinct_commitments,
        COUNT(DISTINCT e.id_municipio) as number_municipalities,
        COUNT(DISTINCT CASE WHEN SUBSTR(e.elemento_despesa, 5, 2) IN ('30', '32', '52') THEN e.id_empenho_bd END) as procurement_commitments,
        COUNT(DISTINCT CASE WHEN l.id_empenho_bd IS NOT NULL THEN l.id_empenho_bd END) / COUNT(DISTINCT e.id_empenho_bd) as has_verification_information
    FROM (
        SELECT *
        FROM basedosdados.world_wb_mides.empenho
        WHERE id_empenho_bd IS NOT NULL
    ) e
    LEFT JOIN (
        SELECT *
        FROM basedosdados.world_wb_mides.liquidacao
        WHERE id_empenho_bd IS NOT NULL
    ) l
    ON e.id_empenho_bd = l.id_empenho_bd
    WHERE (e.sigla_uf <> 'RS' OR (e.sigla_uf = 'RS' AND e.ano > 2009))
    GROUP BY e.sigla_uf

    '''
    empenho_liquidacao = bd.read_sql(query, billing_project_id=project_id_bq)

    # has_verification_information is a share between 0 and 1 that the
    # descriptive-statistics cell multiplies by 100 and reports with one
    # decimal place. Two decimals here ('%.2f') would only resolve whole
    # percentage points, so the share is stored with six decimals instead.
    empenho_liquidacao.to_csv(os.path.join(input_path,'empenho_liquidacao.csv'), index=False, na_rep='', float_format='%.6f')

if __name__ == '__main__':
    if run_query == 'True':
        run_query_and_save_results()

In [None]:
# Merge commitment and payment
# Necessary to get the percentage of unique commitments that are both in commitment (empenho) and payment (pagamento)
# We use it to construct the variable 'Has payment information'

def run_query_and_save_results():
    """Query per-state commitment/payment coverage and save it as CSV.

    For every state (``sigla_uf``) the query returns
    ``has_payment_information``: the number of distinct commitment ids
    (``id_empenho_bd``) that also appear in the payment table, divided by the
    number of distinct non-null commitment ids. Rows for RS are only kept when
    ``ano > 2009``. The result is written to ``empenho_pagamento.csv`` inside
    ``input_path``.
    """

    query = '''
    SELECT
        e.sigla_uf,
        COUNT(DISTINCT CASE WHEN p.id_empenho_bd IS NOT NULL THEN p.id_empenho_bd END) / COUNT(DISTINCT e.id_empenho_bd) as has_payment_information
    FROM (
        SELECT *
        FROM basedosdados.world_wb_mides.empenho
        WHERE id_empenho_bd IS NOT NULL
    ) e
    LEFT JOIN (
        SELECT *
        FROM basedosdados.world_wb_mides.pagamento
        WHERE id_empenho_bd IS NOT NULL
    ) p
    ON e.id_empenho_bd = p.id_empenho_bd
    WHERE (e.sigla_uf <> 'RS' OR (e.sigla_uf = 'RS' AND e.ano > 2009))
    GROUP BY e.sigla_uf

    '''
    empenho_pagamento = bd.read_sql(query, billing_project_id=project_id_bq)

    # has_payment_information is a share between 0 and 1 that the
    # descriptive-statistics cell multiplies by 100 and reports with one
    # decimal place. Two decimals here ('%.2f') would only resolve whole
    # percentage points, so the share is stored with six decimals instead.
    empenho_pagamento.to_csv(os.path.join(input_path,'empenho_pagamento.csv'), index=False, na_rep='', float_format='%.6f')

if __name__ == '__main__':
    if run_query == 'True':
        run_query_and_save_results()

In [None]:
# Count commitment observations

def run_query_and_save_results():
    """Count commitment rows per state and save the result as CSV.

    Per ``sigla_uf`` the query returns the total number of rows in the
    commitment table (``obs_commitments``) and the number of rows with
    ``valor_final > 0`` and a non-null ``id_empenho_bd``
    (``total_positive_values``). Rows for RS are only kept when ``ano > 2009``.
    The output file is ``empenho.csv`` inside ``input_path``.
    """
    sql = '''
    SELECT
        sigla_uf,
        COUNT(*) as obs_commitments,
        SUM(CASE WHEN valor_final > 0 AND id_empenho_bd IS NOT NULL THEN 1 ELSE 0 END) as total_positive_values,
    FROM
      basedosdados.world_wb_mides.empenho
    WHERE (sigla_uf <> 'RS' OR (sigla_uf = 'RS' AND ano > 2009))
    GROUP BY sigla_uf

    '''
    resultado = bd.read_sql(sql, billing_project_id=project_id_bq)

    destino = os.path.join(input_path, 'empenho.csv')
    resultado.to_csv(destino, index=False, na_rep='', float_format='%.2f')


# Only hit BigQuery when executed as the main program and explicitly enabled.
if __name__ == '__main__' and run_query == 'True':
    run_query_and_save_results()

In [None]:
# Count number of distinct verifications

def run_query_and_save_results():
    """Count verification rows and distinct verification ids per state.

    Per ``sigla_uf`` the query returns the total number of rows in the
    verification table (``obs_verifications``) and the number of distinct
    non-null ``id_liquidacao_bd`` values (``distinct_verifications``). Rows
    for RS are only kept when ``ano > 2009``. The output file is
    ``liquidacao.csv`` inside ``input_path``.
    """
    sql = '''
    SELECT
        sigla_uf,
        COUNT(*) as obs_verifications,
        COUNT(DISTINCT CASE WHEN id_liquidacao_bd IS NOT NULL THEN id_liquidacao_bd END) as distinct_verifications,
    FROM
      basedosdados.world_wb_mides.liquidacao
    WHERE (sigla_uf <> 'RS' OR (sigla_uf = 'RS' AND ano > 2009))
    GROUP BY sigla_uf

    '''
    resultado = bd.read_sql(sql, billing_project_id=project_id_bq)

    destino = os.path.join(input_path, 'liquidacao.csv')
    resultado.to_csv(destino, index=False, na_rep='', float_format='%.2f')


# Only hit BigQuery when executed as the main program and explicitly enabled.
if __name__ == '__main__' and run_query == 'True':
    run_query_and_save_results()

In [None]:
# Count number of distinct payments and distinct sellers

def run_query_and_save_results():
    """Count payment rows, distinct payments and distinct sellers per state.

    Per ``sigla_uf`` the query returns the total number of rows in the payment
    table (``obs_payments``), the number of distinct non-null
    ``id_pagamento_bd`` values (``distinct_payments``) and the number of
    distinct ``documento_credor`` + ``nome_credor`` concatenations
    (``distinct_sellers``). Rows for RS are only kept when ``ano > 2009``.
    The output file is ``pagamento.csv`` inside ``input_path``.
    """
    # NOTE(review): CONCAT presumably yields NULL when either credor field is
    # NULL, in which case such sellers are not counted — confirm this is intended.
    sql = '''
    SELECT
        sigla_uf,
        COUNT(*) as obs_payments,
        COUNT(DISTINCT CASE WHEN id_pagamento_bd IS NOT NULL THEN id_pagamento_bd END) as distinct_payments,
        COUNT(DISTINCT CONCAT(documento_credor,nome_credor)) AS distinct_sellers,
    FROM
      basedosdados.world_wb_mides.pagamento
    WHERE (sigla_uf <> 'RS' OR (sigla_uf = 'RS' AND ano > 2009))
    GROUP BY sigla_uf

    '''
    resultado = bd.read_sql(sql, billing_project_id=project_id_bq)

    destino = os.path.join(input_path, 'pagamento.csv')
    resultado.to_csv(destino, index=False, na_rep='', float_format='%.2f')


# Only hit BigQuery when executed as the main program and explicitly enabled.
if __name__ == '__main__' and run_query == 'True':
    run_query_and_save_results()

In [None]:
# Count total amount per state and year

def run_query_and_save_results():
    """Sum payment values per state and year and save the result as CSV.

    Per (``sigla_uf``, ``ano``) the query returns ``total_payment_billion``:
    the sum of ``valor_final`` divided by 1e9. Rows for RS are only kept when
    ``ano > 2009``. The output file is ``total_pagamento_ano.csv`` inside
    ``input_path``; floats are written with two decimal places.
    """
    sql = '''
    SELECT
        ano, sigla_uf,
        SAFE_CAST(SUM(valor_final) / 1e9 AS FLOAT64) AS total_payment_billion,
    FROM
      basedosdados.world_wb_mides.pagamento
    WHERE (sigla_uf <> 'RS' OR (sigla_uf = 'RS' AND ano > 2009))
    GROUP BY sigla_uf, ano

    '''
    resultado = bd.read_sql(sql, billing_project_id=project_id_bq)

    destino = os.path.join(input_path, 'total_pagamento_ano.csv')
    resultado.to_csv(destino, index=False, na_rep='', float_format='%.2f')


# Only hit BigQuery when executed as the main program and explicitly enabled.
if __name__ == '__main__' and run_query == 'True':
    run_query_and_save_results()

In [None]:
# Query for PE: commitment statistics

def run_query_and_save_results():
    """Compute commitment statistics for PE only and save them as CSV.

    The query is restricted to ``sigla_uf = 'PE'`` and returns the number of
    commitment rows (``obs_commitments``), the number of rows with
    ``valor_final > 0`` (``total_positive_values``), the number of distinct
    ``id_municipio`` values (``number_municipalities``) and the number of rows
    with a non-null ``id_empenho`` whose ``elemento_despesa`` has '30', '32'
    or '52' at characters 5-6 (``procurement_commitments``). The output file
    is ``empenho_pe.csv`` inside ``input_path``.
    """
    sql = '''
    SELECT
        sigla_uf,
        COUNT(*) as obs_commitments,
        SUM(CASE WHEN valor_final > 0 THEN 1 ELSE 0 END) as total_positive_values,
        COUNT(DISTINCT id_municipio) as number_municipalities,
        COUNT(CASE WHEN SUBSTR(elemento_despesa, 5, 2) IN ('30', '32', '52') THEN id_empenho END) as procurement_commitments
    FROM basedosdados.world_wb_mides.empenho
    WHERE sigla_uf = 'PE'
    GROUP BY sigla_uf

    '''
    resultado = bd.read_sql(sql, billing_project_id=project_id_bq)

    destino = os.path.join(input_path, 'empenho_pe.csv')
    resultado.to_csv(destino, index=False, na_rep='', float_format='%.2f')


# Only hit BigQuery when executed as the main program and explicitly enabled.
if __name__ == '__main__' and run_query == 'True':
    run_query_and_save_results()

In [None]:
# Query for PE: verification and payment observations

def run_query_and_save_results():
    """Count verification and payment rows for PE and save them as CSV.

    The query counts the rows of the verification table and of the payment
    table with ``sigla_uf = 'PE'`` and left-joins the two counts on
    ``sigla_uf`` (columns ``obs_verifications`` and ``obs_payments``). The
    output file is ``liq_pag_pe.csv`` inside ``input_path``.
    """
    sql = '''

    SELECT
        l.sigla_uf,
        l.obs_verifications,
        p.obs_payments
    FROM (
        SELECT sigla_uf, COUNT(*) AS obs_verifications
        FROM basedosdados.world_wb_mides.liquidacao
        WHERE sigla_uf = 'PE'
        GROUP BY sigla_uf
    ) l
    LEFT JOIN (
        SELECT sigla_uf, COUNT(*) AS obs_payments
        FROM basedosdados.world_wb_mides.pagamento
        WHERE sigla_uf = 'PE'
        GROUP BY sigla_uf
    ) p
    ON l.sigla_uf = p.sigla_uf

    '''
    resultado = bd.read_sql(sql, billing_project_id=project_id_bq)

    destino = os.path.join(input_path, 'liq_pag_pe.csv')
    resultado.to_csv(destino, index=False, na_rep='', float_format='%.2f')


# Only hit BigQuery when executed as the main program and explicitly enabled.
if __name__ == '__main__' and run_query == 'True':
    run_query_and_save_results()

In [None]:
# IPCA

def run_query_and_save_results():
    """Fetch the December IPCA rows after 2000 and save them as CSV.

    The query selects ``variacao_anual``, ``ano`` and ``mes`` from the IPCA
    table for ``mes = 12`` and ``ano > 2000``, ordered by ``ano``. The output
    file is ``ipca_anual.csv`` inside ``input_path``.
    """
    sql = '''
    SELECT
        variacao_anual, ano, mes
    FROM
      basedosdados.br_ibge_ipca.mes_brasil
    WHERE mes = 12 AND ano > 2000
    ORDER BY ano
    '''
    resultado = bd.read_sql(sql, billing_project_id=project_id_bq)

    destino = os.path.join(input_path, 'ipca_anual.csv')
    resultado.to_csv(destino, index=False, na_rep='', float_format='%.2f')


# Only hit BigQuery when executed as the main program and explicitly enabled.
if __name__ == '__main__' and run_query == 'True':
    run_query_and_save_results()

In [None]:
# Descriptive statistics table

def _read_input_csv(filename):
    """Load one CSV file stored under ``input_path`` into a DataFrame."""
    return pd.read_csv(os.path.join(input_path, filename))


# Results saved by the query cells
empenho_liquidacao = _read_input_csv('empenho_liquidacao.csv')
empenho_pagamento = _read_input_csv('empenho_pagamento.csv')
empenho = _read_input_csv('empenho.csv')
liquidacao = _read_input_csv('liquidacao.csv')
pagamento = _read_input_csv('pagamento.csv')
total_pagamento_ano = _read_input_csv('total_pagamento_ano.csv')
empenho_pe = _read_input_csv('empenho_pe.csv')
liq_pag_pe = _read_input_csv('liq_pag_pe.csv')
ipca_anual = _read_input_csv('ipca_anual.csv')

# 1. Deflate payment

# filter years
total_pagamento_ano = total_pagamento_ano[total_pagamento_ano['ano']<2022]

# create deflator
#
# The annual IPCA variations (in percent) are chained into a price index that
# starts at 100 * (1 + first variation / 100). The index is built over every
# year present in `ipca_anual`, and the 2021 base is looked up by year, so the
# result does not depend on the CSV starting in a particular year or having a
# fixed number of rows.
ipca_ordenado = ipca_anual.sort_values('ano')
variacoes = ipca_ordenado['variacao_anual'].tolist()
if not variacoes:
    raise ValueError("ipca_anual is empty: cannot build the deflator")

deflatores = [variacoes[0]+100]
for variacao in variacoes[1:]:
    # Same chaining rule for every following year: previous index * (1 + variation).
    deflatores.append((1+(variacao/100))*deflatores[-1])

deflatores = pd.DataFrame({'deflator': deflatores, 'ano': ipca_ordenado['ano'].to_numpy()})

# Payments are expressed in 2021 prices, so the 2021 index value is the base.
base_2021 = deflatores.loc[deflatores['ano'] == 2021, 'deflator']
if base_2021.empty:
    raise ValueError("ipca_anual has no row for 2021, the base year of the deflator")
deflator_2021 = base_2021.iloc[0]

total_pagamento_ano = pd.merge(total_pagamento_ano, deflatores, how='left', left_on='ano', right_on='ano')
total_pagamento_ano['total_payment_billion_real'] = deflator_2021*total_pagamento_ano['total_payment_billion']/total_pagamento_ano['deflator']

# Real (2021 prices) payments summed over the years, one value per state.
total_pagamento = total_pagamento_ano.groupby('sigla_uf')['total_payment_billion_real'].sum()

# 2. Join all variables

# Every merge in this section is a left join keyed on the state abbreviation.
juncao_uf = dict(how='left', left_on='sigla_uf', right_on='sigla_uf')

tabela = empenho_liquidacao
for parcial in (empenho_pagamento, empenho, pagamento, liquidacao):
    tabela = pd.merge(tabela, parcial, **juncao_uf)

# PE statistics come from their own files; they are appended as extra rows.
stats_pe = pd.merge(empenho_pe, liq_pag_pe, **juncao_uf)
tabela = pd.concat([tabela, stats_pe], axis=0)

tabela = pd.merge(tabela, total_pagamento, **juncao_uf)

# For PE the shares are taken over the number of observations; for the other
# states they are taken over the number of distinct commitments.
linha_pe = tabela['sigla_uf'] == "PE"
por_observacao = tabela['obs_commitments']
por_distinto = tabela['distinct_commitments']

tabela['share_procurement'] = np.where(
    linha_pe,
    tabela['procurement_commitments'] / por_observacao,
    tabela['procurement_commitments'] / por_distinto,
)

tabela['perc_positive_commitments'] = np.where(
    linha_pe,
    tabela['total_positive_values'] / por_observacao,
    tabela['total_positive_values'] / por_distinto,
)

# Column names used in the final table.
tabela_rename = {
    'distinct_commitments': 'Distinct commitments',
    'share_procurement': 'Related to procurement',
    'has_verification_information': 'Has verification information',
    'has_payment_information': 'Has payment information',
    'distinct_verifications': 'Distinct verifications',
    'distinct_payments': 'Distinct payments',
    'total_payment_billion_real': 'Total amount of payments',
    'distinct_sellers': 'Number of distinct sellers',
    'number_municipalities': 'Number of distinct municipalities',
    'perc_positive_commitments': 'Greater than zero',
}

tabela = tabela.rename(columns=tabela_rename)

# 3. Totals

def _weighted_mean(valores, pesos):
    """Return the mean of ``valores`` (cast to float) weighted by ``pesos``."""
    return (valores.astype(float) * pesos).sum() / pesos.sum()


# stats with information for PE

totais = tabela['procurement_commitments'].astype(int).sum()
df_totais = pd.DataFrame({'procurement_commitments': [totais]})

# NOTE(review): the municipalities total computed here (PE included) is
# overwritten further down by the total without PE — confirm which is intended.
for coluna in ('obs_commitments', 'obs_verifications', 'obs_payments',
               'Number of distinct municipalities'):
    df_totais[coluna] = tabela[coluna].astype(int).sum()

df_totais['Total amount of payments'] = tabela['Total amount of payments'].astype(float).sum()

# Suppose that Distinct commitments and obs_commitments are the same in PE, just to calculate some statistics

linha_pe = tabela['sigla_uf'] == "PE"
pesos_com_pe = pd.Series(
    np.where(linha_pe, tabela['obs_commitments'], tabela['Distinct commitments']),
    index=tabela.index,
).astype(int)

df_totais['Related to procurement'] = df_totais['procurement_commitments'] / pesos_com_pe.sum()
df_totais['Greater than zero'] = _weighted_mean(tabela['Greater than zero'], pesos_com_pe)

# PE has no distinct-commitment figure in the table itself.
tabela['Distinct commitments'] = np.where(linha_pe, np.nan, tabela['Distinct commitments'])

# stats without information for PE

tabela_sample = tabela[~linha_pe]
pesos_sem_pe = tabela_sample['Distinct commitments'].astype(int)

df_totais['Distinct commitments'] = pesos_sem_pe.sum()

df_totais['Has verification information'] = _weighted_mean(tabela_sample['Has verification information'], pesos_sem_pe)
df_totais['Has payment information'] = _weighted_mean(tabela_sample['Has payment information'], pesos_sem_pe)

for coluna in ('Distinct verifications', 'Distinct payments',
               'Number of distinct sellers', 'Number of distinct municipalities'):
    df_totais[coluna] = tabela_sample[coluna].astype(int).sum()

# Append the totals row; it has no state, so its label becomes 'Total'.
tabela = pd.concat([tabela, df_totais], axis=0)
tabela['sigla_uf'] = tabela['sigla_uf'].replace(np.nan, 'Total')

# Rename - as in the paper

nomes_paper = {
    'obs_commitments': 'Observations commitment',
    'Related to procurement': 'Related to procurement (%)',
    'Greater than zero': 'Greater than zero (%)',
    'Has verification information': 'Has verification information (%)',
    'Has payment information': 'Has payment information (%)',
    'obs_verifications': 'Observations verification',
    'obs_payments': 'Observations payment',
}
tabela = tabela.rename(columns=nomes_paper)

# Format: shares become percentages
variables_list = [
    'Related to procurement (%)',
    'Greater than zero (%)',
    'Has verification information (%)',
    'Has payment information (%)',
]

for var in variables_list:
    tabela[var] = tabela[var].mul(100)

# Fresh 0..n-1 index, then rows ordered by state abbreviation.
tabela = tabela.reset_index(drop=True)
tabela = tabela.sort_values('sigla_uf')

csv_saida = os.path.join(tab_output, 'descriptive_statistics_execution.csv')
tabela.to_csv(csv_saida, index=False, na_rep='', float_format='%.1f')

# Pivot the table: one row per statistic, one column per state (plus 'Total').
tabela_pivo = tabela.transpose()
tabela_pivo.columns = tabela_pivo.iloc[0]  # first row holds the sigla_uf labels
tabela_pivo = tabela_pivo[1:]
tabela_pivo = tabela_pivo.rename_axis(None, axis=1)

# Row order of the final table; statistics not listed here are dropped.
ordem = ['Observations commitment','Distinct commitments','Related to procurement (%)','Greater than zero (%)',
         'Has verification information (%)','Has payment information (%)','Observations verification',
         'Distinct verifications','Observations payment','Distinct payments','Total amount of payments',
         'Number of distinct sellers','Number of distinct municipalities']

tabela_pivo = tabela_pivo.reindex(ordem)

# No information for verification
# Only blank out the states that are actually present, so a missing state
# column does not abort the export with a KeyError.
sem_verificacao = [uf for uf in ('CE', 'PB', 'PE') if uf in tabela_pivo.columns]
tabela_pivo.loc['Distinct verifications', sem_verificacao] = None

# tabulate only prints `missingval` for None, so every missing cell must be
# None rather than NaN. `replace(np.nan, None)` ignored the explicit None on
# pandas versions before 1.4, so the conversion is done on an object-typed
# copy with `where`, which keeps None.
tabela_pivo = tabela_pivo.astype(object).where(tabela_pivo.notna(), None)

# Export TeX table
# floatfmt is given as a single string so it applies to every column, however
# many states the table has. stralign uses the documented value "left";
# "lrrrrrr" is not a documented alignment (documented: "right", "center",
# "left", "decimal", None).
# NOTE(review): the row-label column is presumably rendered left-aligned in
# both cases — confirm the generated .tex is unchanged.
with open(os.path.join(tab_output, 'descriptive_statistics_execution.tex'), 'w') as f:
    f.write(tabulate(tabela_pivo, headers='keys', tablefmt='latex_booktabs', floatfmt=".1f", missingval='-', stralign="left"))