In [1]:
suppressMessages( library(dplyr) ) # Manipulações de funções
suppressMessages( library(mvtnorm) ) # Manipulações de funções
suppressMessages( library(tidyr) ) # Manipulações de funções

suppressMessages( library(arrow) ) # parquet
suppressMessages( library(snow) ) # parallel apply, sapply, lapply
suppressMessages( library(readxl) ) # parallel apply, sapply, lapply

source("MERT_functions.R")

# Regression trees
suppressMessages( library(rpart) )

# Pacotes para a construção de gráficos
suppressMessages( library(ggplot2) )
suppressMessages( library(cowplot) )
suppressMessages( library(latex2exp) )
suppressMessages( library(GGally) )

# Configurações padrão para os gráficos do ggplot2 no notebook
t = theme(plot.title = element_text(size=26, hjust=0.5),
          axis.title = element_text(size=20),
          axis.text = element_text(size=16),
          legend.title = element_text(size = 20),
          legend.text = element_text(size = 16),
          plot.subtitle = element_text(size = 18, face="bold"))
theme_set(theme_minimal()+t)

In [2]:
df <- read_parquet("Data/quadrimestral_data.parquet")
df_rcl <- read_excel("Data/rcl_municipios_deflated.xlsx")
info_municipios <- read_excel("Data/info_municipios.xlsx")

df_rcl <- df_rcl %>% select(ds_municipio, ano_exercicio, RCL_defl)
info_municipios <- info_municipios %>% select(ds_municipio, atividade_principal, pib_nominal, populacao,
                                              quantidade_total_vagas, vereadores, area, alunos_escola)

head(df, 3)
head(df_rcl,3)
head(info_municipios,3)

ds_municipio,ds_orgao,cod_despesa,ano_exercicio,quad,ds_despesa,tp_orgao,ds_funcao_governo,ds_subfuncao_governo,ds_fonte_recurso,ds_modalidade_lic,vl_despesa,vl_despesa_defl
<chr>,<chr>,<chr>,<dbl>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
adamantina,camara municipal de adamantina,101,2011,1,proventos - pessoal civil,camara municipal,legislativa,acao legislativa,tesouro,dispensa de licitacao,19768.56,38793.79
adamantina,camara municipal de adamantina,301,2011,1,pessoal civil,camara municipal,legislativa,acao legislativa,tesouro,dispensa de licitacao,22734.9,44614.93
adamantina,camara municipal de adamantina,1101,2011,1,vencimentos e salarios,camara municipal,legislativa,acao legislativa,tesouro,dispensa de licitacao,72753.62,142771.59


ds_municipio,ano_exercicio,RCL_defl
<chr>,<dbl>,<dbl>
adamantina,2010,143485107
adolfo,2010,24941031
aguai,2010,95722129


ds_municipio,atividade_principal,pib_nominal,populacao,quantidade_total_vagas,vereadores,area,alunos_escola
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
adamantina,demais servicos,1054541,35023,2002,9,411,2359
adolfo,"agricultura, inclusive apoio a agricultura e a pos colheita",99734,3571,458,9,211,478
aguai,demais servicos,942708,35954,1471,13,474,3489


In [3]:
df <- merge(df, info_municipios, by = "ds_municipio")
df <- merge(df, df_rcl, by = c("ds_municipio", "ano_exercicio"))

head(df)

Unnamed: 0_level_0,ds_municipio,ano_exercicio,ds_orgao,cod_despesa,quad,ds_despesa,tp_orgao,ds_funcao_governo,ds_subfuncao_governo,ds_fonte_recurso,⋯,vl_despesa,vl_despesa_defl,atividade_principal,pib_nominal,populacao,quantidade_total_vagas,vereadores,area,alunos_escola,RCL_defl
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,adamantina,2011,camara municipal de adamantina,3940,2,programa de alimentacao do trabalhador,camara municipal,legislativa,acao legislativa,tesouro,⋯,7290.0,14142.599,demais servicos,1054541,35023,2002,9,411,2359,154390907
2,adamantina,2011,centro universitario de adamantina,101,3,proventos - pessoal civil,centro universitario,previdencia social,previdencia complementar,recursos proprios da administracao indireta,⋯,64618.43,122907.966,demais servicos,1054541,35023,2002,9,411,2359,154390907
3,adamantina,2011,camara municipal de adamantina,3940,1,programa de alimentacao do trabalhador,camara municipal,legislativa,acao legislativa,tesouro,⋯,6210.0,12186.495,demais servicos,1054541,35023,2002,9,411,2359,154390907
4,adamantina,2011,centro universitario de adamantina,3980,2,hospedagens,centro universitario,educacao,ensino superior,recursos proprios da administracao indireta,⋯,1291.87,2506.228,demais servicos,1054541,35023,2002,9,411,2359,154390907
5,adamantina,2011,camara municipal de adamantina,3911,2,locacao de softwares,camara municipal,legislativa,acao legislativa,tesouro,⋯,6600.0,12803.999,demais servicos,1054541,35023,2002,9,411,2359,154390907
6,adamantina,2011,centro universitario de adamantina,3941,2,fornecimento de alimentacao,centro universitario,educacao,ensino superior,recursos proprios da administracao indireta,⋯,1750.0,3395.0,demais servicos,1054541,35023,2002,9,411,2359,154390907


## Modeling

For the modeling step, let's consider a subset of all the variables above. Also, we shall split the dataset into train and test components. For this to fit the predictive nature of this problem in time, the training set will be formed by observations up to 2017 and the test from 2018 up to 2021.

In [4]:
df_model <- df %>% select(ds_municipio, ds_orgao, cod_despesa, ano_exercicio, quad, tp_orgao, ds_funcao_governo, ds_fonte_recurso,
                          RCL_defl, pib_nominal, populacao, quantidade_total_vagas, vereadores, area,
                          vl_despesa_defl)
head(df_model, 4)

Unnamed: 0_level_0,ds_municipio,ds_orgao,cod_despesa,ano_exercicio,quad,tp_orgao,ds_funcao_governo,ds_fonte_recurso,RCL_defl,pib_nominal,populacao,quantidade_total_vagas,vereadores,area,vl_despesa_defl
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<dbl>,<int>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,adamantina,camara municipal de adamantina,3940,2011,2,camara municipal,legislativa,tesouro,154390907,1054541,35023,2002,9,411,14142.599
2,adamantina,centro universitario de adamantina,101,2011,3,centro universitario,previdencia social,recursos proprios da administracao indireta,154390907,1054541,35023,2002,9,411,122907.966
3,adamantina,camara municipal de adamantina,3940,2011,1,camara municipal,legislativa,tesouro,154390907,1054541,35023,2002,9,411,12186.495
4,adamantina,centro universitario de adamantina,3980,2011,2,centro universitario,educacao,recursos proprios da administracao indireta,154390907,1054541,35023,2002,9,411,2506.228


As it can be seen, the triple (ds_municipio, ds_orgao, cod_despesa) forms a new individual (associated to each triple there is a time series of expenses). Even though it would be possible to group all the triples into a single index variable, it seems more reasonable to consider at least one of them as covariates for the model, as they are nested covariates. The ideal approach to this would be something similar to the structure given by Galecki (2013) (page 248), in which there are N first-level groups (indexed by $i = 1, \dots, N$), each with $n_i$ second-level sub-groups (indexed by $j = 1, \dots, n_i$), which contain $n_{ij}$ observations each. A linear model for this example could be given by

\begin{equation*}
    Y_{ij} = X_{ij} \beta + Z_{1,ij} b_i + Z_{2,ij} b_{ij} + \varepsilon_{ij},
\end{equation*}

given that $b_i \sim N_{q_1}(0, D_1)$, $b_{ij} \sim N_{q_2}(0, D_2)$ and $\varepsilon \sim N_{n_{ij}}(0, R_{ij})$.

Note that in this case, since we have triples, it would be necessary another layer to this complex structure! For now this is not the most viable option. So in order to simplify this problem, let's at first consider that each individual is given by a triple but that the city is also a covariate for all of them. That way, even though essentially we suppose that different triples are independent from each other (clearly not the case), we insert a new covariate to account for part of the information loss associated with that hypothesis (at first I don't see a reason for it to be worse from the modelling perspective, as the information would not be lost wif we considered the variables in this way).


In [None]:
dates <- as.Date(paste(
    df_model$ano_exercicio,
    ifelse(df_model$quad == 1, 4, ifelse(df_model$quad == 2, 8, 12)),
    15, sep = "-"), "%Y-%m-%d")

df_model$aprox_date <- dates
head(df_model, 2)

In [6]:
df_model <- df_model %>% group_by(ds_municipio, ds_orgao, cod_despesa) %>% mutate(id = cur_group_id()) %>% select(
    id, ds_municipio, ds_orgao, cod_despesa, ano_exercicio, quad, tp_orgao, ds_funcao_governo, ds_fonte_recurso,
    RCL_defl, pib_nominal, populacao, quantidade_total_vagas, vereadores, area, vl_despesa_defl, aprox_date
) %>% arrange(id)
head(df_model, 4)

id,ds_municipio,ds_orgao,cod_despesa,ano_exercicio,quad,tp_orgao,ds_funcao_governo,ds_fonte_recurso,RCL_defl,pib_nominal,populacao,quantidade_total_vagas,vereadores,area,vl_despesa_defl,aprox_date
<int>,<chr>,<chr>,<chr>,<dbl>,<int>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<date>
1,adamantina,camara municipal de adamantina,101,2011,3,camara municipal,legislativa,tesouro,154390907,1054541,35023,2002,9,411,39612.48,2011-12-15
1,adamantina,camara municipal de adamantina,101,2011,1,camara municipal,legislativa,tesouro,154390907,1054541,35023,2002,9,411,38793.79,2011-04-15
1,adamantina,camara municipal de adamantina,101,2011,2,camara municipal,legislativa,tesouro,154390907,1054541,35023,2002,9,411,35685.52,2011-08-15
1,adamantina,camara municipal de adamantina,101,2012,3,camara municipal,legislativa,tesouro,152762472,1054541,35023,2002,9,411,43139.99,2012-12-15
