In [160]:
# Packages this notebook needs, in the order they are loaded
package_names_vec <- c("tidyverse", "readxl", "srvyr", "survey", "broom")

#' Install (if missing) and attach a set of packages
#'
#' @param names_vec Character vector of package names.
#' @return `names_vec`, invisibly. Called for its side effect of attaching
#'   every package in `names_vec`.
package_prep <- function(names_vec) {
  for (name in names_vec) {
    # requireNamespace() only checks availability; unlike require() it does
    # not attach the package or warn when the package is missing.
    if (!requireNamespace(name, quietly = TRUE)) {
      install.packages(name)
    }
    # library() raises an error if the package still cannot be loaded, so a
    # failed installation surfaces here rather than further down the script.
    library(name, character.only = TRUE)
  }
  invisible(names_vec)
}

# Install/attach every package listed in package_names_vec
package_prep(package_names_vec)





# Resolve the input and output directories for the current environment.
# Both branches must define `input` and `output`: the guard below and the
# data-loading code read from `paths$input`.
if (grepl("kaggle", getwd(), fixed = TRUE)) {
  paths <- list(
    input = file.path("/kaggle", "input", "cleaned-gfi", "output"),
    output = file.path("/kaggle", "working", "output")
  )
  # The Kaggle branch used to expose this directory as `data` only, which left
  # `paths$input` NULL; keep `data` as an alias for any code still using it.
  paths$data <- paths$input
} else {
  paths <- list(
    input = file.path("..", "output"),
    output = file.path("..", "figures")
  )
}

# Abort early when the input directory does not exist.
if (!dir.exists(paths$input)) {
  stop("INPUT DATA NOT FOUND\n DO NOT RUN THIS CODE.")
}

# Snapshot the current plot-size options and the active ggplot theme's text
# size. When the repr.plot.* options are unset (e.g. the script is run outside
# a Jupyter kernel) fall back to 7 x 7, the defaults documented by the repr
# package, so that sizes derived from these values are never empty.
.default_repr_opts <- list(
  width     = getOption("repr.plot.width", default = 7),
  height    = getOption("repr.plot.height", default = 7),
  base_size = theme_get()$text$size %||% 11  # fallback to 11 if NULL
)

#' Display a plot in the notebook and save it as a PNG
#'
#' Temporarily sets the `repr.plot.*` options to `width` x `height`, rescales
#' ggplot text relative to the sizes stored in `.default_repr_opts`, prints the
#' plot, and writes `<paths$output>/<filename>.png` with `ggsave()`.
#'
#' @param plot_obj Object to print and save. Text rescaling is applied only
#'   when it inherits from "ggplot".
#' @param width,height Plot size; used for the repr options and for `ggsave()`.
#' @param dpii Resolution forwarded to `ggsave(dpi = )`. Defaults to 300.
#' @param filename File name without extension, relative to `paths$output`.
#' @param ... Further arguments forwarded to `ggsave()`.
#' @return `plot_obj` (including the rescaled theme, if applied), invisibly.
fig <- function(plot_obj,
                width,
                height,
                dpii = 300,
                filename = "untitled",
                ...) {
  # Put the captured default repr size back on exit, so the options are also
  # restored when printing or saving fails part-way through.
  on.exit(
    options(repr.plot.width  = .default_repr_opts$width,
            repr.plot.height = .default_repr_opts$height),
    add = TRUE
  )
  options(repr.plot.width  = width,
          repr.plot.height = height)

  # Scale ggplot text with the square root of the area ratio (times 1.25)
  if (inherits(plot_obj, "ggplot")) {
    default_area <- .default_repr_opts$width * .default_repr_opts$height
    scale_factor <- sqrt((width * height) / default_area) * 1.25
    new_base <- .default_repr_opts$base_size * scale_factor
    plot_obj <- plot_obj +
      theme(
        text        = element_text(size = new_base),
        axis.title  = element_text(size = new_base),
        axis.text   = element_text(size = new_base * 0.8),
        legend.text = element_text(size = new_base * 0.8),
        plot.title  = element_text(size = new_base * 1.1, face = "bold")
      )
  }

  # Show the plot in the notebook
  print(plot_obj)

  # Save to disk, creating the target directory if needed
  full <- file.path(paths$output, paste0(filename, ".png"))
  dir.create(dirname(full), recursive = TRUE, showWarnings = FALSE)
  ggsave(filename = full,
         plot     = plot_obj,
         width    = width,
         height   = height,
         dpi      = dpii,
         ...)
  message("Saved plot to: ", normalizePath(full))

  invisible(plot_obj)
}



In [161]:
# Read gfi.csv and codebook.csv from the input directory
gfi_tbl <- read_csv(file.path(paths$input, "gfi.csv"))
codebook_tbl <- read_csv(file.path(paths$input, "codebook.csv"))

[1mRows: [22m[34m1057[39m [1mColumns: [22m[34m24[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[32mdbl[39m (24): year_b, id_i, weight_d, respondent_age_o, respondent_education_lev...



[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m66[39m [1mColumns: [22m[34m3[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (2): vars, varname
[32mdbl[39m (1): vals

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [22]:
names(gfi_tbl)

# Model

## My own

In [None]:
# Build the modelling table `tmp` from `gfi_tbl`:
#   1. keep respondents aged 15-65 who receive no government pension and have
#      non-missing borrowing / emergency-funds answers;
#   2. derive the `financial_inclusion_i` and `active_account_i` codes;
#   3. drop the columns those codes were derived from (and irrelevant ones);
#   4. replace every remaining NA with 0 and turn the categorical columns
#      into factors.
tmp <- gfi_tbl |>
  filter(
    between(respondent_age_o, 15, 65), # working age, excluding the elderly
    received_government_pension_b == 0, # excluding retirees; this variable has only 1 NA
    !is.na(borrowed_in_the_past_year_c), # only three observations, judged safe to omit
    !is.na(main_source_emergency_funds_c) # 11 observations, omitted as well
  ) |>
  mutate(
    # Codes for financial_inclusion_i:
    #   11-18  financial-institution account holder with all three flags
    #          equal to 0 or 1:
    #          11 + owns_phone + 2 * used_phone_for_finance + 4 * mobile_money
    #   0-7    no account: reason_no_account_c (already has 8 levels, 0 to 7)
    #   8      mobile-money account only
    #   9, 10  financial-institution account without / with mobile money;
    #          only reached when the 11-18 rule did not match
    financial_inclusion_i = case_when(
      has_financial_institution_account_b == 1 &
        used_phone_for_finance_b %in% c(0, 1) &
        owns_mobile_phone_b %in% c(0, 1) &
        has_mobile_money_account_b %in% c(0, 1) ~
        11 +
          owns_mobile_phone_b +
          2 * used_phone_for_finance_b +
          4 * has_mobile_money_account_b,
      has_account_b == 0 ~ reason_no_account_c,
      has_financial_institution_account_b == 0 &
        has_mobile_money_account_b == 1 ~ 8,
      has_financial_institution_account_b == 1 &
        has_mobile_money_account_b == 0 ~ 9,
      has_financial_institution_account_b == 1 &
        has_mobile_money_account_b == 1 ~ 10,
      .default = NA_integer_
    ),
    # Codes for active_account_i:
    #   0    no account
    #   1-4  1 + 2 * any_deposit + any_withdrawal (both flags equal to 0 or 1)
    #   5    anything else
    active_account_i = case_when(
      has_account_b == 0 ~ 0,
      any_deposit_into_account_b %in% c(0, 1) &
        any_withdrawal_from_account_b %in% c(0, 1) ~
        1 + 2 * any_deposit_into_account_b + any_withdrawal_from_account_b,
      .default = 5
    )
  ) |>
  select(!c(
    # irrelevant
    id_i,
    payments_utility_bills_c,
    payments_wage_payments_c,

    # institutional financial inclusion (folded into financial_inclusion_i)
    has_account_b,
    has_financial_institution_account_b,
    reason_no_account_c,
    has_mobile_money_account_b,

    # redundant with in_the_workforce_b
    received_government_pension_b,
    received_government_pension_payment_c,

    # digital financial inclusion (folded into the derived codes)
    made_or_received_digital_payment_b,
    used_phone_for_finance_b,
    owns_mobile_phone_b,
    any_deposit_into_account_b,
    any_withdrawal_from_account_b
  )) |>
  # Every remaining NA, in any column, becomes 0
  mutate(across(everything(), \(col) replace_na(col, 0))) |>
  mutate(across(
    c(
      respondent_education_level_o,
      household_income_quintile_o,
      main_source_emergency_funds_c,
      received_government_transfers_c,
      borrowed_in_the_past_year_c,
      financial_inclusion_i,
      active_account_i
    ),
    factor
  ))
# Author's notes: the result is 885 observations; the original gfi_tbl was 897
# observations. Observations whose NAs were replaced by 0: 12 -- not good, not
# terrible (Chernobyl joke!).


In [None]:
# Weighted design with no clustering (ids = ~1), using weight_d as weights
des <- svydesign(ids = ~1, weights = ~weight_d, data = tmp)

# Quasibinomial survey GLM of workforce participation on demographics,
# household finances and the two derived inclusion codes
model_final <- svyglm(
  formula = in_the_workforce_b ~
    year_b + respondent_age_o +
    respondent_education_level_o + household_income_quintile_o +
    main_source_emergency_funds_c + received_government_transfers_c +
    saved_in_past_year_b + borrowed_in_the_past_year_c +
    financial_inclusion_i + active_account_i,
  design = des,
  family = quasibinomial()
)

# Coefficient table, then exponentiated estimates with confidence intervals
summary(model_final)
tidy(model_final, conf.int = TRUE, exponentiate = TRUE)


In [141]:
# Rows of tmp that have no match among its complete (NA-free) rows
tmp |> anti_join(drop_na(tmp))

[1m[22mJoining with `by = join_by(year_b, weight_d, respondent_age_o,
respondent_education_level_o, in_the_workforce_b, household_income_quintile_o,
main_source_emergency_funds_c, received_government_transfers_c,
saved_in_past_year_b, borrowed_in_the_past_year_c, financial_inclusion_i,
active_account_i)`


year_b,weight_d,respondent_age_o,respondent_education_level_o,in_the_workforce_b,household_income_quintile_o,main_source_emergency_funds_c,received_government_transfers_c,saved_in_past_year_b,borrowed_in_the_past_year_c,financial_inclusion_i,active_account_i
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2017,0.6743416,48,0.0,0,4,0,0.0,0,1,,0
2017,1.9544701,40,1.0,0,1,2,,1,0,9.0,5
2017,2.9384096,18,1.0,0,0,1,,1,0,12.0,3
2017,1.3876194,54,,0,4,0,0.0,1,0,12.0,3
2017,2.9384096,17,1.0,0,2,2,0.0,1,0,,0
2017,1.8001288,54,1.0,0,2,2,,0,2,12.0,4
2017,0.7881909,37,2.0,1,2,2,0.0,1,1,,0
2017,1.998692,33,,0,3,0,3.0,0,1,12.0,4
2017,0.3151434,27,1.0,0,1,0,0.0,0,0,,0
2021,0.7598437,17,1.0,0,2,1,,1,0,12.0,1


'year_b''weight_d''respondent_age_o''respondent_education_level_o''in_the_workforce_b''household_income_quintile_o''main_source_emergency_funds_c''received_government_transfers_c''saved_in_past_year_b''any_deposit_into_account_b''any_withdrawal_from_account_b''borrowed_in_the_past_year_c''institutional_financial_inclusion_i''digital_financial_inclusion_i'

## Alternative specification (experimental)

In [None]:
# Build the analysis table for the tiered financial-inclusion model.
# Same sample restrictions as before (age 15-65, no government pension,
# non-missing borrowing / emergency-funds answers), plus:
#   - fin_incl_cat: inclusion tier, with "AccountOnly" as the reference level
#   - categorical predictors as factors, NA recoded to "Missing"
#   - year_c: survey year centred on its (unweighted) mean
# Rows with missing education or government-transfer answers are dropped last,
# i.e. after year_c has been centred.
analysis_tbl <- gfi_tbl |>
  filter(
    between(respondent_age_o, 15, 65),
    received_government_pension_b == 0,
    !is.na(borrowed_in_the_past_year_c),
    !is.na(main_source_emergency_funds_c)
  ) |>
  mutate(
    # The first matching rule wins; the first factor level is the reference
    fin_incl_cat = case_when(
      has_account_b == 0 ~ "NoAccount",
      has_financial_institution_account_b == 1 &
        has_mobile_money_account_b == 0 ~ "AccountOnly",
      has_financial_institution_account_b == 1 &
        has_mobile_money_account_b == 1 &
        used_phone_for_finance_b == 0 ~ "Account+MobileMoney",
      has_financial_institution_account_b == 1 &
        used_phone_for_finance_b == 1 ~ "DigitalActive",
      .default = "Other"
    ) |>
      factor(
        levels = c(
          "AccountOnly", "NoAccount", "Account+MobileMoney",
          "DigitalActive", "Other"
        )
      )
  ) |>
  mutate(
    across(
      c(
        respondent_education_level_o,
        household_income_quintile_o,
        main_source_emergency_funds_c,
        received_government_transfers_c,
        borrowed_in_the_past_year_c
      ),
      \(col) factor(replace_na(as.character(col), "Missing"))
    ),
    year_c = year_b - mean(year_b)
  ) |>
  filter(
    if_all(
      c(respondent_education_level_o, received_government_transfers_c),
      \(col) col != "Missing"
    )
  )


In [164]:

# ---- 3. Survey design -------------------------------------------------------
# Weighted design with no clustering (ids = ~1)
des <- svydesign(ids = ~1, weights = ~weight_d, data = analysis_tbl)

# ---- 4. Main model ----------------------------------------------------------
form <- in_the_workforce_b ~
  year_c +
  respondent_age_o +
  respondent_education_level_o +
  household_income_quintile_o +
  main_source_emergency_funds_c +
  received_government_transfers_c +
  saved_in_past_year_b +
  borrowed_in_the_past_year_c +
  fin_incl_cat

model_final <- svyglm(form, design = des, family = quasibinomial())

# ---- 5. Summaries -----------------------------------------------------------
print(summary(model_final))
# Exponentiated estimates with confidence intervals, inclusion-tier terms only
orr_tbl <- model_final |>
  broom::tidy(exponentiate = TRUE, conf.int = TRUE) |>
  filter(grepl("^fin_incl_cat", term))
print(orr_tbl)

# ---- 6. Average Marginal Effect (AME) --------------------------------------
# Response-scale predictions with every row's inclusion tier set to `tier`
predict_for_tier <- function(model, data, tier) {
  data[["fin_incl_cat"]] <- tier
  predict(model, newdata = data, type = "response")
}

pred_base <- predict_for_tier(model_final, analysis_tbl, "AccountOnly")
pred_da   <- predict_for_tier(model_final, analysis_tbl, "DigitalActive")
AME <- mean(pred_da - pred_base)
cat("\nAverage marginal effect (DigitalActive vs AccountOnly): ",
    round(AME, 4), "\n")

# ---- 6a. Bootstrap CI for AME (200 reps) -----------------------------------
# Each replicate resamples rows with replacement, refits the model on the
# resample and recomputes the AME on that same resample.
set.seed(123)
B <- 200
ame_boot <- replicate(B, {
  rows <- sample(seq_len(nrow(analysis_tbl)), replace = TRUE)
  boot_tbl <- analysis_tbl[rows, ]
  boot_des <- svydesign(ids = ~1, weights = ~weight_d, data = boot_tbl)
  boot_fit <- svyglm(form, design = boot_des, family = quasibinomial())
  p_account <- predict_for_tier(boot_fit, boot_tbl, "AccountOnly")
  p_digital <- predict_for_tier(boot_fit, boot_tbl, "DigitalActive")
  mean(p_digital - p_account)
})
cat("95% bootstrap CI for AME: [",
    round(quantile(ame_boot, c(0.025, 0.975)), 4), "]\n")

# ---- 7. Predicted probabilities by inclusion tier --------------------------
# Weighted mean of the fitted probabilities within each tier
pred_tbl <- analysis_tbl |>
  mutate(p_hat = predict(model_final, type = "response")) |>
  group_by(fin_incl_cat) |>
  summarise(
    predicted_prob = weighted.mean(p_hat, weight_d),
    n = n(),
    .groups = "drop"
  )
print(pred_tbl)

# ---- 8. Robustness: account holders only -----------------------------------
model_accounts <- svyglm(
  form,
  design = des,
  subset = fin_incl_cat != "NoAccount",
  family = quasibinomial()
)
print(summary(model_accounts))



Call:
svyglm(formula = form, design = des, family = quasibinomial())

Survey design:
svydesign(ids = ~1, weights = ~weight_d, data = analysis_tbl)

Coefficients:
                                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)                      -1.113185   0.494810  -2.250  0.02471 *  
year_c                            0.170746   0.066845   2.554  0.01080 *  
respondent_age_o                 -0.005928   0.008317  -0.713  0.47613    
respondent_education_level_o1     0.383618   0.250173   1.533  0.12553    
respondent_education_level_o2     1.363239   0.280386   4.862 1.37e-06 ***
household_income_quintile_o1     -0.392570   0.263749  -1.488  0.13699    
household_income_quintile_o2     -0.223001   0.286933  -0.777  0.43725    
household_income_quintile_o3     -0.244739   0.283614  -0.863  0.38841    
household_income_quintile_o4     -0.137922   0.276179  -0.499  0.61763    
main_source_emergency_funds_c1    0.012484   0.350300   0.036  0.97158    
main_source_

In [153]:
# Respondents aged strictly between 59 and 65
filter(gfi_tbl, respondent_age_o > 59 & respondent_age_o < 65)

year_b,id_i,weight_d,respondent_age_o,respondent_education_level_o,in_the_workforce_b,household_income_quintile_o,main_source_emergency_funds_c,payments_utility_bills_c,has_account_b,⋯,received_government_pension_b,received_government_transfers_c,received_government_pension_payment_c,used_phone_for_finance_b,owns_mobile_phone_b,saved_in_past_year_b,any_deposit_into_account_b,any_withdrawal_from_account_b,reason_no_account_c,borrowed_in_the_past_year_c
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2017,41,0.2860515,60,0,0,1,2.0,2,1,⋯,1,1,1,,1,0,,,6.0,1
2017,175,0.9920167,60,1,0,0,0.0,2,1,⋯,0,3,0,0.0,1,0,0.0,1.0,,0
2017,202,0.4960084,60,1,0,0,0.0,2,1,⋯,1,0,1,0.0,1,0,1.0,1.0,,1
2017,426,1.2970108,60,0,0,4,0.0,3,1,⋯,1,1,1,0.0,0,0,1.0,1.0,,0
2017,498,0.5728374,64,0,0,0,0.0,2,1,⋯,0,1,0,,1,0,,,2.0,0
2017,633,1.8795881,60,1,0,3,2.0,1,1,⋯,1,1,1,,0,0,,,1.0,0
2017,878,0.9920167,63,1,0,4,5.0,3,1,⋯,0,1,0,1.0,1,1,1.0,1.0,,4
2017,961,0.9250796,63,1,0,4,1.0,2,1,⋯,1,0,1,0.0,1,0,0.0,0.0,,0
2017,975,0.9651505,64,1,0,2,0.0,2,1,⋯,1,1,1,0.0,0,0,1.0,1.0,,0
2021,18,0.5854931,63,0,0,4,2.0,1,1,⋯,0,0,0,0.0,1,0,1.0,0.0,,4
