In [None]:
# Columns that are NOT recoded as yes/no indicators further down: respondent
# attributes, the survey weight, the row identifier and the survey year.
non_binary_vars.vec <- c(
  "Respondent_age",
  "Weight",
  "ID",
  "Respondent_is_female",
  "Respondent_education_level",
  "Respondent_is_in_the_workforce",
  "year",
  "Within_economy_household_income_quintile"
)

# Every other entry of `vars.vec` is handled as a yes/no indicator.
# `is.na(match(x, table))` is the long form of `!(x %in% table)`.
binary_vars.vec <- vars.vec[is.na(match(vars.vec, non_binary_vars.vec))]


# Build one harmonised table from the 2017 and 2021 survey extracts.
#
# Steps:
#   1. Keep only the columns listed in `vars.vec` from each wave.
#   2. Recode each wave separately so both use the same text labels
#      (the 2017 recodes match text answers, the 2021 recodes match numeric
#      codes).
#   3. Stack the two waves, turn the categorical columns into ordered factors
#      with an explicit level order, and give the columns short names.
combined.tbl <- gfi_2017_renamed.tbl |>
  select(all_of(vars.vec)) |>
  # ---- 2017 wave: answers are matched against text values ----
  mutate(
    # "(dk)" (don't know) and "(rf)" (refused) are treated as missing.
    Respondent_education_level = case_match(
      Respondent_education_level,
      c("(dk)", "(rf)") ~ NA,
      .default = Respondent_education_level
    ),
    Respondent_age = case_when(
      is.na(Respondent_age) ~ NA, # keep missing values missing
      Respondent_age == "99+" ~ 100, # top-coded answer gets a numeric stand-in
      # Convert via character so that a factor column yields its labels rather
      # than its internal level codes. `case_when()` evaluates this for every
      # row (including "99+"), hence the suppressed coercion warning.
      .default = suppressWarnings(as.integer(as.character(Respondent_age)))
    ),
    Received_wage_payments_in_past_12_months = case_match(
      Received_wage_payments_in_past_12_months,
      c("(dk)", "(rf)") ~ NA,
      .default = Received_wage_payments_in_past_12_months
    ),
    # Yes/no indicators: anything other than "yes", "no" or "0" becomes NA.
    across(
      all_of(binary_vars.vec),
      \(x) factor(
        case_match(
          x,
          "yes" ~ "Yes",
          "0" ~ "No",
          "no" ~ "No",
          .default = NA
        ),
        levels = c("No", "Yes"),
        ordered = TRUE
      )
    )
  ) |>
  bind_rows(
    # ---- 2021 wave: answers are matched against numeric codes ----
    gfi_2021_renamed.tbl |>
      select(all_of(vars.vec)) |>
      mutate(
        # Numeric sex code to label; any other code becomes NA.
        Respondent_is_female = case_match(
          Respondent_is_female,
          1 ~ "Female",
          2 ~ "Male",
          .default = NA
        ),
        # Education level code to the labels used by the 2017 wave.
        Respondent_education_level = case_match(
          Respondent_education_level,
          1 ~ "completed primary or less",
          2 ~ "secondary",
          3 ~ "completed tertiary or more",
          .default = NA
        ),
        # Workforce participation code to label.
        Respondent_is_in_the_workforce = case_match(
          Respondent_is_in_the_workforce,
          1 ~ "in workforce",
          2 ~ "out of workforce",
          .default = NA
        ),
        # Within-economy household income quintile code to label.
        Within_economy_household_income_quintile = case_match(
          Within_economy_household_income_quintile,
          1 ~ "Poorest 20%",
          2 ~ "Second 20%",
          3 ~ "Middle 20%",
          4 ~ "Fourth 20%",
          5 ~ "Richest 20%",
          .default = NA
        ),
        # Collapse the wage item to 1/0 first (codes 1-3 -> 1, code 4 -> 0) so
        # the generic yes/no recode below can handle it like the other
        # indicators.
        Received_wage_payments_in_past_12_months = case_match(
          Received_wage_payments_in_past_12_months,
          c(1, 2, 3) ~ 1,
          4 ~ 0,
          .default = NA
        ),
        # Yes/no indicators: 1 is "Yes", 0 or 2 is "No", anything else is NA.
        across(
          all_of(binary_vars.vec),
          \(x) factor(
            case_match(
              x,
              1 ~ "Yes",
              c(0, 2) ~ "No",
              .default = NA
            ),
            levels = c("No", "Yes"),
            ordered = TRUE
          )
        )
      )
  ) |>
  # ---- Both waves: fix factor level order and column types ----
  mutate(
    year = factor(
      year,
      levels = c("2017", "2021"),
      ordered = TRUE
    ),
    # Male first, Female second.
    Respondent_is_female = factor(
      Respondent_is_female,
      levels = c("Male", "Female"),
      ordered = TRUE
    ),
    Respondent_is_in_the_workforce = factor(
      Respondent_is_in_the_workforce,
      levels = c("out of workforce", "in workforce"),
      ordered = TRUE
    ),
    Within_economy_household_income_quintile = factor(
      Within_economy_household_income_quintile,
      levels = c(
        "Poorest 20%",
        "Second 20%",
        "Middle 20%",
        "Fourth 20%",
        "Richest 20%"
      ),
      ordered = TRUE
    ),
    Respondent_education_level = factor(
      Respondent_education_level,
      levels = c(
        "completed primary or less",
        "secondary",
        "completed tertiary or more"
      ),
      ordered = TRUE
    ),
    # Store age as integer in the stacked table.
    Respondent_age = Respondent_age |> as.integer() |> suppressWarnings()
  ) |>
  # ---- Short column names (new = old) ----
  rename(
    phone = Owns_a_mobile_phone,
    saved = Saved_in_the_past_year,
    female = Respondent_is_female,
    in_wf = Respondent_is_in_the_workforce,
    inc_q = Within_economy_household_income_quintile,
    educ = Respondent_education_level,
    age = Respondent_age,
    fin_account = Has_an_account_at_a_financial_institution,
    account = Has_an_account,
    mm_account = Has_a_mobile_money_account,
    weight = Weight,
    id = ID,
    depos = If_has_account_any_deposit_into_account_in_past_12_months,
    withdraw = If_has_account_any_withdrawal_from_account_in_past_12_months,
    wage = Received_wage_payments_in_past_12_months
  )

# Draw one million rows from the combined table, with replacement, where each
# row's chance of being picked is proportional to its `weight`.
# `weighted_s` keeps each row's share of the total weight (the shares sum to 1)
# and is used as the sampling weight.
# NOTE(review): despite its name, `weighted_sum` holds the sampled rows, not a
# sum. The shares are computed over both survey years pooled together, and no
# random seed is set in this cell -- confirm both are intended.
weighted_sum <- combined.tbl |>
  mutate(weighted_s = prop.table(weight)) |>
  slice_sample(n = 1e6, weight_by = weighted_s, replace = TRUE)

# Show the combined table as the cell output.
combined.tbl

# A tibble: 2,009 × 16
     age fin_account account mm_account weight    id female educ                       in_wf            year  inc_q       phone saved depos withdraw wage 
   <int> <ord>       <ord>   <ord>       <dbl> <int> <ord>  <ord>                      <ord>            <ord> <ord>       <ord> <ord> <ord> <ord>    <ord>
 1    19 No          Yes     Yes         2.17      1 Male   secondary                  out of workforce 2017  Second 20%  Yes   Yes   NA    NA       No   
[38;5;250m 2[39m    42 Yes         Yes     No          0.6