In [None]:
# Define a vector of package names to be used in the script
package_names.vec <- c(
    "tidyverse",
    "ggtext",
    "showtext",
    "tidymodels",
    "furrr",
    "readxl",
    "tictoc",
    "glmnet"
)

# Define a function to install and load required packages
package_prep <- \(names.vec) {
  # Iterate through each package name in the provided vector
  for (name in names.vec) {
    # Check if the package is already installed
    if (!require(name, character.only = TRUE)) {
      # Install the package using renv if not already installed
      renv::install(name)
    }
  }
}

# Pipe the vector of package names into the package_prep function
package_names.vec |> package_prep()

options(repr.plot.width = 12, repr.plot.height = 8, repr.plot.res = 300)

Loading required package: tidyverse

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
Loading required package: ggtext

Loading required package: showtext

Loading required package: sysfonts

Loading required package: 

# Cleaning the Data

In [14]:
# Define a function to load and process a table
# Parameters:
# - name: Name of the table to be assigned in the global environment
# - path: Path to the file to be read
# - argyear: Year to be added as a column in the resulting table
# - reading_func: Function to read the file (e.g., read_csv, read_excel)
load_tbl <- function(name, path, argyear, reading_func) {
  # Read the file, process it, and store it in a variable 'result'
  result <- file.path(path) |>  # Generate the full file path
    reading_func() |>           # Read the file using the provided reading function
    mutate(
      year = argyear,           # Add a 'year' column with the specified year
      ID = row_number()         # Add an 'ID' column with sequential row numbers
    ) |> 
    select(where(~!all(is.na(.x)))) # Select only the columns that are not entirely NA
  
  # Assign the processed table to the specified name in the global environment
  assign(name, result, envir = .GlobalEnv)
}

# Call the 'load_tbl' function to load and process data for the year 2021
load_tbl("gfi_2021.tbl",               # Name of the output table
  file.path("/kaggle", "input", "gfi-iran", "micro_irn.csv"),  # File path
  2021,                               # Year to assign
  read_csv                            # Reading function
)

# Call the 'load_tbl' function to load and process data for the year 2017 (variable labels)
load_tbl("gfi_2017_1.tbl",            # Name of the output table
  file.path("/kaggle", "input", "gfi-iran", "micro_irn_varlabel.xls"),  # File path
  2017,                               # Year to assign
  read_excel                          # Reading function
)

# Call the 'load_tbl' function to load and process data for the year 2017 (variable names)
load_tbl("gfi_2017_2.tbl",            # Name of the output table
  file.path("/kaggle", "input", "gfi-iran", "micro_irn_varname.xls"),  # File path
  2017,                               # Year to assign
  read_excel                          # Reading function
)

# Replace spaces, colons, and hyphens in column names of `gfi_2017_1.tbl` with underscores.
new_names <- gsub("[ :-]", "_", gfi_2017_1.tbl |> names())

# Replace consecutive underscores (if any) with a single underscore in the column names.
new_names2 <- gsub("_+", "_", new_names)

# Rename the columns of `gfi_2017_1.tbl` using the cleaned-up column names.
gfi_2017_renamed.tbl <- gfi_2017_1.tbl |> rename(
  !!!setNames(
    names(gfi_2017_1.tbl),  # Current column names of the table.
    new_names2              # Updated column names after transformations.
  )
) |>
select(-c('Economy','Economy_Code','Gallup_World_Poll_identifier'))


# Define the named vector with keys in double quotes (replaced to become similar with 2017 column names.)
names_dict_2021 <- c(
  "saved" = "Saved_in_the_past_year",
"borrowed" = "Borrowed_in_the_past_year",
"receive_wages" = "Received_wage_payments_in_past_12_months",
"receive_transfers" = "Received_a_government_transfer_payment",
"receive_pension" = "Received_a_government_pension_payment",
"pay_utilities" = "Payments_utility_bills",
"mobileowner" = "Owns_a_mobile_phone",
"internetaccess" = "Internet_access",
"anydigpayment" = "Made_or_received_a_digital_payment",
"fin34a" = "Received_wage_payments_into_an_account",
"fin34b" = "Received_wage_payments_to_a_mobile_phone",
"fin34d" = "Received_wage_payments_in_cash",
"fin34e" = "Received_wage_payments_to_a_card",
"fin37" = "Payments_government_transfers",
"fin38" = "Received_a_government_pension",
"fin39a" = "Received_a_government_transfer_or_pension_into_an_account",
"fin39b" = "Received_a_government_transfer_or_pension_to_a_mobile_phone",
"fin39d" = "Received_a_government_transfer_or_pension_in_cash",
"fin39e" = "Received_a_government_transfer_or_pension_to_a_card",
"fin44a" = "Financially_worried_old_age",
"fin44b" = "Financially_worried_medical_cost",
"fin44c" = "Financially_worried_bills",
"fin44d" = "Financially_worried_education",
"fin45" = "Financially_most_worried",
"fin16" = "Saved_for_old_age",
"fin17a" = "Saved_using_an_account_at_a_financial_institution",
"fin17a1" = "Saved_using_a_mobile_money_account",
"fin20" = "Borrowed_for_medical_purposes",
"fin22a" = "Borrowed_in_past_12_months_from_a_financial_institution",
"fin22b" = "Borrowed_from_family_or_friends",
"fin24" = "Main_source_of_emergency_funds",
"fin24a" = "Difficulty_of_emergency_funds_in_30_days",
"fin24b" = "Difficulty_of_emergency_funds_in_7_days",
"fin30" = "Paid_a_utility_bill",
"fin31a" = "Paid_a_utility_bill_using_an_account",
"fin31b" = "Paid_a_utility_bill_using_a_mobile_phone",
"fin31c" = "Paid_a_utility_bill_in_cash",
"fin32" = "Payments_wage_payments",
"fin33" = "Received_public_sector_wage_payments",
"fin11b" = "Reason_for_no_account_too_expensive",
"fin11c" = "Reason_for_no_account_lack_documentation",
"fin11d" = "Reason_for_no_account_lack_trust",
"fin11e" = "Reason_for_no_account_religious_reasons",
"fin11f" = "Reason_for_no_account_lack_money",
"fin11g" = "Reason_for_no_account_family_member_already_has_one",
"fin11h" = "Reason_for_no_account_no_need_for_financial_services",
"fin13a" = "Use_mobile_money_account_two_or_more_times_a_month",
"fin13b" = "Use_mobile_money_account_to_store_money",
"fin13c" = "Use_mobile_money_account_to_borrow_money",
"fin13d" = "Use_mobile_money_account_without_help",
"fin14_1" = "Use_mobile_phone_to_pay_for_a_purchase_in_store",
"fin14a" = "Made_bill_payments_online_using_the_Internet",
"fin14a1" = "Send_money_to_a_relative_or_friend_online_using_the_Internet",
"fin14b" = "Bought_something_online_using_the_Internet",
"account_mob" = "Has_a_mobile_money_account",
"fin2" = "Has_a_debit_card",
"fin4" = "Used_a_debit_card",
"fin5" = "Used_a_mobile_phone_or_internet_to_access_account",
"fin6" = "Used_a_mobile_phone_or_internet_to_check_account_balance",
"fin7" = "Has_a_credit_card",
"fin8" = "Used_a_credit_card",
"fin8b" = "Paid_credit_card_balances_in_full",
"fin9" = "If_has_account_any_deposit_into_account_in_past_12_months",
"fin9a" = "Make_deposits_into_the_account_two_or_more_times_per_month",
"fin10" = "If_has_account_any_withdrawal_from_account_in_past_12_months",
"fin10a" = "Withdrew_from_the_account_two_or_more_times_per_month",
"fin10b" = "Used_account_to_store_money",
"fin11_1" = "Unbanked_use_account_without_help",
"fin11a" = "Reason_for_no_account_too_far",
"economy" = "Economy",
"economycode" = "Economy_Code",
"wpid_random" = "Gallup_World_Poll_identifier",
"wgt" = "Weight",
"female" = "Respondent_is_female",
"age" = "Respondent_age",
"educ" = "Respondent_education_level",
"inc_q" = "Within_economy_household_income_quintile",
"emp_in" = "Respondent_is_in_the_workforce",
"account" = "Has_an_account",
"account_fin" = "Has_an_account_at_a_financial_institution",
"year" = "year",
"ID" = "ID"
)
vars.vec <- c('Respondent_age','Has_an_account_at_a_financial_institution',
           'Has_an_account','Has_a_mobile_money_account','Weight','ID','Respondent_is_female',
           'Respondent_education_level','Respondent_is_in_the_workforce','year',
           'Within_economy_household_income_quintile',"Owns_a_mobile_phone","Saved_in_the_past_year",
            "If_has_account_any_deposit_into_account_in_past_12_months", 
              "If_has_account_any_withdrawal_from_account_in_past_12_months", 
                "Received_wage_payments_in_past_12_months")
              "Borrowed_in_the_past_year",#"Made_or_received_a_digital_payment",
              "Borrowed_in_past_12_months_from_a_financial_institution")

gfi_2021_renamed.tbl  <- gfi_2021.tbl |> 
  rename(
    !!!setNames(
      names(gfi_2021.tbl), 
      names_dict_2021[names(gfi_2021.tbl)]
    )
) |> 
select(-c('Economy','Economy_Code','Gallup_World_Poll_identifier'))

ERROR: Error in parse(text = x, srcfile = src): <text>:150:42: unexpected ','
149:                 "Received_wage_payments_in_past_12_months")
150:               "Borrowed_in_the_past_year",
                                              ^


In [16]:
gfi_2017_renamed.tbl |> select(
    "Borrowed_in_the_past_year"
   # ,"Made_or_received_a_digital_payment"
    ,"Borrowed_in_past_12_months_from_a_financial_institution"
                              ) |> map(~unique(.x)) |> print()

$Borrowed_in_the_past_year
[1] "yes" "0"  

$Borrowed_in_past_12_months_from_a_financial_institution
[1] "no"   "yes"  "(dk)"



In [8]:
gfi_2021_renamed.tbl |> select("Borrowed_in_the_past_year","Made_or_received_a_digital_payment",
              "Borrowed_from_a_financial_institution") |> map(~unique(.x)) |> print()

$Borrowed_in_the_past_year
[1] 1 0

$Made_or_received_a_digital_payment
[1] 1 0

$Borrowed_from_a_financial_institution
[1] 2 1



In [13]:
gfi_2017_renamed.tbl |>
select(-all_of(vars.vec)) |> 
names() |> print()

 [1] "Has_a_debit_card"                                                    
 [2] "If_has_debit_card_card_in_own_name"                                  
 [3] "If_has_debit_card_used_card_in_past_12_months"                       
 [4] "Used_mobile_phone_or_internet_to_access_FI_account"                  
 [5] "Used_mobile_phone_or_internet_to_check_account_balance"              
 [6] "Has_a_credit_card"                                                   
 [7] "If_has_credit_card_used_card_in_past_12_months"                      
 [8] "If_does_not_have_account_b/c_too_far_away"                           
 [9] "If_does_not_have_account_b/c_too_expensive"                          
[10] "If_does_not_have_account_b/c_lack_documentation"                     
[11] "If_does_not_have_account_b/c_lack_trust"                             
[12] "If_does_not_have_account_b/c_religious_reasons"                      
[13] "If_does_not_have_account_b/c_lack_of_money"                          
[14] "If_doe

In [3]:
non_binary_vars.vec <- c('Respondent_age','Weight','ID','Respondent_is_female','Respondent_education_level',
                         'Respondent_is_in_the_workforce','year','Within_economy_household_income_quintile')

binary_vars.vec <- vars.vec[!vars.vec %in% non_binary_vars.vec]


combined.tbl <- gfi_2017_renamed.tbl |> select(all_of(vars.vec)) |> 
  
# Mutate to clean and standardize categorical variables
  mutate(
    Respondent_education_level = case_match(
      Respondent_education_level,
      c("(dk)", "(rf)") ~ NA, # Replace "don't know" or "refused" responses with NA
      .default = Respondent_education_level
    ),
    Respondent_age = case_when(
      is.na(Respondent_age) ~ NA, # Keep missing values as NA
      Respondent_age == "99+" ~ 100, # Convert "99+" to a numeric value
      .default = as.integer(Respondent_age) |> suppressWarnings() # Convert other values to integers
    ),
    Received_wage_payments_in_past_12_months = case_match(
        Received_wage_payments_in_past_12_months,
        c("(dk)", "(rf)") ~ NA, # Replace "don't know" or "refused" responses with NA
        .default = Received_wage_payments_in_past_12_months
    ),
    across(
        all_of(binary_vars.vec),
        ~{case_match(
            .x,
            "yes" ~ "Yes",
            "0" ~ "No",
            "no" ~ "No",
            .default = NA
        ) |> factor(levels = c("No","Yes"),ordered = T)}
    )      
) |> 
bind_rows(
    gfi_2021_renamed.tbl |> select(all_of(vars.vec))  |> 
  # Mutate to clean and standardize categorical variables using case_match.
  mutate(
    # Map numeric values of Respondent_is_female to meaningful labels.
    Respondent_is_female = case_match(
      Respondent_is_female,
      1 ~ "Female",
      2 ~ "Male",
      .default = NA
    ),
    # Map education level codes to descriptive labels.
    Respondent_education_level = case_match(
      Respondent_education_level,
      1 ~ "completed primary or less",
      2 ~ "secondary",
      3 ~ "completed tertiary or more",
      .default = NA
    ),
    # Map workforce participation status to descriptive labels.
    Respondent_is_in_the_workforce = case_match(
      Respondent_is_in_the_workforce,
      1 ~ "in workforce",
      2 ~ "out of workforce",
      .default = NA
    ),
    # Map income quintiles to descriptive labels for within-economy household income.
    Within_economy_household_income_quintile = case_match(
      Within_economy_household_income_quintile,
      1 ~ "Poorest 20%",
      2 ~ "Second 20%",
      3 ~ "Middle 20%",
      4 ~ "Fourth 20%",
      5 ~ "Richest 20%",
      .default = NA
    ),
      Received_wage_payments_in_past_12_months = case_match(
          Received_wage_payments_in_past_12_months,
          c(1,2,3) ~ 1,
          4 ~ 0,
          .default = NA
      ),
      across(
          all_of(binary_vars.vec),
          ~{case_match(
              .x,
              1 ~ "Yes",
              c(0,2) ~ "No",
              .default = NA
          ) |> factor(levels = c("No","Yes"),ordered = T)}
      )
  )
)|>
mutate(
    year = factor(
        year,
        levels = c("2017", "2021"),
        ordered = T
        ),
    # Reorder the `Respondent_is_female` column as a factor (Male first, Female second)
    Respondent_is_female = factor(
        Respondent_is_female, 
        levels = c("Male", "Female"),
        ordered =T
        ),
    Respondent_is_in_the_workforce = factor(
      Respondent_is_in_the_workforce, 
        levels = c("out of workforce", "in workforce"),
        ordered = T
    ),
    Within_economy_household_income_quintile  = factor(
        Within_economy_household_income_quintile, 
        levels = c(
            "Poorest 20%",
            "Second 20%",
            "Middle 20%",
            "Fourth 20%",
            "Richest 20%"
        ),
        ordered = T
    ),
    Respondent_education_level = factor(
        Respondent_education_level, 
        levels = c(
            "completed primary or less",
            "secondary",
            "completed tertiary or more"
        ),
        ordered = T
    )
) |> rename(
    phone = Owns_a_mobile_phone,
    saved = Saved_in_the_past_year,
    female = Respondent_is_female,
    in_wf = Respondent_is_in_the_workforce,
    inc_q = Within_economy_household_income_quintile,
    educ = Respondent_education_level,
    age = Respondent_age,
    fin_account = Has_an_account_at_a_financial_institution,
    account = Has_an_account,
    mm_account = Has_a_mobile_money_account,
    weight = Weight,
    id = ID,
    depos = If_has_account_any_deposit_into_account_in_past_12_months, 
    withdraw = If_has_account_any_withdrawal_from_account_in_past_12_months, 
    wage = Received_wage_payments_in_past_12_months
) |> filter(female == "Female") |> select(-female)

# Process the combined data table to calculate weighted sampling
weighted_sum <- combined.tbl |> 
  mutate(weighted_s = weight / sum(weight)) |>      # Create a column with weights normalized to sum to 1
  slice_sample(                                     # Perform weighted random sampling on the dataset
    n = 1e6,                                       # Number of samples to draw (1 million)
    weight_by = weighted_s,                        # Weights column used for sampling probabilities
    replace = TRUE                                 # Allow sampling with replacement
  )

combined.tbl

age,fin_account,account,mm_account,weight,id,educ,in_wf,year,inc_q,phone,saved,depos,withdraw,wage
<dbl>,<ord>,<ord>,<ord>,<dbl>,<int>,<ord>,<ord>,<ord>,<ord>,<ord>,<ord>,<ord>,<ord>,<ord>
42,Yes,Yes,No,0.6743416,2,completed primary or less,out of workforce,2017,Poorest 20%,Yes,No,Yes,Yes,No
35,Yes,Yes,No,0.9317940,4,secondary,out of workforce,2017,Middle 20%,Yes,No,,,No
37,Yes,Yes,No,1.2513574,8,completed primary or less,out of workforce,2017,Middle 20%,No,No,,,No
20,Yes,Yes,No,1.9209096,9,secondary,out of workforce,2017,Middle 20%,No,Yes,Yes,Yes,No
40,Yes,Yes,No,0.5956590,11,secondary,out of workforce,2017,Fourth 20%,No,Yes,Yes,Yes,No
76,Yes,Yes,No,0.9954276,13,completed tertiary or more,out of workforce,2017,Second 20%,No,No,Yes,Yes,No
48,Yes,Yes,No,1.0036016,14,completed primary or less,in workforce,2017,Fourth 20%,No,No,Yes,Yes,No
37,Yes,Yes,Yes,0.3924855,16,secondary,in workforce,2017,Richest 20%,Yes,No,Yes,Yes,No
32,Yes,Yes,No,1.9986920,18,completed primary or less,out of workforce,2017,Poorest 20%,No,No,Yes,Yes,No
33,Yes,Yes,Yes,0.6895992,19,secondary,out of workforce,2017,Middle 20%,Yes,No,No,Yes,No
