analysis/preprocess/data_eligible_ab.R

################################################################################

# This script:
# - reads the processed data
# - applies eligibility criteria from boxes a and b of Figure 3 in protocol
# - saves preocessed data from eligible individuals
# - records the age range in each JCVI group and saves for later use

################################################################################

## setup
library(tidyverse)
library(lubridate)
library(glue)

# read processed covariates data
data_covs <- readr::read_rds(
  here::here("output", "data", "data_covs.rds"))

# read wide vaccine dates data
data_vax_wide <- readr::read_rds(
  here::here("output", "data", "data_wide_vax_dates.rds"))

# count the number of patients in the extracted data
eligibility_count <- tribble(
  ~description, ~n,
  "Extracted using study_definition", n_distinct(data_covs$patient_id)
)

################################################################################
cat("#### apply exclusion criteria from box a to processed data ####\n")
# remove dummy jcvi_group
data_eligible_a <- data_covs %>%
  filter(!(jcvi_group %in% "99"))

eligibility_count <- eligibility_count %>%
  add_row(
    description = "Samples with JCVI group 99 removed",
    n =  n_distinct(data_eligible_a$patient_id)
  )

# remove if any missing data for key variables
data_eligible_a <- data_eligible_a %>%
  filter(
    !is.na(ethnicity),
    !is.na(sex),
    !is.na(imd_0),
    !is.na(region_0))

eligibility_count <- eligibility_count %>%
  add_row(
    description = "Samples with missing ethnicity, sex, imd, region removed.",
    n =  n_distinct(data_eligible_a$patient_id)
  )

# remove if evidence of covid infection on or before elig_date + 42 days
# COVID admission
data_eligible_a <- data_eligible_a %>%
  filter(is.na(covidadmitted_0_date) | 
           elig_date + days(42) <  covidadmitted_0_date)

eligibility_count <- eligibility_count %>%
  add_row(
    description = "Samples with prior COVID admission removed.",
    n =  n_distinct(data_eligible_a$patient_id)
  )

# positive COVID test
data_eligible_a <- data_eligible_a %>%
  filter(is.na(positive_test_0_date) | 
           elig_date + days(42) <  positive_test_0_date)

eligibility_count <- eligibility_count %>%
  add_row(
    description = "Samples with prior positive COVID test removed.",
    n =  n_distinct(data_eligible_a$patient_id)
  )

# probable COVID 
data_eligible_a <- data_eligible_a %>%
  filter(is.na(primary_care_covid_case_0_date) | 
           elig_date + days(42) <  primary_care_covid_case_0_date)

eligibility_count <- eligibility_count %>%
  add_row(
    description = "Samples with prior probable COVID removed.",
    n =  n_distinct(data_eligible_a$patient_id)
  )

# suspected COVID 
data_eligible_a <- data_eligible_a %>%
  filter(is.na(primary_care_suspected_covid_0_date) | 
           elig_date + days(42) <  primary_care_suspected_covid_0_date) 

eligibility_count <- eligibility_count %>%
  add_row(
    description = "Samples with prior suspected COVID removed.",
    n =  n_distinct(data_eligible_a$patient_id)
  )

readr::write_rds(data_eligible_a %>%
                   select(patient_id, jcvi_group, elig_date, region_0, ethnicity) %>%
                   droplevels(),
                 here::here("output", "data", "data_eligible_a.rds"),
                 compress="gz")

################################################################################
cat("#### apply exclusion criteria from box b to processed data ####\n")
data_eligible_b <- data_eligible_a %>%
  left_join(data_vax_wide, by = "patient_id") %>%
  mutate(
    between_doses = as.numeric(covid_vax_2_date - covid_vax_1_date)/7
  ) %>%
  filter(
    ### inclusion
    # first and second dose same brand
    covid_vax_1_brand == covid_vax_2_brand,
    # second dose must be az or pfizer
    covid_vax_1_brand %in% c("az", "pfizer"),
    # second dose received in interval [elig_date + 6 weeks, elig_date + 16 weeks)
    covid_vax_2_date >= elig_date + weeks(6),
    covid_vax_2_date < elig_date + weeks(16),
    
    ### exclusion
    # first dose received before eligibility date
    covid_vax_1_date > elig_date,
    # less than six or more than 14 weeks between 1st and 2nd dose
    between_doses >= 6,
    between_doses < 14,
    # flagged as hcw
    !hscworker
  ) %>%
  select(patient_id, jcvi_group, elig_date, region_0, ethnicity) %>%
  droplevels()

eligibility_count <- eligibility_count %>%
  add_row(
    description = "Criteria in box b applied.",
    n = n_distinct(data_eligible_b$patient_id)
  )

readr::write_rds(data_eligible_b,
                 here::here("output", "data", "data_eligible_b.rds"),
                 compress="gz")

# number of people eligible at each stage ----
eligibility_count <- eligibility_count %>%
  # round to nearest 10
  mutate(across(n, ~round(.x, -1))) %>%
  mutate(n_removed = lag(n) - n)

readr::write_csv(eligibility_count,
                 here::here("output", "lib", "eligibility_count_ab.csv"))


################################################################################
# jcvi_group, elig_date combos ----
fix_age <- data_covs %>%
  mutate(age = if_else(
    jcvi_group %in% c("10", "11", "12"),
    age_2,
    age_1)) %>%
  select(patient_id, age)

group_age_ranges <- data_eligible_b %>%
  left_join(fix_age, 
            by = "patient_id") %>%
  group_by(jcvi_group, elig_date) %>%
  summarise(min = min(age), max = max(age), .groups = "keep") %>%
  ungroup() 

# check none of the min / max correspond to < 5 individuals
check <- function(x) {
  fix_age %>%
    filter(age %in% x) %>%
    distinct(patient_id) %>%
    nrow(.)
}

group_age_ranges <- group_age_ranges %>%
  mutate(
    check_min = sapply(
      group_age_ranges$min,
      check),
    check_max = sapply(
      group_age_ranges$max,
      check)
  ) %>%
  mutate(age_range = case_when(
    max > 80 ~ glue("{min} +"),
    check_min < 5 | check_max < 5 ~ "[REDACTED]",
    TRUE ~ glue("{min} - {max}")
  )) %>%
  mutate(across(age_range, as.character)) %>%
  select(-ends_with(c("min", "max")))

# save as csv so that it can be checked
readr::write_csv(group_age_ranges,
                 here::here("output", "lib", "group_age_ranges.csv"))