# analysis/table_1.R

######################################

# This script
# - produces a table summarising selected clinical and demographic characteristics,
#   for the whole cohort and by treatment group, with small counts redacted
# - saves the table as html and rds

######################################

## Import libraries
library('plyr')
library('tidyverse')
library('here')
library('glue')
library('gt')
library('gtsummary')
library('reshape2')
library('fs')

## Import custom user functions
source(here::here("lib", "functions", "clean_table_names.R"))

## Import command-line arguments
args <- commandArgs(trailingOnly = TRUE)

## Choose which processed cohort to summarise from the first argument.
## Accepted values are "day0" and "day5"; with no argument the default is "day5".
if (length(args) == 0) {
  data_label <- "day5"
} else if (args[[1]] %in% c("day0", "day5")) {
  data_label <- args[[1]]
} else {
  # An argument was supplied but it is not one of the accepted labels
  stop(
    "Unrecognised data label '", args[[1]], "': expected 'day0' or 'day5'",
    call. = FALSE
  )
}

## Disclosure-control settings used when building the table:
## - counts are rounded to the nearest multiple of `rounding_threshold`
## - non-zero counts (or non-counts) at or below `redaction_threshold` are redacted
rounding_threshold <- 1
redaction_threshold <- 10

## Import data: pick the processed cohort file that matches `data_label`
cohort_file <- switch(data_label,
  day5 = "data_processed_day5.rds",
  day0 = "data_processed_day0.rds"
)
# switch() yields NULL for any other label, in which case nothing is read
if (!is.null(cohort_file)) {
  data_cohort <- read_rds(here::here("output", "data", cohort_file))
}

## Format data: add a constant counter column (N) and a single-level
## grouping column (allpop) used as the `by` variable when summarising
data_cohort <- mutate(data_cohort, N = 1, allpop = "All")

## Define variables of interest (kept in table order)
table_vars <- c(
  "N", "allpop", "treatment_strategy_cat",

  ## Demographics
  "ageband", "sex", "ethnicity", "bmi_group", "imdQ5", "smoking_status",

  ## Clinical
  "diabetes", "copd", "dialysis", "cancer", "lung_cancer", "haem_cancer",

  ## Vaccination
  "vaccination_status", "tb_postest_vacc_cat",

  ## Variant
  "variant", "sgtf",

  ## High risk groups
  "high_risk_group",
  "huntingtons_disease_nhsd",
  "myasthenia_gravis_nhsd",
  "motor_neurone_disease_nhsd",
  "multiple_sclerosis_nhsd",
  "solid_organ_transplant_nhsd",
  "hiv_aids_nhsd",
  "immunosupression_nhsd",
  "imid_nhsd",
  "liver_disease_nhsd",
  "ckd_stage_5_nhsd",
  "haematological_disease_nhsd",
  "cancer_opensafely_snomed",
  "downs_syndrome_nhsd",

  ## Geography
  "region_nhs", "rural_urban"
)

counts <- select(data_cohort, all_of(table_vars))

## Populations to tabulate: the full cohort first ("All"), then each
## treatment group as recorded in `treatment_strategy_cat`
pop_levels <- c("All", "Molnupiravir", "Sotrovimab", "Untreated")

## Build one redacted summary column per population and collate them
for (i in seq_along(pop_levels)) {
  pop_label <- pop_levels[i]

  ## The first entry is the whole cohort; the others are treatment subsets
  if (i == 1) {
    data_subset <- counts
  } else {
    data_subset <- subset(counts, treatment_strategy_cat == pop_label)
  }
  subset_n <- nrow(data_subset)

  ## Count every variable; `allpop` only takes the value "All", and the
  ## counts are read from the `stat_1` column below
  counts_summary <- data_subset %>%
    select(-treatment_strategy_cat) %>%
    tbl_summary(
      by = allpop,
      statistic = everything() ~ "{n}"
    )
  counts_summary$inputs$data <- NULL # drop the copy of the data held in the object

  ## Keep rows that carry a count, blank the label of dichotomous variables,
  ## and convert the formatted counts to numbers
  table1 <- counts_summary$table_body %>%
    filter(!is.na(stat_1)) %>%
    mutate(label = case_when(
      var_type == "dichotomous" ~ "",
      TRUE ~ label
    )) %>%
    select(Group = variable, Variable = label, Count = stat_1) %>%
    mutate(
      Count = as.numeric(gsub(",", "", Count)), # strip thousands separators
      Percent = round(Count / subset_n * 100, 1)
    )

  ## Clean names
  table1_clean <- clean_table_names(table1)

  ## Calculate rounded total
  rounded_n <- plyr::round_any(subset_n, rounding_threshold)

  ## Round individual values to rounding threshold and recompute the
  ## percentage and the complement (non-count) from the rounded values
  table1_redacted <- table1_clean %>%
    mutate(
      Count = plyr::round_any(Count, rounding_threshold),
      Percent = round(Count / rounded_n * 100, 1),
      Non_Count = rounded_n - Count
    )

  ## Format as "count (percent%)"
  table1_redacted$Summary <- paste0(
    prettyNum(table1_redacted$Count, big.mark = ","),
    " (",
    format(table1_redacted$Percent, nsmall = 1),
    "%)"
  )
  # Remove spaces generated by decimal formatting
  table1_redacted$Summary <- gsub(" ", "", table1_redacted$Summary, fixed = TRUE)
  # Add first space before (
  table1_redacted$Summary <- gsub("(", " (", table1_redacted$Summary, fixed = TRUE)

  ## Redact any rows with non-zero rounded counts or non-counts <= redaction threshold
  low_count <- table1_redacted$Count > 0 &
    table1_redacted$Count <= redaction_threshold
  low_non_count <- table1_redacted$Non_Count > 0 &
    table1_redacted$Non_Count <= redaction_threshold
  table1_redacted$Summary[low_count | low_non_count] <- "[Redacted]"

  ## Rows whose Variable is "N" show the plain count (no percentage); this
  ## assignment happens after redaction, so it also replaces "[Redacted]"
  is_total <- table1_redacted$Variable == "N"
  table1_redacted$Summary[is_total] <- prettyNum(
    table1_redacted$Count[is_total],
    big.mark = ","
  )

  ## Keep only the formatted column and name it after the population
  table1_redacted <- table1_redacted %>%
    select(-Non_Count, -Count, -Percent)
  names(table1_redacted)[names(table1_redacted) == "Summary"] <- pop_label

  if (i == 1) {
    collated_table <- table1_redacted
  } else {
    collated_table <- collated_table %>%
      left_join(table1_redacted, by = c("Group", "Variable"))
    ## Rows with no match in this population are shown as "--"
    collated_table[[pop_label]][is.na(collated_table[[pop_label]])] <- "--"
  }
}

## Create output directory (resolved from the project root so that both
## outputs land in the same place regardless of the working directory)
output_dir <- here::here("output", "tables")
fs::dir_create(output_dir)

## Save as html/rds
file_name <- paste0("table1_redacted_", data_label)

gtsave(
  gt(collated_table),
  filename = file.path(output_dir, paste0(file_name, ".html"))
)
write_rds(
  collated_table,
  file.path(output_dir, paste0(file_name, ".rds")),
  compress = "gz"
)