analysis/analysis_wave2_severe.R

# Read the wave 2 study extract.
# When run on a local computer, the output folder lives at
# ~/Documents/GitHub/DISECT_UK_India_COVID/output
os_data <- read.csv(file = './output/input_w2.csv')

# Load Libraries
library(rms)
library(survival)
library(broom)
library(tidyverse)

####################################
# DATA PROCESSING
####################################

# Recode empty strings as NA up front; otherwise blank fields would not be
# picked up by the is.na() checks used throughout the rest of the script.
os_data[os_data == ""] <- NA

# Drop unusable records: sex, age and IMD quintile must all be present, and
# the IMD quintile must not be the 'Unknown' category.
os_data <- os_data %>%
  filter(!(is.na(sex) | is.na(age) | is.na(imdQ5) | imdQ5 == 'Unknown'))


# Create household generational composition.
# This MUST happen before children are filtered out of the cohort, because
# the count below relies on every household member still being present.

# Kevin Wing defined generations as
# 0-17 year olds, 18-29 year olds, 30-66 year olds and 67+ year olds.
# Households are then categorised by how many of these generations they hold.

# Step 1: generation category for each person. The conditions are evaluated
# in order, so each band only needs its upper bound.
os_data <- os_data %>%
  mutate(gen_cat = case_when(
    age < 18 ~ "Child",
    age < 30 ~ "Young adult",
    age < 67 ~ "Adult",
    age >= 67 ~ "Elder",
    TRUE ~ "Unknown"
  ))

# Step 2: number of distinct generations present in each household.
# any() is evaluated once per household inside the grouped summary, so the
# four presence indicators can simply be added together (TRUE counts as 1);
# no rowwise() pass is needed and hh_gens is returned ungrouped.
hh_gens <- os_data %>%
  group_by(hh_id) %>%
  dplyr::summarize(
    gen_hh = any(gen_cat == "Child") +
      any(gen_cat == "Young adult") +
      any(gen_cat == "Adult") +
      any(gen_cat == "Elder")
  )

# Step 3: attach the household-level count to every person in the household.
os_data <- left_join(os_data, hh_gens, by = 'hh_id')
# NOTE: This variable currently does not distinguish between 
# multiple people of SAME gen in hh and single occupant hh
# Could transform using info from hh_size?

# Note that we may want to add some flexibility 
# e.g. household of just 29 year old and 30 year old is not multigenerational in reality
# Also, household cannot have more generations than members
# E.g. not possible to have 3 generation household if just 2 people live in it
# QUESTION: Do we want to have indicator of something like
# child AND senior in household?
# As one existing hypothesis is that transmission to elderly is FROM schoolchildren
# So a multigen household including 67+ and 0-17 may want to be modeled as distinct from, e.g.,
# 29 year old and 30 year old.

# Final note, may be possible that there are more members of the household
# Who aren't in the original data, or had missing age and were filtered out
# So, potential for measurement error exists. Interpret with caution!

# Cohort restrictions:
#   1. keep adults (18+) at baseline;
#   2. drop anyone whose COVID-19 admission happened before the wave started
#      (1 November 2020), i.e. who already had the outcome in a previous wave.
#
# Only prior admissions are excluded here. Prior deaths are not filtered:
# the working assumption is that people who died before the wave started do
# not appear in this extract.
os_data <- os_data %>%
  filter(age >= 18) %>%
  filter(is.na(covid_admission_date) |
           as.Date(covid_admission_date) >= as.Date("2020-11-01"))
  

# Replace the numeric ethnicity codes with readable labels.
# Each label vector is indexed by the position of the code in 1:5 (or 1:16);
# codes that are missing or outside that range come back as NA from match()
# and are labelled "Unknown".
os_data <- os_data %>%
  mutate(
    ethnicity = coalesce(
      c("White", "South Asian", "Black", "Mixed", "Other")[match(ethnicity, 1:5)],
      "Unknown"
    ),
    ethnicity_16 = coalesce(
      c("White British", "White Irish", "Other White",
        "White + Caribbean", "White + African", "White + Asian", "Other mixed",
        "Indian", "Pakistani", "Bangladeshi", "Other Asian",
        "Caribbean", "African", "Other Black",
        "Chinese", "Other")[match(ethnicity_16, 1:16)],
      "Unknown"
    )
  )

# Categorise vaccination status by the number of recorded doses.
# The labels must match, character for character, the strings the summary
# tables compare against ("No doses", "One dose", "Two doses", "3+ doses");
# a capitalisation mismatch silently yields zero counts there.
# Conditions are evaluated in order, so each one only needs to test the next
# dose: reaching the second line already implies dose 1 is present, etc.
os_data <- os_data %>%
  mutate(vax_cat = case_when(
    is.na(covid_vaccine_1) ~ "No doses",
    is.na(covid_vaccine_2) ~ "One dose",
    is.na(covid_vaccine_3) ~ "Two doses",
    TRUE ~ "3+ doses"
  ))

# Derive 0/1 indicators for co-morbidities and medications.
# Two patterns are used:
#   * "recorded at all": the source column is non-missing -> 1, otherwise 0;
#   * "count > 0": the source column is a positive count -> 1, otherwise 0
#     (a missing count stays missing).
os_data <- os_data %>%
  mutate(
    # Co-morbidities
    hypertension_flag = ifelse(is.na(hypertension), 0, 1),
    # Character version of the same flag, for easier tables
    hypertension_flag_char = ifelse(is.na(hypertension), 'No hypertension', 'Hypertension'),
    cardiac_flag = ifelse(is.na(chronic_cardiac_disease), 0, 1),
    ckd_flag = ifelse(is.na(ckd), 0, 1),
    # Diabetes: anything recorded other than "NO_DM" counts as diabetes
    diabetes_flag = ifelse(!is.na(diabetes_type) & diabetes_type != "NO_DM", 1, 0),

    # Medications
    bp_lower = ifelse(combination_bp_meds > 0, 1, 0),      # BP lowering
    statin_flag = ifelse(is.na(statin), 0, 1),             # Statins
    ace_flag = ifelse(ace_inhibitors > 0, 1, 0),           # ACE inhibitors
    alpha_flag = ifelse(alpha_blockers > 0, 1, 0),         # Alpha blockers
    arbs_flag = ifelse(arbs > 0, 1, 0),                    # ARBs
    beta_flag = ifelse(betablockers > 0, 1, 0),            # Beta blockers
    calc_flag = ifelse(calcium_channel_blockers > 0, 1, 0), # Calcium channel blockers
    spiro_flag = ifelse(spironolactone > 0, 1, 0),         # Spironolactone
    thiaz_flag = ifelse(thiazide_diuretics > 0, 1, 0),     # Thiazide diuretics
    insulin_flag = ifelse(is.na(insulin), 0, 1),           # Insulin
    oad_flag = ifelse(oad_med > 0, 1, 0)                   # Oral antidiabetics
  )

# BMI-derived variables and vitamin D indicator.
# WHO cut-offs for Asian individuals are lower than the general ones:
#   overweight from 23 (instead of 25), obese from 27.5 (instead of 30).
# `uses_asian_cutoff` is a temporary helper column and is dropped at the end.
os_data <- os_data %>%
  mutate(
    uses_asian_cutoff = ethnicity_16 %in% c("Indian", "Pakistani", "Bangladeshi", "Chinese", "Other Asian"),

    # Binary obesity indicator (missing BMI stays missing)
    obese = ifelse(bmi >= ifelse(uses_asian_cutoff, 27.5, 30), 'Obese', 'Not obese'),

    # Vitamin D: deficiency recorded OR a prescription recorded
    vit_d = ifelse(!is.na(vit_d_deficient) | !is.na(vit_d_prescript), 1, 0),

    # BMI category; conditions are checked in order, so each band only needs
    # its upper bound. Missing BMI falls through to "no category".
    bmi_cat = case_when(
      bmi < 18.5 ~ "Underweight",
      bmi < ifelse(uses_asian_cutoff, 23, 25) ~ "Normal weight",
      bmi < ifelse(uses_asian_cutoff, 27.5, 30) ~ "Overweight",
      bmi >= ifelse(uses_asian_cutoff, 27.5, 30) ~ "Obese",
      TRUE ~ "no category"
    )
  ) %>%
  select(-uses_asian_cutoff)


# Recall our objectives
# A.To describe the incidence rate of severe COVID-19 (hospitalization, death, and both combined) according to strata of: age group, sex, ethnicity group, NCD group (diabetes, hypertension, obesity), and time period/wave of the pandemic.
# B.To describe the incidence rate of Long COVID according to strata of: age group, sex, ethnicity group, NCD group (diabetes, hypertension, obesity), and time period/wave of the pandemic.

# Time periods of interest are defined as follows:
# 23rd March 2020 to 31st October 2020 (Wave 1),
# 1st November 2020 to 31st March 2021 (wave 2),
# 1st April 2021 to 30th November 2021 (easing restrictions and introduction of widespread vaccination)
# 1st December 2021 to 30th April 2022 (Omicron wave).

# Age in 5-year bands for stratification.
# Bands are left-closed ([25, 30), [30, 35), ...); everything below 25 is
# labelled "18-24" and everything from 85 upwards is labelled "85+".
os_data <- os_data %>%
  mutate(age_cat = as.character(cut(
    age,
    breaks = c(-Inf, seq(25, 85, by = 5), Inf),
    labels = c("18-24", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54",
               "55-59", "60-64", "65-69", "70-74", "75-79", "80-84", "85+"),
    right = FALSE,
    include.lowest = TRUE
  )))

####################################
# CREATE OUTCOME VARIABLES
####################################

# OUTCOME A: Severe Covid
# Two criteria are:
# COVID-19 hospitalization (defined as a COVID-19 ICD-10 code in the primary diagnosis field, ascertained from SUS data)
# COVID-19 related death defined as a COVID-19 ICD-10 code anywhere on the death certificate (ascertained from ONS death certificate data). 

# Parse the date columns needed to work out when follow-up ends
# (the earliest of admission, deregistration, death or end of wave).
# Admission and death dates already include the day. The deregistration date
# does not, so "-15" is appended to place it mid-month before parsing.
#
# covid_death_date is a binary flag for now; the death date plus that flag
# are used to determine whether the outcome happened.
#os_data$covid_death_date <- as.Date(paste(os_data$long_covid_date,"-15",sep=""),  format = "%Y-%m-%d")
os_data <- os_data %>%
  mutate(
    covid_hosp_date = as.Date(covid_admission_date, format = "%Y-%m-%d"),
    dereg_date = as.Date(paste0(dereg_date, "-15"), format = "%Y-%m-%d"),
    died_date_ons = as.Date(died_date_ons, format = "%Y-%m-%d"),
    # Everyone gets the same "end of wave" date, used when taking the minimum
    wave_end = as.Date("2021-03-31", format = "%Y-%m-%d")
  )

# Build follow-up end dates, event indicators and survival objects for wave 2.
#
# Follow-up starts on 1 November 2020 and ends at the earliest of the
# relevant dates below (missing dates are ignored; wave_end is never missing,
# so an end date always exists). pmin() is vectorised over rows, so no
# rowwise() pass is needed and os_data is not left in a rowwise state.
wave2_start <- as.Date("2020-11-01", format = "%Y-%m-%d")

# --- Main outcome: severe COVID-19 (hospital admission or COVID death) ---
os_data <- os_data %>%
  mutate(
    severe_covid_outcome_date = pmin(covid_hosp_date, dereg_date, died_date_ons, wave_end,
                                     na.rm = TRUE),
    # An event only counts if it happened on or before the end of the wave.
    # Follow-up time is truncated at wave_end, so an admission or death after
    # that date must be treated as censored rather than as a wave 2 event.
    # `%in% 1` treats a missing COVID death flag as "no COVID death" instead
    # of producing an NA event indicator.
    # NOTE(review): events dated after the (mid-month imputed) deregistration
    # date are still counted, as in the death-only definition below - confirm
    # this is the intended handling of deregistration.
    severe_covid_flag = as.numeric(
      (!is.na(covid_hosp_date) & covid_hosp_date <= wave_end) |
        (died_ons_covid_flag_any %in% 1 & !is.na(died_date_ons) & died_date_ons <= wave_end)
    )
  )

# Survival object for Cox analyses: time is days since the start of the wave
os_data$severe_covid_surv <- survival::Surv(
  as.numeric(os_data$severe_covid_outcome_date) - as.numeric(wave2_start),
  os_data$severe_covid_flag
)

# --- Sensitivity outcome: COVID death only (hospitalisation is ignored) ---
os_data <- os_data %>%
  mutate(
    death_covid_outcome_date = pmin(dereg_date, died_date_ons, wave_end, na.rm = TRUE),
    # Event = a death within the wave that carries the COVID flag on the death
    # certificate. A death without the flag ends follow-up but is not an event.
    death_covid_flag = as.numeric(
      died_ons_covid_flag_any %in% 1 & !is.na(died_date_ons) & died_date_ons <= wave_end
    )
  )

os_data$death_covid_surv <- survival::Surv(
  as.numeric(os_data$death_covid_outcome_date) - as.numeric(wave2_start),
  os_data$death_covid_flag
)

####################################
# SUMMARY TABLES
####################################


# TABLE 1s
# We will describe the proportion of individuals within each ethnicity category and outcome category, and their baseline covariate status at the start of each study period


# Summarise baseline characteristics for each group of a grouped data frame.
#
# Args:
#   grouped_data: os_data (or a copy of it) after group_by(); one output row
#                 is produced per group.
# Returns:
#   A tibble with the grouping column(s) followed by N, then for each
#   characteristic a count column and a matching `<name>_p` proportion column
#   (count / N), plus means and SDs for age, BMI and household size.
#
# Notes:
#   * Every proportion column carries the `_p` suffix. Giving a count and its
#     proportion the same name makes the proportion overwrite the count.
#   * All counts ignore missing values (a missing value is "not in category").
summarise_table1 <- function(grouped_data) {
  # Number of TRUE values, ignoring NA
  n_true <- function(condition) sum(condition, na.rm = TRUE)

  grouped_data %>%
    dplyr::summarize(
      N = n(),
      mean_age = mean(age),
      sd_age = sd(age),
      male = n_true(sex == 'M'),
      male_p = male / N,
      female = n_true(sex == 'F'),
      female_p = female / N,
      mean_bmi = mean(bmi, na.rm = TRUE),
      sd_bmi = sd(bmi, na.rm = TRUE),
      underweight = n_true(bmi_cat == "Underweight"),
      underweight_p = underweight / N,
      normalweight = n_true(bmi_cat == "Normal weight"),
      normalweight_p = normalweight / N,
      overweight = n_true(bmi_cat == "Overweight"),
      overweight_p = overweight / N,
      obese = n_true(bmi_cat == "Obese"),
      obese_p = obese / N,

      # Smoking
      current_smoke = n_true(smoking_status == 'S'),
      current_smoke_p = current_smoke / N,
      ever_smoke = n_true(smoking_status == 'E'),
      ever_smoke_p = ever_smoke / N,
      non_smoke = n_true(smoking_status == 'N'),
      non_smoke_p = non_smoke / N,
      missing_smoke = n_true(smoking_status == 'M' | is.na(smoking_status)),
      missing_smoke_p = missing_smoke / N,
      # QUESTION - Missing as its own category or combine with non-smoker?

      # IMD
      imd_1 = n_true(imdQ5 == "1 (most deprived)"),
      imd_1_p = imd_1 / N,
      imd_2 = n_true(imdQ5 == "2"),
      imd_2_p = imd_2 / N,
      imd_3 = n_true(imdQ5 == "3"),
      imd_3_p = imd_3 / N,
      imd_4 = n_true(imdQ5 == "4"),
      imd_4_p = imd_4 / N,
      imd_5 = n_true(imdQ5 == "5 (least deprived)"),
      imd_5_p = imd_5 / N,

      # Geography (STP)
      # NOTE(review): "STP1".."STP5" look like dummy-data codes - confirm
      # against the STP values present in the real extract.
      stp_1 = n_true(stp == "STP1"),
      stp_1_p = stp_1 / N,
      stp_2 = n_true(stp == "STP2"),
      stp_2_p = stp_2 / N,
      stp_3 = n_true(stp == "STP3"),
      stp_3_p = stp_3 / N,
      stp_4 = n_true(stp == "STP4"),
      stp_4_p = stp_4 / N,
      stp_5 = n_true(stp == "STP5"),
      stp_5_p = stp_5 / N,

      # Eligibility for shielding
      eligible_shield = n_true(shielding == 1),
      eligible_shield_p = eligible_shield / N,

      # Co-morbidities: T1DM, T2DM, hypertension, CVD, CKD
      t1dm = n_true(diabetes_type == 'T1DM'),
      t1dm_p = t1dm / N,
      t2dm = n_true(diabetes_type == 'T2DM'),
      t2dm_p = t2dm / N,
      dm_unknown = n_true(diabetes_type == 'UNKNOWN_DM'),
      dm_unknown_p = dm_unknown / N,
      hypertens = n_true(hypertension_flag == 1),
      hypertens_p = hypertens / N,
      chronic_cardiac = n_true(cardiac_flag == 1),
      chronic_cardiac_p = chronic_cardiac / N,
      chronic_kidney = n_true(ckd_flag == 1),
      chronic_kidney_p = chronic_kidney / N,

      # Medications: antidiabetic, BP lowering, lipid lowering
      bp_meds = n_true(combination_bp_meds > 0),
      bp_meds_p = bp_meds / N,
      statins = n_true(statin_flag == 1),
      statins_p = statins / N,
      ace = n_true(ace_flag == 1),
      ace_p = ace / N,
      alpha = n_true(alpha_flag == 1),
      alpha_p = alpha / N,
      arbs = n_true(arbs_flag == 1),
      arbs_p = arbs / N,
      beta = n_true(beta_flag == 1),
      beta_p = beta / N,
      calcium = n_true(calc_flag == 1),
      calcium_p = calcium / N,
      spiro = n_true(spiro_flag == 1),
      spiro_p = spiro / N,
      thiaz = n_true(thiaz_flag == 1),
      thiaz_p = thiaz / N,
      insulin = n_true(insulin_flag == 1),
      insulin_p = insulin / N,
      oad = n_true(oad_flag == 1),
      oad_p = oad / N,

      # Date of all previous COVID-19 diagnoses in primary care
      # first_positive_test_date variable, not sure how to report in table?

      # Date of all COVID-19 vaccinations
      # QUESTION: How to describe?
      # Currently, provide categorical information on number of doses.
      # Labels are compared case-insensitively so the counts do not depend on
      # how the vax_cat labels happen to be capitalised.
      no_vax = n_true(tolower(vax_cat) == "no doses"),
      no_vax_p = no_vax / N,
      one_vax = n_true(tolower(vax_cat) == "one dose"),
      one_vax_p = one_vax / N,
      two_vax = n_true(tolower(vax_cat) == "two doses"),
      two_vax_p = two_vax / N,
      three_vax = n_true(tolower(vax_cat) == "3+ doses"),
      three_vax_p = three_vax / N,

      # Household composition: household size (number of people living in a
      # household) and generational composition
      hh_size_mean = mean(hh_size, na.rm = TRUE),
      hh_size_sd = sd(hh_size, na.rm = TRUE),

      # Generational composition
      hh_1gen = n_true(gen_hh == 1),
      hh_1gen_p = hh_1gen / N,
      hh_2gen = n_true(gen_hh == 2),
      hh_2gen_p = hh_2gen / N,
      hh_3gen = n_true(gen_hh == 3),
      hh_3gen_p = hh_3gen / N,
      hh_4gen = n_true(gen_hh == 4),
      hh_4gen_p = hh_4gen / N,

      # Care home residents
      # QUESTION: Didn't know what different codes meant, unfortunately, so
      # just aggregated... Can correct as needed
      care_home = n_true(care_home_type %in% c('PC', 'PN', 'PS')),
      care_home_p = care_home / N,

      # Vitamin D
      vitd = n_true(vit_d == 1),
      vitd_p = vitd / N

      # Previous infections
    )
}

# Table 1a - Columns = ethnicity (5)
table1a <- os_data %>%
  group_by(ethnicity) %>%
  summarise_table1()

# Overall column: put everyone into a single 'Overall' group. The label is
# written into the `ethnicity` column of this temporary copy only, so the
# result has the same first column name as table1a and can be stacked on it.
table1a_overall <- os_data %>%
  mutate(ethnicity = 'Overall') %>%
  group_by(ethnicity) %>%
  summarise_table1()

# Paste together with rbind
table1a <- rbind(table1a, table1a_overall)

# Note, probably should also round the decimals. Otherwise, very difficult to read.

# QUESTION: Remove care home residents at which step?

# Table 1b - Columns = ethnicity (16)
# Replicate code for 1a once complete
# Compute the Table 1b baseline-characteristic columns for every group of an
# already-grouped data frame.
#
# grouped_data: a grouped data frame (e.g. os_data after group_by()).
# Returns: one row per group. Counts are stored under a plain name
#   (e.g. `male`) and the matching share of the group under the same name
#   with a `_p` suffix (e.g. `male_p`), so no count is overwritten by its
#   proportion.
#
# Defining the columns once guarantees that the per-ethnicity rows and the
# overall row have identical columns, which the rbind() below requires.
summarise_table1b <- function(grouped_data) {
  grouped_data %>%
    dplyr::summarize(
      N = n(),

      # Age and sex
      mean_age = mean(age),
      sd_age = sd(age),
      male = sum(sex == 'M', na.rm = TRUE),
      male_p = male / N,
      female = sum(sex == 'F', na.rm = TRUE),
      female_p = female / N,

      # BMI
      mean_bmi = mean(bmi, na.rm = TRUE),
      sd_bmi = sd(bmi, na.rm = TRUE),
      underweight = sum(bmi_cat == "Underweight", na.rm = TRUE),
      underweight_p = underweight / N,
      normalweight = sum(bmi_cat == "Normal weight", na.rm = TRUE),
      normalweight_p = normalweight / N,
      overweight = sum(bmi_cat == "Overweight", na.rm = TRUE),
      overweight_p = overweight / N,
      # From here on `obese` refers to this count, not the input column
      obese = sum(bmi_cat == "Obese", na.rm = TRUE),
      obese_p = obese / N,

      # Smoking
      current_smoke = sum(smoking_status == 'S', na.rm = TRUE),
      current_smoke_p = current_smoke / N,
      ever_smoke = sum(smoking_status == 'E', na.rm = TRUE),
      ever_smoke_p = ever_smoke / N,
      non_smoke = sum(smoking_status == 'N', na.rm = TRUE),
      non_smoke_p = non_smoke / N,
      # Coded-missing ('M') and NA are counted together
      missing_smoke = sum(smoking_status == 'M' | is.na(smoking_status)),
      missing_smoke_p = missing_smoke / N,
      # QUESTION - Missing as its own category or combine with non-smoker?

      # IMD
      imd_1 = sum(imdQ5 == "1 (most deprived)"),
      imd_1_p = imd_1 / N,
      imd_2 = sum(imdQ5 == "2"),
      imd_2_p = imd_2 / N,
      imd_3 = sum(imdQ5 == "3"),
      imd_3_p = imd_3 / N,
      imd_4 = sum(imdQ5 == "4"),
      imd_4_p = imd_4 / N,
      imd_5 = sum(imdQ5 == "5 (least deprived)"),
      imd_5_p = imd_5 / N,

      # Geography (STP)
      stp_1 = sum(stp == "STP1", na.rm = TRUE),
      stp_1_p = stp_1 / N,
      stp_2 = sum(stp == "STP2", na.rm = TRUE),
      stp_2_p = stp_2 / N,
      stp_3 = sum(stp == "STP3", na.rm = TRUE),
      stp_3_p = stp_3 / N,
      stp_4 = sum(stp == "STP4", na.rm = TRUE),
      stp_4_p = stp_4 / N,
      stp_5 = sum(stp == "STP5", na.rm = TRUE),
      stp_5_p = stp_5 / N,

      # Eligibility for shielding
      eligible_shield = sum(shielding == 1, na.rm = TRUE),
      eligible_shield_p = eligible_shield / N,

      # Co-morbidities: T1DM, T2DM, hypertension, CVD, CKD
      t1dm = sum(diabetes_type == 'T1DM', na.rm = TRUE),
      t1dm_p = t1dm / N,
      t2dm = sum(diabetes_type == 'T2DM', na.rm = TRUE),
      t2dm_p = t2dm / N,
      dm_unknown = sum(diabetes_type == 'UNKNOWN_DM', na.rm = TRUE),
      dm_unknown_p = dm_unknown / N,
      hypertens = sum(hypertension_flag == 1),
      hypertens_p = hypertens / N,
      chronic_cardiac = sum(cardiac_flag == 1),
      chronic_cardiac_p = chronic_cardiac / N,
      chronic_kidney = sum(ckd_flag == 1),
      chronic_kidney_p = chronic_kidney / N,

      # Medications: antidiabetic, BP lowering, lipid lowering
      bp_meds = sum(combination_bp_meds > 0, na.rm = TRUE),
      bp_meds_p = bp_meds / N,
      statins = sum(statin_flag == 1),
      statins_p = statins / N,
      ace = sum(ace_flag == 1),
      ace_p = ace / N,
      alpha = sum(alpha_flag == 1),
      alpha_p = alpha / N,
      arbs = sum(arbs_flag == 1),
      arbs_p = arbs / N,
      beta = sum(beta_flag == 1),
      beta_p = beta / N,
      calcium = sum(calc_flag == 1),
      calcium_p = calcium / N,
      spiro = sum(spiro_flag == 1),
      spiro_p = spiro / N,
      thiaz = sum(thiaz_flag == 1),
      thiaz_p = thiaz / N,
      insulin = sum(insulin_flag == 1),
      insulin_p = insulin / N,
      oad = sum(oad_flag == 1),
      oad_p = oad / N,

      # Date of all previous COVID-19 diagnoses in primary care
      # first_positive_test_date variable, not sure how to report in table?

      # Date of all COVID-19 vaccinations
      # QUESTION: How to describe?
      # Currently, provide categorical information on number of doses
      no_vax = sum(vax_cat == "No doses", na.rm = TRUE),
      no_vax_p = no_vax / N,
      one_vax = sum(vax_cat == "One dose", na.rm = TRUE),
      one_vax_p = one_vax / N,
      two_vax = sum(vax_cat == "Two doses", na.rm = TRUE),
      two_vax_p = two_vax / N,
      three_vax = sum(vax_cat == "3+ doses", na.rm = TRUE),
      three_vax_p = three_vax / N,

      # Household composition: household size (number of people living in a
      # household) and generational composition
      hh_size_mean = mean(hh_size, na.rm = TRUE),
      hh_size_sd = sd(hh_size, na.rm = TRUE),

      # Generational composition (number of generations in the household)
      hh_1gen = sum(gen_hh == 1, na.rm = TRUE),
      hh_1gen_p = hh_1gen / N,
      hh_2gen = sum(gen_hh == 2, na.rm = TRUE),
      hh_2gen_p = hh_2gen / N,
      hh_3gen = sum(gen_hh == 3, na.rm = TRUE),
      hh_3gen_p = hh_3gen / N,
      hh_4gen = sum(gen_hh == 4, na.rm = TRUE),
      hh_4gen_p = hh_4gen / N,

      # Care home residents
      care_home = sum(care_home_type %in% c('PC', 'PN', 'PS'), na.rm = TRUE),
      care_home_p = care_home / N,
      # QUESTION: Didn't know what different codes meant, unfortunately, so just aggregated... Can correct as needed

      # Vitamin D
      vitd = sum(vit_d == 1, na.rm = TRUE),
      vitd_p = vitd / N

      # Previous infections
    )
}

# One row per 16-level ethnicity group
table1b <- os_data %>%
  group_by(ethnicity_16) %>%
  summarise_table1b()

# Add overall column: a constant grouping column yields a single 'All' row
table1b_overall <- os_data %>%
  mutate(ethnicity_16_all = 'All') %>%
  group_by(ethnicity_16_all) %>%
  summarise_table1b()

# Modify column name so tables will join properly
colnames(table1b_overall)[1] <- 'ethnicity_16'

# Paste together with rbind
table1b <- rbind(table1b, table1b_overall)


# Table 1c - Columns = outcomes?

# Create summary table with denominators (number of participants or person-days?) and number of events
# For now, maybe can have BOTH?
# Summarise both outcomes for every group of a data frame grouped by exactly
# ONE variable.
#
# grouped_data: a data frame grouped by a single column and containing the
#   columns severe_covid_surv and death_covid_surv, which are indexed here as
#   two-column objects.
# Returns: one row per group with
#   Category            the group label (grouping column renamed)
#   N                   number of rows in the group
#   person_days_*       sum of column 1 of the corresponding *_surv object
#   cases_*             sum of column 2 of the corresponding *_surv object
#   ir_*                1000 * cases / person_days
#
# The first column is renamed to the generic "Category" so that summaries for
# different grouping variables can be stacked with rbind().
summarise_outcomes <- function(grouped_data) {
  outcome_rows <- grouped_data %>%
    dplyr::summarize(
      N = n(),
      person_days_severe = sum(severe_covid_surv[,1]),
      cases_severe = sum(severe_covid_surv[,2]),
      ir_severe = 1000*cases_severe/person_days_severe,
      person_days_death = sum(death_covid_surv[,1]),
      cases_death = sum(death_covid_surv[,2]),
      ir_death = 1000*cases_death/person_days_death
    )
  colnames(outcome_rows)[1] <- "Category"
  outcome_rows
}

# Whole cohort: a constant grouping column gives a single 'All' row
outcome_summary_overall <- os_data %>%
  mutate(Category = 'All') %>%
  group_by(Category) %>%
  summarise_outcomes()

# Ethnicity (broad and 16-level groupings)
outcome_summary_ethnicity <- os_data %>%
  group_by(ethnicity) %>%
  summarise_outcomes()
outcome_summary_ethnicity_16 <- os_data %>%
  group_by(ethnicity_16) %>%
  summarise_outcomes()

# Also for age strata, sex, and comorbidities (diabetes, hypertension, obesity)
outcome_summary_age <- os_data %>%
  group_by(age_cat) %>%
  summarise_outcomes()
outcome_summary_sex <- os_data %>%
  group_by(sex) %>%
  summarise_outcomes()
outcome_summary_diabetes <- os_data %>%
  group_by(diabetes_type) %>%
  summarise_outcomes()
outcome_summary_hypertension <- os_data %>%
  group_by(hypertension_flag_char) %>%
  summarise_outcomes()
outcome_summary_obese <- os_data %>%
  group_by(obese) %>%
  summarise_outcomes()

# Combine all into single table
# NOTE(review): once stacked, the Category column mixes labels from different
# variables, so a row no longer records which variable it came from - confirm
# the labels are unambiguous in the exported table.
table1c <- rbind(
  outcome_summary_overall,
  outcome_summary_ethnicity,
  outcome_summary_ethnicity_16,
  outcome_summary_age,
  outcome_summary_sex,
  outcome_summary_diabetes,
  outcome_summary_hypertension,
  outcome_summary_obese
)


# Table 1d - Multiple Stratification of outcomes
# Groups are ethnicity/age/sex/diabetes/htn
# Outcome counts and incidence rates for every combination of
# ethnicity / age category / sex / diabetes flag / hypertension flag.
table1d <- os_data %>%
  group_by(ethnicity, age_cat, sex, diabetes_flag, hypertension_flag) %>%
  dplyr::summarize(N = n(),
                   person_days_severe = sum(severe_covid_surv[,1]),
                   cases_severe = sum(severe_covid_surv[,2]),
                   ir_severe = 1000*cases_severe/person_days_severe,
                   person_days_death = sum(death_covid_surv[,1]),
                   cases_death = sum(death_covid_surv[,2]),
                   ir_death = 1000*cases_death/person_days_death) %>%
  ungroup() %>%
  # Add the combinations that never occur in the data: they get N = 0 and NA
  # in every other summary column.
  complete(ethnicity, age_cat, sex, diabetes_flag, hypertension_flag, fill = list(N = 0))

# Censor any sparse rows
# Saved as separate copy
# Strata with 5 or fewer people have N replaced by a text label; the label
# states the threshold that is actually applied (<= 5).
# NOTE(review): only N is masked here - the case counts and person-days of
# sparse strata are left as they are. Confirm whether they need masking too.
table1d_cens <- table1d
table1d_cens$N <- ifelse(table1d$N <= 5, '<=5', as.character(table1d$N))


####################################
# ANALYSES - OUTCOME A - SEVERE COVID
####################################

# Unadjusted hazard ratios, reported within each stratum
# generate absolute rates of each outcome stratified by age, sex, ethnicity, and co-morbidity status
# "NCD group (diabetes, hypertension, obesity), and time period/wave of the pandemic."

# First, set reference ethnicities, ages, and co-morbid
# First, set the reference levels for the two ethnicity codings
os_data$ethnicity <- relevel(factor(os_data$ethnicity), ref = "White")
os_data$ethnicity_16 <- relevel(factor(os_data$ethnicity_16), ref = "White British")

# Fit one unadjusted Cox model on os_data and return its coefficient table.
#
# model_formula: a formula with severe_covid_surv on the left-hand side.
# Returns: the broom::tidy() table for the fit, including confidence limits
#   (still on the log scale at this point).
fit_unadjusted_severe <- function(model_formula) {
  tidy(coxph(formula = model_formula, data = os_data), conf.int = TRUE)
}

# Ethnicity
severe_eth_un <- fit_unadjusted_severe(severe_covid_surv ~ as.factor(ethnicity))

# Ethnicity 16
severe_eth_16_un <- fit_unadjusted_severe(severe_covid_surv ~ as.factor(ethnicity_16))

# Age categories
severe_age_un <- fit_unadjusted_severe(severe_covid_surv ~ age_cat)

# Sex
severe_sex_un <- fit_unadjusted_severe(severe_covid_surv ~ sex)

# BMI Categories
severe_bmi_un <- fit_unadjusted_severe(severe_covid_surv ~ bmi_cat)

# Co-morbidity (diabetes, hypertension, obesity)
# How to code this? Since some people will have more than 1 of each...
# 3 separate models?
# Diabetes
severe_diab_un <- fit_unadjusted_severe(severe_covid_surv ~ diabetes_type)
# Hypertension
severe_htn_un <- fit_unadjusted_severe(severe_covid_surv ~ hypertension_flag_char)
# Obesity
severe_obese_un <- fit_unadjusted_severe(severe_covid_surv ~ obese)
# QUESTION: Use binary or all categories?


# Stack the coefficient tables of all single-predictor models
unadjusted_severe <- rbind(
  severe_eth_un,
  severe_eth_16_un,
  severe_age_un,
  severe_sex_un,
  severe_bmi_un,
  severe_diab_un,
  severe_htn_un,
  severe_obese_un
)

# Exponentiate the estimate and its confidence limits for the final table.
# The columns are selected by name rather than by position so that a change in
# the column layout cannot silently exponentiate the wrong columns. The
# remaining columns (e.g. the standard error) are left untouched.
ratio_cols <- c("estimate", "conf.low", "conf.high")
unadjusted_severe[, ratio_cols] <- exp(unadjusted_severe[, ratio_cols])


# Now, adjusted analysis
#adjust_severe <- coxph(severe_covid_surv ~ ethnicity + sex + imdQ5 + bmi_cat + rcs(age, 3), data = os_data)
#prop_hazard_test <- cox.zph(adjust_severe)

# NOTE: cox.zph results in an error, likely due to sparsity in some cells
# Need to look through tables to determine where this is occurring

####################################
# SAVE AND EXPORT DATA
####################################

# Each output path is paired with the table written to it; the files are
# written in the order listed.
# NOTE(review): the stratified incidence file is written from table1d, not
# from the censored copy table1d_cens - confirm this is intended before the
# output leaves the secure environment.
csv_exports <- list(
  './output/table1a_w2.csv' = table1a,
  './output/table1b_w2.csv' = table1b,
  './output/severe_incidence_w2.csv' = table1c,
  './output/severe_incidence_strata_w2.csv' = table1d,
  './output/unadjusted_severe_w2.csv' = unadjusted_severe
)
for (csv_path in names(csv_exports)) {
  write.csv(csv_exports[[csv_path]], file = csv_path)
}