# analysis/analysis_wave1_long.R

# Import the WAVE 1 extract.
# On local computer, the output folder is
# ~/Documents/GitHub/DISECT_UK_India_COVID/output
os_data <- read.csv(file = "./output/input_w1.csv")

# Load Libraries
library(rms)
library(survival)
library(broom)
library(tidyverse)

####################################
# DATA PROCESSING
####################################

# Turn every empty string into NA up front; otherwise blank fields would not
# be counted as missing and could go unnoticed.
os_data[os_data == ""] <- NA

# Drop bad records: any row with sex, age, or IMD missing
os_data <- os_data %>%
  filter(!(is.na(sex) | is.na(age) | is.na(imd)))

# Labels for the numeric ethnicity codes. The position of a label in the
# vector is the code it belongs to (code 1 = first label, and so on).
ethnicity_labels <- c("White", "South Asian", "Black", "Mixed", "Other")
ethnicity_16_labels <- c("White British", "White Irish", "Other White",
                         "White + Caribbean", "White + African",
                         "White + Asian", "Other mixed",
                         "Indian", "Pakistani", "Bangladeshi", "Other Asian",
                         "Caribbean", "African", "Other Black",
                         "Chinese", "Other")

# Swap codes for labels; missing or unrecognised codes become "Unknown"
os_data <- os_data %>%
  mutate(
    ethnicity = coalesce(
      ethnicity_labels[match(ethnicity, seq_along(ethnicity_labels))],
      "Unknown"
    ),
    ethnicity_16 = coalesce(
      ethnicity_16_labels[match(ethnicity_16, seq_along(ethnicity_16_labels))],
      "Unknown"
    )
  )

# Binary indicators: a non-missing value in the source column counts as
# "present" (1), a missing value as "absent" (0).
os_data <- os_data %>%
  mutate(
    # Hypertension, as 0/1 and as a text label (the label is easier in tables)
    hypertension_flag = as.numeric(!is.na(hypertension)),
    hypertension_flag_char = ifelse(is.na(hypertension),
                                    "No hypertension", "Hypertension"),
    # Chronic cardiac disease
    cardiac_flag = as.numeric(!is.na(chronic_cardiac_disease)),
    # Chronic kidney disease
    ckd_flag = as.numeric(!is.na(ckd)),
    # Statin
    statin_flag = as.numeric(!is.na(statin)),
    # Obesity: BMI >= 30, or BMI >= 27.5 for the South Asian group.
    # A missing BMI gives a missing obesity status.
    obese = ifelse(bmi >= 30 | (ethnicity == "South Asian" & bmi >= 27.5),
                   "Obese", "Not obese")
  )


# Recall our objectives
# A.To describe the incidence rate of severe COVID-19 (hospitalization, death, and both combined) according to strata of: age group, sex, ethnicity group, NCD group (diabetes, hypertension, obesity), and time period/wave of the pandemic.
# B.To describe the incidence rate of Long COVID according to strata of: age group, sex, ethnicity group, NCD group (diabetes, hypertension, obesity), and time period/wave of the pandemic.

# Time periods of interest are defined as follows:
# 23rd March 2020 to 31st October 2020 (Wave 1),
# 1st November 2020 to 31st March 2021 (wave 2),
# 1st April 2021 to 30th November 2021 (easing restrictions and introduction of widespread vaccination)
# 1st December 2021 to 30th April 2022 (Omicron wave).

# Age in 5 year bands for stratification.
# Bands are closed on the left: [25, 30) is "25-29", and so on. Everything
# below 25 falls in "18-24" and everything from 85 upwards in "85+".
age_breaks <- c(-Inf, seq(25, 85, by = 5), Inf)
age_labels <- c("18-24", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54",
                "55-59", "60-64", "65-69", "70-74", "75-79", "80-84", "85+")

os_data <- os_data %>%
  mutate(
    # cut() returns a factor; convert back so age_cat stays a character column
    age_cat = as.character(cut(age,
                               breaks = age_breaks,
                               labels = age_labels,
                               right = FALSE,
                               include.lowest = TRUE))
  )

####################################
# CREATE OUTCOME VARIABLES
####################################

# OUTCOME A: Severe Covid
# Two criteria are:
# COVID-19 hospitalization (defined as a COVID-19 ICD-10 code in the primary diagnosis field, ascertained from SUS data)
# COVID-19 related death defined as a COVID-19 ICD-10 code anywhere on the death certificate (ascertained from ONS death certificate data).

# Follow-up ends at the EARLIEST of: COVID-19 hospital admission,
# de-registration, death, or the end of the wave.
# Admission and death dates include the DAY; de-registration dates do NOT,
# so the 15th of the month is imputed for them.
# QUESTION - Better to use 1st or 15th? Looks like Rohini's code used 15
# QUESTION - Where is last TPP linkage date?
os_data$covid_hosp_date <- as.Date(os_data$covid_admission_date, format = "%Y-%m-%d")
# There is no separate COVID death date: died_ons_covid_flag_any is a binary
# flag, so the death date + flag together determine whether the outcome happened
os_data$dereg_date <- as.Date(paste0(os_data$dereg_date, "-15"), format = "%Y-%m-%d")
os_data$died_date_ons <- as.Date(os_data$died_date_ons, format = "%Y-%m-%d")

# Give everyone an "End of wave" date, to use in calculating the min
os_data$wave_end <- rep(as.Date("2020-10-31", format = "%Y-%m-%d"), nrow(os_data))

# Earliest of the four dates, computed for all rows at once with pmin().
# Missing dates are ignored; wave_end is never missing, so every row gets a date.
os_data$severe_covid_outcome_date <- as.Date(
  pmin(as.numeric(os_data$covid_hosp_date),
       as.numeric(os_data$dereg_date),
       as.numeric(os_data$died_date_ons),
       as.numeric(os_data$wave_end),
       na.rm = TRUE),
  origin = "1970-01-01"
)

# Event flag. An admission or COVID death only counts as an event when it
# falls on or before the end of follow-up. Flagging every recorded event
# would pair events that happened AFTER de-registration or the wave end with
# a follow-up time that stops at the censoring date.
hosp_event <- !is.na(os_data$covid_hosp_date) &
  os_data$covid_hosp_date <= os_data$severe_covid_outcome_date
# %in% treats a missing COVID-death flag as "no COVID death" rather than NA
covid_death_event <- os_data$died_ons_covid_flag_any %in% 1 &
  !is.na(os_data$died_date_ons) &
  os_data$died_date_ons <= os_data$severe_covid_outcome_date
os_data$severe_covid_flag <- as.numeric(hosp_event | covid_death_event)

# Generate survival object for Cox analyses: days from 2020-03-01 to the
# end of follow-up, plus the event flag.
# NOTE(review): wave 1 is described elsewhere in this script as starting on
# 23rd March 2020 - confirm that 1st March is the intended time origin.
os_data$severe_covid_surv <- survival::Surv(
  as.numeric(os_data$severe_covid_outcome_date) -
    as.numeric(as.Date("2020-03-01", format = "%Y-%m-%d")),
  os_data$severe_covid_flag
)


# OUTCOME B: Long Covid

# Follow-up ends at the EARLIEST of: long covid date, de-registration,
# death, or the end of the wave.
# Death date has DAY included, long covid and de-registration do NOT
# Add 15 as date to these
os_data$long_covid_date <- as.Date(paste0(os_data$long_covid_date, "-15"), format = "%Y-%m-%d")

# Earliest of the four dates, computed for all rows at once with pmin().
# Missing dates are ignored; wave_end is never missing, so every row gets a date.
os_data$long_covid_outcome_date <- as.Date(
  pmin(as.numeric(os_data$long_covid_date),
       as.numeric(os_data$dereg_date),
       as.numeric(os_data$died_date_ons),
       as.numeric(os_data$wave_end),
       na.rm = TRUE),
  origin = "1970-01-01"
)

# Event flag. A long covid record only counts as an event when its date falls
# on or before the end of follow-up. Flagging every record would pair events
# recorded AFTER death, de-registration or the wave end with a follow-up time
# that stops at the censoring date.
# NOTE(review): the long covid day is imputed as the 15th, so a record in the
# same month as an earlier-dated death is treated as censored - confirm this
# is acceptable.
os_data$long_covid_flag <- as.numeric(
  !is.na(os_data$long_covid_date) &
    os_data$long_covid_date <= os_data$long_covid_outcome_date
)

# Generate survival object for Cox analyses: days from 2020-03-01 to the
# end of follow-up, plus the event flag.
os_data$long_covid_surv <- survival::Surv(
  as.numeric(os_data$long_covid_outcome_date) -
    as.numeric(as.Date("2020-03-01", format = "%Y-%m-%d")),
  os_data$long_covid_flag
)


####################################
# SUMMARY TABLES
####################################


# TABLE 1s
# We will describe the proportion of individuals within each ethnicity category and outcome category, and their baseline covariate status at the start of each study period
# QUESTION: What is meant by start of each study period? Different waves?

# Table 1a - Columns = ethnicity (5)
# One row per ethnicity group. Every count column has a matching "_p" column
# holding that count divided by the group size N.
table1a <- os_data %>% group_by(ethnicity) %>% 
  summarize(N = n(),
            mean_age = mean(age),
            sd_age = sd(age),
            male = sum(sex == 'M', na.rm = TRUE),
            male_p = sum(sex == 'M', na.rm = TRUE)/N,
            female = sum(sex == 'F', na.rm = TRUE),
            female_p = sum(sex == 'F', na.rm = TRUE)/N,
            mean_bmi = mean(bmi, na.rm = TRUE),
            sd_bmi = sd(bmi, na.rm = TRUE),
            # QUESTION - include BMI categories as well?
            
            # Smoking
            current_smoke = sum(smoking_status == 'S', na.rm = TRUE),
            current_smoke_p = sum(smoking_status == 'S', na.rm = TRUE)/N,
            ever_smoke = sum(smoking_status == 'E', na.rm = TRUE),
            ever_smoke_p = sum(smoking_status == 'E', na.rm = TRUE)/N,
            non_smoke = sum(smoking_status == 'N', na.rm = TRUE),
            non_smoke_p = sum(smoking_status == 'N', na.rm = TRUE)/N,
            # NA | TRUE is TRUE, so rows with a missing status are counted here
            missing_smoke = sum(smoking_status == 'M' | is.na(smoking_status)),
            missing_smoke_p = sum(smoking_status == 'M' | is.na(smoking_status))/N,
            # QUESTION - Missing as its own category or combine with non-smoker?
            
            # IMD
            imd_1 = sum(imd == 100),
            imd_1_p = sum(imd == 100)/N,
            imd_2 = sum(imd == 200),
            imd_2_p = sum(imd == 200)/N,
            imd_3 = sum(imd == 300),
            imd_3_p = sum(imd == 300)/N,
            # QUESTION: Why only 3 possible values?
            # Geography
            
            # Eligibility for shielding
            # (proportion columns use the _p suffix so they no longer
            # overwrite the count column of the same name)
            eligible_shield = sum(shielding == 1, na.rm = TRUE),
            eligible_shield_p = sum(shielding == 1, na.rm = TRUE)/N,
            
            # Co-morbidities: T1DM, T2DM, hypertension, CVD, CKD
            t1dm = sum(diabetes_type == 'T1DM', na.rm = TRUE),
            t1dm_p = sum(diabetes_type == 'T1DM', na.rm = TRUE)/N,
            t2dm = sum(diabetes_type == 'T2DM', na.rm = TRUE),
            t2dm_p = sum(diabetes_type == 'T2DM', na.rm = TRUE)/N,
            dm_unknown = sum(diabetes_type == 'UNKNOWN_DM', na.rm = TRUE),
            dm_unknown_p = sum(diabetes_type == 'UNKNOWN_DM', na.rm = TRUE)/N,
            hypertens = sum(hypertension_flag == 1),
            hypertens_p = sum(hypertension_flag == 1)/N,
            chronic_cardiac = sum(cardiac_flag == 1),
            chronic_cardiac_p = sum(cardiac_flag == 1)/N,
            chronic_kidney = sum(ckd_flag == 1),
            chronic_kidney_p = sum(ckd_flag == 1)/N,
            
            # Medications: antidiabetic, BP lowering, lipid lowering
            bp_meds = sum(combination_bp_meds > 0, na.rm = TRUE),
            bp_meds_p = sum(combination_bp_meds > 0, na.rm = TRUE)/N,
            statins = sum(statin_flag == 1),
            statins_p = sum(statin_flag == 1)/N,
            # QUESTION: Binary or report number of medications?
            
            # Date of all previous COVID-19 diagnoses in primary care 
            # first_positive_test_date variable, not sure how to report in table?
            
            # Date of all COVID-19 vaccinations
            
            # Household composition: Household size (number of people living in a household), generational composition (single generation, two generation, or multi-generation)
            hh_size_mean = mean(hh_size, na.rm = TRUE),
            hh_size_sd = sd(hh_size, na.rm = TRUE),
            # QUESTION: Use continuous or categorical?

            # Care home residents
            care_home = sum(care_home_type %in% c('PC', 'PN', 'PS'), na.rm = TRUE),
            care_home_p = sum(care_home_type %in% c('PC', 'PN', 'PS'), na.rm = TRUE)/N
            # QUESTION: Didn't know what different codes meant, unfortunately, so just aggregated... Can correct as needed
            
            # Vitamin D
            
            # Previous infections
            ) 

# + Overall column
#os_data %>% 
#  summarize(N = n())
# Paste together with rbind, then rotate
# Note, probably should also round the decimals. Otherwise, very difficult to read.

# QUESTION: Remove care home residents at which step?

# Table 1b - Columns = ethnicity (16)
# Replicate code for 1a once complete
  
  
# Table 1c - Columns = outcomes?

# Create summary table with denominators (number of participants or person-days?) and number of events
# For now, maybe can have BOTH?

# Summarise follow-up and events for both outcomes within each level of one
# stratifying column.
#   data      - data frame holding the severe_covid_surv and long_covid_surv
#               survival columns
#   group_col - name (character string) of the column to stratify by;
#               NULL (the default) puts every row in a single "All" stratum
# Returns one row per stratum. The stratum label is in the first column,
# "Category", so the results for different variables can be stacked.
# Incidence rates (ir_*) are events per 1000 person-days.
summarise_outcomes <- function(data, group_col = NULL) {
  # Start from an ungrouped table so earlier row-wise/grouped state in `data`
  # cannot affect how Category is built
  data <- ungroup(data)
  if (is.null(group_col)) {
    data <- data %>% mutate(Category = 'All')
  } else {
    data <- data %>% mutate(Category = .data[[group_col]])
  }
  data %>%
    group_by(Category) %>%
    summarize(N = n(),
              # Column 1 of a Surv object is follow-up time, column 2 the event flag
              person_days_severe = sum(severe_covid_surv[,1]),
              cases_severe = sum(severe_covid_surv[,2]),
              ir_severe = 1000*cases_severe/person_days_severe,
              #cases_death = sum(),
              # QUESTION: Want to also note # of deaths from covid? If so, should be only deaths not associated with hospitalisation?
              person_days_long = sum(long_covid_surv[,1]),
              cases_long = sum(long_covid_surv[,2]),
              ir_long = 1000*cases_long/person_days_long)
}

outcome_summary_overall <- summarise_outcomes(os_data)
outcome_summary_ethnicity <- summarise_outcomes(os_data, "ethnicity")
outcome_summary_ethnicity_16 <- summarise_outcomes(os_data, "ethnicity_16")
# Also for age strata, sex, comorbidities (diabetes, hypertension, obesity),
outcome_summary_age <- summarise_outcomes(os_data, "age_cat")
outcome_summary_sex <- summarise_outcomes(os_data, "sex")
outcome_summary_diabetes <- summarise_outcomes(os_data, "diabetes_type")
outcome_summary_hypertension <- summarise_outcomes(os_data, "hypertension_flag_char")
outcome_summary_obese <- summarise_outcomes(os_data, "obese")
# AND PANDEMIC WAVE


# Combine all into single table
# Every summary already carries the generic "Category" first column
table1c <- rbind(outcome_summary_overall, outcome_summary_ethnicity, outcome_summary_ethnicity_16, outcome_summary_age, outcome_summary_sex, outcome_summary_diabetes, outcome_summary_hypertension, outcome_summary_obese)


####################################
# ANALYSES - OUTCOME A - SEVERE COVID
####################################

# Unadjusted hazard ratios for severe COVID, one single-covariate Cox model
# per stratifying variable: ethnicity (5 and 16 groups), age band, sex, and
# NCD group (diabetes, hypertension, obesity).
# "NCD group (diabetes, hypertension, obesity), and time period/wave of the pandemic."

# Reference levels for the ethnicity variables
os_data$ethnicity <- os_data$ethnicity %>%
  factor() %>%
  relevel(ref = "White")
os_data$ethnicity_16 <- os_data$ethnicity_16 %>%
  factor() %>%
  relevel(ref = "White British")

# Ethnicity
severe_eth_un <- coxph(severe_covid_surv ~ as.factor(ethnicity),
                       data = os_data) %>%
  tidy(conf.int = TRUE)

# Ethnicity 16
severe_eth_16_un <- coxph(severe_covid_surv ~ as.factor(ethnicity_16),
                          data = os_data) %>%
  tidy(conf.int = TRUE)

# Age categories
severe_age_un <- coxph(severe_covid_surv ~ age_cat, data = os_data) %>%
  tidy(conf.int = TRUE)

# Sex
severe_sex_un <- coxph(severe_covid_surv ~ sex, data = os_data) %>%
  tidy(conf.int = TRUE)

# Co-morbidity (diabetes, hypertension, obesity)
# How to code this? Since some people will have more than 1 of each...
# 3 separate models?
# Diabetes
severe_diab_un <- coxph(severe_covid_surv ~ diabetes_type, data = os_data) %>%
  tidy(conf.int = TRUE)
# Hypertension
severe_htn_un <- coxph(severe_covid_surv ~ hypertension_flag_char,
                       data = os_data) %>%
  tidy(conf.int = TRUE)
# Obesity
severe_obese_un <- coxph(severe_covid_surv ~ obese, data = os_data) %>%
  tidy(conf.int = TRUE)
# QUESTION: Use binary or all categories?


# Stack the per-variable results
unadjusted_severe <- rbind(severe_eth_un, severe_eth_16_un, severe_age_un,
                           severe_sex_un, severe_diab_un, severe_htn_un,
                           severe_obese_un)
# Put the estimate and its confidence limits on the hazard-ratio scale
# (the other columns are left as returned by tidy())
ratio_cols <- c("estimate", "conf.low", "conf.high")
unadjusted_severe[ratio_cols] <- exp(unadjusted_severe[ratio_cols])


# Now, adjusted analysis
adjust_severe <- coxph(
  severe_covid_surv ~ ethnicity + sex + imd + bmi + rcs(age, 3),
  data = os_data
)
prop_hazard_test <- cox.zph(adjust_severe)

####################################
# ANALYSES - OUTCOME B - LONG COVID
####################################


# Unadjusted hazard ratios for long COVID, one single-covariate Cox model per
# stratifying variable: ethnicity (5 and 16 groups), age band, sex, and
# co-morbidity status (diabetes, hypertension, obesity).

# QUESTION: Also stratify by stp as in Rohini's paper?

# Ethnicity
long_eth_un <- coxph(long_covid_surv ~ as.factor(ethnicity),
                     data = os_data) %>%
  tidy(conf.int = TRUE)

# Ethnicity 16
long_eth_16_un <- coxph(long_covid_surv ~ as.factor(ethnicity_16),
                        data = os_data) %>%
  tidy(conf.int = TRUE)

# Age categories
long_age_un <- coxph(long_covid_surv ~ age_cat, data = os_data) %>%
  tidy(conf.int = TRUE)

# Sex
long_sex_un <- coxph(long_covid_surv ~ sex, data = os_data) %>%
  tidy(conf.int = TRUE)

# Co-morbidity (diabetes, hypertension, obesity)
# How to code this? Since some people will have more than 1 of each...
# 3 separate models?
# Diabetes
long_diab_un <- coxph(long_covid_surv ~ diabetes_type, data = os_data) %>%
  tidy(conf.int = TRUE)
# Hypertension
long_htn_un <- coxph(long_covid_surv ~ hypertension_flag_char,
                     data = os_data) %>%
  tidy(conf.int = TRUE)
# Obesity
long_obese_un <- coxph(long_covid_surv ~ obese, data = os_data) %>%
  tidy(conf.int = TRUE)
# QUESTION: Use binary or all categories?


# Stack the per-variable results
unadjusted_long <- rbind(long_eth_un, long_eth_16_un, long_age_un,
                         long_sex_un, long_diab_un, long_htn_un,
                         long_obese_un)
# Put the estimate and its confidence limits on the hazard-ratio scale
# (the other columns are left as returned by tidy())
ratio_cols <- c("estimate", "conf.low", "conf.high")
unadjusted_long[ratio_cols] <- exp(unadjusted_long[ratio_cols])


####################################
# SAVE AND EXPORT DATA
####################################

# Write each result table to the output folder as CSV
write.csv(x = table1a, file = "./output/table1a_w1.csv")
write.csv(x = table1c, file = "./output/table1c_w1.csv")
write.csv(x = unadjusted_severe, file = "./output/unadjusted_severe_w1.csv")
write.csv(x = unadjusted_long, file = "./output/unadjusted_long_w1.csv")