## HERMES - furosemide phenotype

#### Clone HeRmes repository

In [None]:
#system("git clone https://github.com/nicksunderland/heRmes.git")

#### Pushing changes to Github

In [None]:
# In the terminal, run:
# cd /opt/notebooks
# dx download git_push.sh
# bash git_push.sh

#### Project & record ID

In [None]:
# Platform identifiers (note the "project-" / "record-" prefixes). Presumably the
# DNAnexus project and dataset record this notebook works against; they are not
# referenced in the cells visible here -- TODO confirm where they are consumed.
projectid <- "project-GvZyZ20J81vgPJGbJy8pgpyq"
recordid  <- "record-Gvb0Bg0Jfxfv0q8Fb2pXqKjg"

#### Libraries

In [None]:
library(glue)
library(data.table)
library(yaml)
if (!requireNamespace("bit64", quietly = TRUE)) {
  install.packages("bit64")
}
suppressMessages(library(bit64))
source("/opt/notebooks/heRmes/R/ukbb_extraction_utils.R")

## Read in extracted data

In [None]:
# Load the extraction config; each entry is expected to provide an "output"
# path (relative to /mnt/project) and a "columns" specification.
config <- read_yaml("/opt/notebooks/heRmes/scripts/ukbb_extraction_config.yml")

# Tables required for this phenotyping
required_tables <- c("participant", "hesin", "hes_diag", "hes_oper",
                     "gp_clinical", "gp_scripts", "olink_instance_0",
                     "olink_instance_2", "olink_instance_3")

# Fail early with a readable message: subsetting the config by a name it does
# not contain would yield a NULL entry and a cryptic "argument is of length
# zero" error further down.
missing_tables <- setdiff(required_tables, names(config))
if (length(missing_tables) > 0) {
    stop("tables missing from the extraction config: ",
         paste(missing_tables, collapse = ", "), call. = FALSE)
}
config <- config[required_tables]

# Read every required table into a named list of data.tables
data <- list()
for (n in names(config)) {

    file_path <- file.path("/mnt/project", config[[n]][["output"]])

    if (!file.exists(file_path)) {
        cat(glue("Error:\nFile {file_path} not found, ",
                 "check the Monitor tab for the status of the Table-exporter ",
                 "and the 'hermes_data' folder. If this has finished try ",
                 "launching another Notebook session/instance (I'm not sure why ",
                 "the mounted /mnt/project/ file structure doesn't refresh when ",
                 "files are added externally)."), sep="\n")
        stop("file not found error")
    }

    # Progress message, flushed so it appears while the (slow) read is running
    cat(glue('...{n}: {file_path}\n'), sep="\n")
    flush.console()
    data[[n]] <- fread(file_path)

    # Helper sourced from ukbb_extraction_utils.R; its return value is discarded,
    # so it presumably renames the columns by reference -- TODO confirm.
    rename_ukbb_cols(data[[n]], col_config = config[[n]][["columns"]])

}

# Preview the first rows of every table
lapply(data, head, n = 5)

## Data processing

In [None]:
# Location of the furosemide phenotype code list inside the cloned heRmes repo
codes_path <- file.path("/opt/notebooks", "heRmes", "inst", "extdata",
                        "hermes_furosemide_codes", "hermes_furosemide_codes.tsv")
codes <- fread(codes_path)

# Strip one pair of wrapping single quotes from a code when present
codes[, code := sub("^'(.+?)'$", "\\1", code)]
head(codes)

### Clean up the cohort data

In [None]:
# Numeric ethnicity code -> label lookup. Single-digit entries double as the
# group labels: four-digit codes are collapsed to their leading digit below.
ethnicity_codes <- list(
  white                  = 1,
  british                = 1001,
  white_black_caribbean  = 2001,
  indian                 = 3001,
  caribbean              = 4001,
  mixed                  = 2,
  irish                  = 1002,
  white_black_african    = 2002,
  pakistani              = 3002,
  african                = 4002,
  asian_or_asian_british = 3,
  any_other_white        = 1003,
  white_asian            = 2003,
  bangladeshi            = 3003,
  any_other_black        = 4003,
  black_or_black_british = 4,
  any_other_mixed        = 2004,
  any_other_asian        = 3004,
  chinese                = 5,
  other_ethnic_group     = 6
)
ethnicity_levels <- unlist(ethnicity_codes)
ethnicity_labels <- names(ethnicity_codes)

# Take the first non-missing answer across the ethnicity_<n> columns
ethnicity_cols <- grep("^ethnicity_[0-9]$", names(data$participant), value = TRUE)
data$participant[, ethnicity := fcoalesce(.SD), .SDcols = ethnicity_cols]

# One row per participant with typed / labelled baseline covariates.
# Inside j, `ethnicity` refers to the coalesced numeric column created above.
data$demog <- data$participant[, .(
  eid               = eid,
  reason_lost_fu    = reason_lost_fu,
  assessment_age    = as.integer(assessment_age_1),
  assessment_date   = as.Date(assessment_date_1),
  sex               = factor(sex, levels = 0:1, labels = c("female", "male")),
  weight            = as.numeric(weight_1),
  height            = as.numeric(height_1),
  bmi               = as.numeric(bmi_1),
  ethnicity         = factor(ethnicity,
                             levels = ethnicity_levels,
                             labels = ethnicity_labels),
  # e.g. 1001 -> "1", so the detailed code maps onto its top-level group
  ethnicity_group   = factor(sub("([0-9])00[0-9]", "\\1", ethnicity),
                             levels = ethnicity_levels,
                             labels = ethnicity_labels),
  genetic_sex       = factor(genetic_sex, levels = 0:1, labels = c("female", "male")),
  genetic_ethnicity = factor(genetic_ethnicity, levels = 1, labels = c("caucasian")),
  pc1               = pc1,
  pc2               = pc2,
  pc3               = pc3,
  pc4               = pc4,
  pc5               = pc5
)]

### Self-report illness codes to long

In [None]:
# Harmonise the column types first so melt() does not have to coerce
self_rep_code_cols <- grep("self_rep_ill_[0-9]+",      names(data$participant), value = TRUE)
self_rep_year_cols <- grep("self_rep_ill_year_[0-9]+", names(data$participant), value = TRUE)
data$participant[, (self_rep_code_cols) := lapply(.SD, as.character), .SDcols = self_rep_code_cols]
data$participant[, (self_rep_year_cols) := lapply(.SD, as.numeric),   .SDcols = self_rep_year_cols]

# Wide -> long: one row per (participant, illness slot) with its code and year
data$self_illness <- data.table::melt(
    data$participant,
    id.vars       = "eid",
    measure.vars  = patterns("self_rep_ill_[0-9]+", "self_rep_ill_year_[0-9]+"),
    variable.name = "element",
    value.name    = c("code", "year"),
    na.rm         = TRUE
)

# unknown / prefer not to answer
data$self_illness <- data$self_illness[year != -1 & year != -3]

# The year may be fractional: whole part -> 1 January of that year,
# fractional part -> offset in days (365.25 days per year)
data$self_illness[, date := lubridate::ymd(paste0(as.character(floor(year)), "-01-01")) +
                            lubridate::days(as.integer(365.25 * (year - floor(year))))]
data$self_illness[, c("year", "element") := NULL]
data$self_illness[, `:=`(code      = as.character(code),
                         code_type = "ukbb_self_reported_illness")]

# check self report illness table
stopifnot("unable to parse dates for self-reported illness codes" = all(!is.na(data$self_illness$date)))
stopifnot("are you sure something happened before 1900?" = all(data$self_illness$date > as.Date("1900-01-01")))

### Self-report medication codes to long

In [None]:
# Self-reported medication codes: coerce to character, then wide -> long
self_med_code_cols <- grep("self_rep_med_[0-9]+", names(data$participant), value = TRUE)
data$participant[, (self_med_code_cols) := lapply(.SD, as.character), .SDcols = self_med_code_cols]

data$smed <- data.table::melt(
    data$participant,
    id.vars       = "eid",
    measure.vars  = patterns("self_rep_med_[0-9]+"),
    variable.name = "element",
    value.name    = "code",
    na.rm         = TRUE
)

# No date is attached to these records here, so the date column is a typed NA
# that keeps the layout aligned with the other code tables
data$smed[, element := NULL]
data$smed[, `:=`(date      = as.Date(NA_real_),
                 code      = as.character(code),
                 code_type = "ukbb_self_reported_medication")]

### OLINK codes to long

In [None]:
# For every Olink instance: attach that instance's sample collection date from
# the participant table (update join on eid, by reference), then reshape the
# protein columns to long format.
# na.rm = TRUE drops proteins that were not measured. Without it they would be
# serialised below as literal codes such as "ntprobnp=NA" and later be counted
# as measurements; the blood-test reshaping drops missing values the same way.
data$olink <- rbindlist(list(
    data$olink_instance_0[data$participant, date := i.sample_collection_date_1, on = "eid"] |>
        melt(id.vars = c("eid", "date"), measure.vars = c("ntprobnp", "glp1r"),
             value.name = "value", variable.name = "measure", na.rm = TRUE),
    data$olink_instance_2[data$participant, date := i.sample_collection_date_3, on = "eid"] |>
        melt(id.vars = c("eid", "date"), measure.vars = c("ntprobnp"),
             value.name = "value", variable.name = "measure", na.rm = TRUE),
    data$olink_instance_3[data$participant, date := i.sample_collection_date_4, on = "eid"] |>
        melt(id.vars = c("eid", "date"), measure.vars = c("ntprobnp"),
             value.name = "value", variable.name = "measure", na.rm = TRUE)
))

# Encode each measurement as "<measure>=<value>" so it fits the shared
# (eid, date, code, code_type) layout of the other code tables
data$olink <- data$olink[, .(eid,
                             date      = as.Date(date),
                             code      = paste(measure, value, sep = "="),
                             code_type = "olink")]

### Blood tests to long

In [None]:
# Creatinine: pair each value column with the matching sample collection date
creatinine_long <- data.table::melt(
    data$participant,
    id.vars      = "eid",
    measure.vars = patterns(code = "creatinine_(1|2)", date = "sample_collection_date_(1|2)"),
    na.rm        = TRUE
)
creatinine_long[, code := paste0("creatinine=", code)]

# Cystatin C: same reshaping
cystatin_c_long <- data.table::melt(
    data$participant,
    id.vars      = "eid",
    measure.vars = patterns(code = "cystatin_c_(1|2)", date = "sample_collection_date_(1|2)"),
    na.rm        = TRUE
)
cystatin_c_long[, code := paste0("cystatin_c=", code)]

# Stack both tests and bring them into the shared (eid, code, date, code_type) layout
data$bloods <- rbind(creatinine_long, cystatin_c_long)
data$bloods[, variable := NULL]
data$bloods[, `:=`(date      = as.Date(date),
                   code_type = "blood_tests")]

### Inpatient diagnosis codes

In [None]:
# Fall back to the admission date when the episode start is missing or blank
data$hesin[is.na(epistart) | epistart == "", epistart := admidate]

# Date every diagnosis with the start of its episode (update join by reference)
data$hes_diag[data$hesin, date := as.Date(i.epistart), on = c("eid", "ins_index")]

# Blank strings are missing codes; set them to NA so melt(na.rm = TRUE) drops them
icd_cols <- c("diag_icd9", "diag_icd10")
for (col in icd_cols) {
    set(data$hes_diag,
        i     = which(data$hes_diag[[col]] == ""),
        j     = col,
        value = NA_character_)
}

# Wide -> long: one row per recorded diagnosis code
data$hes_diag <- data.table::melt(
    data$hes_diag,
    id.vars       = c("eid", "date"),
    measure.vars  = icd_cols,
    variable.name = "code_type",
    value.name    = "code",
    na.rm         = TRUE
)

# Translate the source column name into the code_type label
icd_labels <- c(diag_icd9 = "icd9", diag_icd10 = "icd10")
data$hes_diag[, code_type := unname(icd_labels[as.character(code_type)])]

### GP activity data

In [None]:
# Blank strings are missing codes; set them to NA so melt(na.rm = TRUE) drops them
read_cols <- c("read_2", "read_3")
for (col in read_cols) {
    set(data$gp_clinical,
        i     = which(data$gp_clinical[[col]] == ""),
        j     = col,
        value = NA_character_)
}

# Wide -> long: one row per recorded Read code
data$gp_clinical <- data.table::melt(
    data$gp_clinical,
    id.vars       = c("eid", "date"),
    measure.vars  = read_cols,
    variable.name = "code_type",
    value.name    = "code",
    na.rm         = TRUE
)

# Translate the source column name into the code_type label
read_labels <- c(read_2 = "read2", read_3 = "read3")
data$gp_clinical[, code_type := unname(read_labels[as.character(code_type)])]

### GP medication data

In [None]:
# Blank strings are missing codes; set them to NA so melt(na.rm = TRUE) drops them
for (col in c("read_2", "bnf_code")) {
    set(data$gp_scripts,
        i     = which(data$gp_scripts[[col]] == ""),
        j     = col,
        value = NA_character_)
}

# All three code columns must share one type before being stacked into `code`
data$gp_scripts[, dmd_code := as.character(dmd_code)]

# Wide -> long: one row per recorded prescription code
data$gp_scripts <- data.table::melt(
    data$gp_scripts,
    id.vars       = c("eid", "date"),
    measure.vars  = c("read_2", "bnf_code", "dmd_code"),
    variable.name = "code_type",
    value.name    = "code",
    na.rm         = TRUE
)

# Translate the source column name into the code_type label
script_labels <- c(read_2 = "read_med", bnf_code = "bnf", dmd_code = "dmd")
data$gp_scripts[, code_type := unname(script_labels[as.character(code_type)])]

### Combine all codes and annotate concepts

In [None]:
# Stack every long-format code table; rbind() on data.tables matches columns by name
combined <- rbind(
    data$self_illness,
    data$smed,
    data$hes_diag,
    data$gp_clinical,
    data$gp_scripts,
    data$bloods,
    data$olink
)

# Annotate each record with the columns of the phenotype code list. Every row of
# `combined` is kept; a code listed more than once yields several rows, hence
# allow.cartesian.
combined <- codes[combined, on = c("code", "code_type"), allow.cartesian = TRUE]
head(combined)

### Save all ICD-10 codes

In [None]:
library(data.table)

# One row per participant: their distinct ICD-10 codes joined with ";"
all_icd10_codes <- combined[code_type == "icd10",
                            .(all_icd10_codes = paste0(unique(code), collapse = ";")),
                            by = eid]

# Attach to the demographics table (update join by reference); participants
# without any ICD-10 record keep NA
data$demog[all_icd10_codes, all_icd10_codes := i.all_icd10_codes, on = "eid"]

### Annotate individuals with code:code_type counts

In [None]:
# Olink / blood-test records carry "<measure>=<value>" in `code`: the measure
# name becomes the concept and the value becomes the code
combined[code_type %in% c("olink", "blood_tests"), c("concept", "code") := tstrsplit(code, "=", fixed = TRUE)]
combined <- combined[!is.na(concept)] # discard non-annotated codes
combined[, concept := tolower(gsub(" ", "_", concept))] # rename
combined[, code := paste0(date, "#", code)] # keep the date with each code: "<date>#<code>"

# One row per participant, one column per concept/code_type pair, holding all
# matching "<date>#<code>" entries joined with ";" ("" when there are none)
d <- dcast(combined, eid ~ concept + code_type, value.var = "code",
           fun.aggregate = function(x) paste(x, collapse = ";"))

# Overall flags: "1" when any column starting with the concept name is non-empty.
# Computed column-wise (vectorised OR) rather than with a row-wise apply(), which
# coerces to a matrix and calls an R function once per participant. Seeding the
# reduction with all-FALSE gives a blank flag, not an error, when no column matches.
for (flag in c("heart_failure", "loop_diuretic")) {
    flag_cols <- grep(paste0("^", flag), names(d), value = TRUE)
    has_any <- Reduce(
        `|`,
        lapply(flag_cols, function(col) !is.na(d[[col]]) & d[[col]] != ""),
        rep(FALSE, nrow(d))
    )
    d[, (flag) := fifelse(has_any, "1", "")]
}

### Combine with full cohort

In [None]:
# Baseline covariates carried over from the demographics table
base_cols <- c("eid", "assessment_age", "assessment_date", "sex",
               "ethnicity", "ethnicity_group", "genetic_sex",
               "genetic_ethnicity", paste0("pc", 1:5), "all_icd10_codes")
measure_cols <- setdiff(names(d), "eid")

# Keep every participant in the demographics table, with or without codes
cohort <- d[data$demog[, base_cols, with = FALSE], on = "eid"]
#head(cohort)

# Participants without any annotated code get "" rather than NA
blank_if_na <- function(x) fifelse(is.na(x), "", x)
cohort[, (measure_cols) := lapply(.SD, blank_if_na), .SDcols = measure_cols]
setcolorder(cohort, base_cols)
head(cohort)

# Data-availability flags: "1" when the participant has at least one record in
# the corresponding long-format table, "" otherwise
activity_tables <- list(any_gp     = data$gp_clinical,
                        any_gp_med = data$gp_scripts,
                        any_hes    = data$hes_diag)
for (flag_col in names(activity_tables)) {
    seen_eids <- activity_tables[[flag_col]][, "eid"]
    cohort[seen_eids, (flag_col) := "1", on = "eid"]
    set(cohort, i = which(is.na(cohort[[flag_col]])), j = flag_col, value = "")
}

# Number of participants with a non-empty value in each non-baseline column
measure_cols <- names(cohort)[!names(cohort) %in% base_cols]
non_empty_n <- vapply(
    measure_cols,
    function(col) sum(!is.na(cohort[[col]]) & cohort[[col]] != ""),
    integer(1)
)
summary <- data.table(name = c("total", measure_cols),
                      N    = c(nrow(cohort), unname(non_empty_n)))
head(summary, 100)

fwrite(summary,
       file = "/opt/notebooks/hermes_furosemide_phenotype_summary.tsv",
       sep  = "\t")

# write out
fwrite(cohort,
       file = "/opt/notebooks/hermes_furosemide_phenotypes.tsv.gz",
       sep  = "\t")

### Copy output to project

In [None]:
# Upload both output files to the project folder with the dx command-line
# client, then echo whatever the command printed
upload_files <- c("/opt/notebooks/hermes_furosemide_phenotype_summary.tsv",
                  "/opt/notebooks/hermes_furosemide_phenotypes.tsv.gz")
upload_cmd <- paste("dx upload",
                    paste(upload_files, collapse = " "),
                    "--destination hermes_furosemide_data")
o <- system(upload_cmd, intern = TRUE)
cat(o, sep = "\n")