## HERMES - furosemide phenotype

#### Clone HeRmes repository

In [None]:
#system("git clone https://github.com/nicksunderland/heRmes.git")

#### Pushing changes to GitHub

In [None]:
# in the terminal run
# cd /opt/notebooks
# dx download git_push.sh
# run bash git_push.sh 

#### Project & record ID

In [None]:
# Platform identifiers for this analysis.
# NOTE(review): presumably the DNAnexus project and dataset-record IDs (the
# notebook shells out to the `dx` CLI elsewhere) - confirm. Neither variable is
# referenced by any other cell visible in this notebook.
projectid <- "project-GvZyZ20J81vgPJGbJy8pgpyq"
recordid  <- "record-Gvb0Bg0Jfxfv0q8Fb2pXqKjg"

#### Libraries

In [4]:
library(glue)
library(data.table)
library(yaml)
if (!requireNamespace("bit64", quietly = TRUE)) {
  install.packages("bit64")
}
suppressMessages(library(bit64))
source("/opt/notebooks/heRmes/R/ukbb_extraction_utils.R")

## Read in extracted data

In [None]:
# Load the extraction config. Each table entry is read below for its `output`
# (file path relative to /mnt/project) and its `columns` (passed to
# rename_ukbb_cols as the column configuration).
config <- read_yaml("/opt/notebooks/heRmes/scripts/ukbb_extraction_config.yml")

# tables required for this phenotyping
required_tables <- c("participant", "hesin", "hes_diag", "hes_oper",
                     "gp_clinical", "gp_scripts",
                     "olink_instance_0", "olink_instance_2", "olink_instance_3")

# Subsetting a list by a name it does not contain yields a NULL element, which
# later fails with an unhelpful "argument is of length zero"; fail early with
# the offending table names instead.
missing_tables <- setdiff(required_tables, names(config))
if (length(missing_tables) > 0) {
    stop("tables missing from the extraction config: ",
         paste(missing_tables, collapse = ", "), call. = FALSE)
}
config <- config[required_tables]

# read each table into a named list of data.tables
data <- list()
for (i in seq_along(config)) {

    n <- names(config)[i]

    # every table entry must name exactly one output file
    output_rel <- config[[i]][["output"]]
    if (!is.character(output_rel) || length(output_rel) != 1L) {
        stop("config entry '", n, "' has no single `output` path", call. = FALSE)
    }
    file_path <- file.path("/mnt/project", output_rel)

    if (!file.exists(file_path)) {
        cat(glue("Error:\nFile {file_path} not found, ",
                 "check the Monitor tab for the status of the Table-exporter ",
                 "and the 'hermes_data' folder. If this has finished try ",
                 "launching another Notebook session/instance (I'm not sure why ",
                 "the mounted /mnt/project/ file structure doesn't refresh when ",
                 "files are added externally."), sep="\n")
        stop("file not found error")
    }

    # progress line, flushed so it appears while the (slow) read is running
    cat(glue('...{n}: {file_path}\n'), sep="\n")
    flush.console()
    data[[n]] <- fread(file_path)
    # return value is not used, so this presumably renames the columns by
    # reference - confirm against ukbb_extraction_utils.R
    rename_ukbb_cols(data[[n]], col_config=config[[i]][["columns"]])

}

# preview the first rows of every table
lapply(data, head, n = 5)

...participant: /mnt/project/hermes3_data/data_participant.tsv
...hesin: /mnt/project/hermes3_data/data_hesin.tsv
...hes_diag: /mnt/project/hermes3_data/data_hesin_diag.tsv
...hes_oper: /mnt/project/hermes3_data/data_hesin_oper.tsv
...gp_clinical: /mnt/project/hermes3_data/data_gp_clinical.tsv


## Data processing

In [6]:
# Read the phenotype code list (columns: concept, code, code_type, description).
codes <- fread(
    file.path("/opt/notebooks", "heRmes", "inst", "extdata",
              "hermes_furosemide_codes", "hermes_furosemide_codes.tsv")
)

# strip a single pair of wrapping single quotes from the code values
codes[, code := sub("^'(.+?)'$", "\\1", code)]

head(codes)

concept,code,code_type,description
<chr>,<chr>,<chr>,<chr>
Heart Failure,1076,ukbb_self_reported_illness,heart failure/pulmonary odema
Heart Failure,0205052AEAAAAAA,bnf,Sacubitril/Valsartan_Tab 49mg/51mg
Heart Failure,0205052AEAAABAB,bnf,Sacubitril/Valsartan_Tab 97mg/103mg
Heart Failure,0205052AEAAACAC,bnf,Sacubitril/Valsartan_Tab 24mg/26mg
Heart Failure,0205052AEBBAAAA,bnf,Entresto_Tab 49mg/51mg
Heart Failure,0205052AEBBABAB,bnf,Entresto_Tab 97mg/103mg


### Clean up the cohort data

In [7]:
# Numeric ethnicity code -> label. Single-digit codes are the group-level
# categories that the four-digit codes are collapsed to further below.
ethnicity_codes <- list(
  white                 = 1,
  british               = 1001,
  white_black_caribbean = 2001,
  indian                = 3001,
  caribbean             = 4001,
  mixed                 = 2,
  irish                 = 1002,
  white_black_african   = 2002,
  pakistani             = 3002,
  african               = 4002,
  asian_or_asian_british= 3,
  any_other_white       = 1003,
  white_asian           = 2003,
  bangladeshi           = 3003,
  any_other_black       = 4003,
  black_or_black_british= 4,
  any_other_mixed       = 2004,
  any_other_asian       = 3004,
  chinese               = 5,
  other_ethnic_group    = 6)

# take the first non-missing value across the ethnicity_<n> columns
ethnicity_instance_cols <- grep("^ethnicity_[0-9]$", names(data$participant), value = TRUE)
data$participant[, ethnicity := fcoalesce(.SD), .SDcols = ethnicity_instance_cols]

# shared factor definitions
ethnicity_levels <- unlist(ethnicity_codes)
ethnicity_labels <- names(ethnicity_codes)
as_sex_factor <- function(x) {
    factor(x, levels = 0:1, labels = c("female", "male"))
}

# one row per participant with typed / labelled demographic fields
data$demog <- data$participant[, .(
    eid               = eid,
    reason_lost_fu    = reason_lost_fu,
    age               = as.integer(age),
    sex               = as_sex_factor(sex),
    ethnicity         = factor(ethnicity,
                               levels = ethnicity_levels,
                               labels = ethnicity_labels),
    # collapse e.g. 1001 -> 1 so the detailed code maps onto its group label
    ethnicity_group   = factor(sub("([0-9])00[0-9]", "\\1", ethnicity),
                               levels = ethnicity_levels,
                               labels = ethnicity_labels),
    genetic_sex       = as_sex_factor(genetic_sex),
    genetic_ethnicity = factor(genetic_ethnicity, levels = 1, labels = c("caucasian")),
    pc1               = pc1,
    pc2               = pc2,
    pc3               = pc3,
    pc4               = pc4,
    pc5               = pc5
)]

### Self-report illness codes to long

In [8]:
# Locate the paired self-reported illness code / year columns and fix their types.
participant_cols   <- names(data$participant)
self_rep_code_cols <- grep("self_rep_ill_[0-9]+",      participant_cols, value = TRUE)
self_rep_year_cols <- grep("self_rep_ill_year_[0-9]+", participant_cols, value = TRUE)
data$participant[, (self_rep_code_cols) := lapply(.SD, as.character), .SDcols = self_rep_code_cols]
data$participant[, (self_rep_year_cols) := lapply(.SD, as.numeric),   .SDcols = self_rep_year_cols]

# Wide -> long: one row per participant per reported illness (code + year).
data$self_illness <- data.table::melt(
    data$participant,
    id.vars       = "eid",
    measure.vars  = patterns("self_rep_ill_[0-9]+", "self_rep_ill_year_[0-9]+"),
    variable.name = "element",
    value.name    = c("code", "year"),
    na.rm         = TRUE
)

# drop the special year values -1 / -3 (unknown / prefer not to answer)
data$self_illness <- data$self_illness[!(year == -1 | year == -3)]

# Turn the fractional year into a date: 1 Jan of the whole year plus the
# fractional part expressed in days.
data$self_illness[, date := {
    whole_year <- floor(year)
    day_offset <- as.integer(365.25 * (year - whole_year))
    lubridate::ymd(paste0(as.character(whole_year), "-01-01")) + lubridate::days(day_offset)
}]
data$self_illness[, code := as.character(code)]
data$self_illness[, code_type := "ukbb_self_reported_illness"]
data$self_illness[, c("year", "element") := NULL]

# sanity checks on the derived dates
stopifnot("unable to parse dates for self-reported illness codes" = all(!is.na(data$self_illness$date)))
stopifnot("are you sure something happened before 1900?" = all(data$self_illness$date > as.Date("1900-01-01")))

### Self-report medication codes to long

In [9]:
# Self-reported medication code columns, stored as character.
self_med_code_cols <- grep("self_rep_med_[0-9]+", names(data$participant), value = TRUE)
data$participant[, (self_med_code_cols) := lapply(.SD, as.character), .SDcols = self_med_code_cols]

# Wide -> long: one row per participant per reported medication.
data$smed <- data.table::melt(
    data$participant,
    id.vars       = "eid",
    measure.vars  = patterns("self_rep_med_[0-9]+"),
    variable.name = "element",
    value.name    = c("code"),
    na.rm         = TRUE
)

# No date is recorded here, so an all-NA Date column is added to keep the same
# columns as the other code tables.
data$smed[, date := as.Date(NA_real_)]
data$smed[, code := as.character(code)]
data$smed[, code_type := "ukbb_self_reported_medication"]
data$smed[, element := NULL]

### OLINK codes to long

In [18]:
# List the participant column names. The original indexed names(data) (the
# table names) with a logical vector built from names(data$participant), which
# returns the table names padded with NA rather than the columns.
# The "." pattern matches every non-empty name; narrow it to select a subset.
participant_cols <- names(data$participant)
print(participant_cols[grepl(".", participant_cols)])

# TODO: OLINK reshaping is not implemented yet; the commented-out code below is
# the self-reported-medication template it is to be adapted from.
# olink_code_cols <- grep("self_rep_med_[0-9]+", names(data$participant), value = TRUE)
# data$participant[, (self_med_code_cols) := lapply(.SD, as.character), .SDcols = self_med_code_cols]
# data$smed <- data.table::melt(data$participant,
#                               id.vars = "eid",
#                               measure = patterns("self_rep_med_[0-9]+"),
#                               variable.name = "element",
#                               value.name = c("code"),
#                               na.rm = TRUE)
# data$smed[, `:=`(date      = as.Date(NA_real_),
#                  element   = NULL,
#                  code      = as.character(code),
#                  code_type = "ukbb_self_reported_medication")]

  [1] "eid"                      "reason_lost_fu"          
  [3] "sex"                      "age"                     
  [5] "sample_collection_date_1" "sample_collection_date_2"
  [7] "sample_collection_date_3" "sample_collection_date_4"
  [9] "ethnicity_1"              "ethnicity_2"             
 [11] "ethnicity_3"              "ethnicity_4"             
 [13] "genetic_sex"              "genetic_ethnicity"       
 [15] "pc1"                      "pc2"                     
 [17] "pc3"                      "pc4"                     
 [19] "pc5"                      "self_rep_ill_1"          
 [21] "self_rep_ill_2"           "self_rep_ill_3"          
 [23] "self_rep_ill_4"           "self_rep_ill_5"          
 [25] "self_rep_ill_6"           "self_rep_ill_7"          
 [27] "self_rep_ill_8"           "self_rep_ill_9"          
 [29] "self_rep_ill_10"          "self_rep_ill_11"         
 [31] "self_rep_ill_12"          "self_rep_ill_13"         
 [33] "self_rep_ill_14"          "self_r

### Inpatient diagnosis codes

In [10]:
# Episodes without a start date fall back to the admission date.
no_epistart <- data$hesin[, which(is.na(epistart) | epistart == "")]
data$hesin[no_epistart, epistart := admidate]

# Attach the episode start date to each diagnosis row (matched on eid + ins_index).
data$hes_diag[data$hesin, date := as.Date(i.epistart), on = c("eid", "ins_index")]

# Blank codes become NA so that melt(na.rm = TRUE) drops them.
for (icd_col in c("diag_icd9", "diag_icd10")) {
    blank_rows <- which(data$hes_diag[[icd_col]] == "")
    set(data$hes_diag, i = blank_rows, j = icd_col, value = NA_character_)
}

# Wide -> long: one row per participant / date / code.
data$hes_diag <- data.table::melt(
    data$hes_diag,
    id.vars       = c("eid", "date"),
    measure.vars  = c("diag_icd9", "diag_icd10"),
    variable.name = "code_type",
    value.name    = "code",
    na.rm         = TRUE
)

# Source column name -> code_type label used in the code list.
icd_code_type <- c(diag_icd9 = "icd9", diag_icd10 = "icd10")
data$hes_diag[, code_type := unname(icd_code_type[as.character(code_type)])]

### GP activity data

In [11]:
# Blank Read codes become NA so that melt(na.rm = TRUE) drops them.
for (read_col in c("read_2", "read_3")) {
    blank_rows <- which(data$gp_clinical[[read_col]] == "")
    set(data$gp_clinical, i = blank_rows, j = read_col, value = NA_character_)
}

# Wide -> long: one row per participant / date / code.
data$gp_clinical <- data.table::melt(
    data$gp_clinical,
    id.vars       = c("eid", "date"),
    measure.vars  = c("read_2", "read_3"),
    variable.name = "code_type",
    value.name    = "code",
    na.rm         = TRUE
)

# Source column name -> code_type label used in the code list.
gp_clinical_code_type <- c(read_2 = "read2", read_3 = "read3")
data$gp_clinical[, code_type := unname(gp_clinical_code_type[as.character(code_type)])]

### GP medication data

In [12]:
# Blank codes become NA so that melt(na.rm = TRUE) drops them.
for (script_col in c("read_2", "bnf_code")) {
    blank_rows <- which(data$gp_scripts[[script_col]] == "")
    set(data$gp_scripts, i = blank_rows, j = script_col, value = NA_character_)
}

# dm+d codes are stored as character so all three code columns share a type.
data$gp_scripts[, dmd_code := as.character(dmd_code)]

# Wide -> long: one row per participant / date / code.
data$gp_scripts <- data.table::melt(
    data$gp_scripts,
    id.vars       = c("eid", "date"),
    measure.vars  = c("read_2", "bnf_code", "dmd_code"),
    variable.name = "code_type",
    value.name    = "code",
    na.rm         = TRUE
)

# Source column name -> code_type label used in the code list.
gp_scripts_code_type <- c(read_2 = "read_med", bnf_code = "bnf", dmd_code = "dmd")
data$gp_scripts[, code_type := unname(gp_scripts_code_type[as.character(code_type)])]

### Combine all codes and annotate concepts

In [13]:
# Stack every long-format code table (columns matched by name).
code_tables <- list(data$self_illness, data$smed, data$hes_diag, data$gp_clinical, data$gp_scripts)
combined <- rbindlist(code_tables, use.names = TRUE)

# Keep every observed code and attach concept / description where the
# (code, code_type) pair appears in the code list; unmatched rows get NA.
combined <- codes[combined, on = c("code", "code_type"), allow.cartesian = TRUE]

head(combined)

concept,code,code_type,description,eid,date
<chr>,<chr>,<chr>,<chr>,<int>,<date>
,1351,ukbb_self_reported_illness,,1000074,2006-07-02
,1086,ukbb_self_reported_illness,,1000194,1962-07-02
,1065,ukbb_self_reported_illness,,1000258,2005-03-15
,1436,ukbb_self_reported_illness,,1000280,1992-05-26
,1154,ukbb_self_reported_illness,,1000299,2007-07-02
,1226,ukbb_self_reported_illness,,1000314,1995-07-02


### Save all ICD-10 codes

In [15]:
# One row per participant: their distinct ICD-10 codes joined with ";".
all_icd10_codes <- combined[
    code_type == "icd10",
    .(all_icd10_codes = paste(unique(code), collapse = ";")),
    by = eid
]

# Add the collapsed string to the demographics table; participants without any
# ICD-10 code keep NA.
data$demog[all_icd10_codes, on = "eid", all_icd10_codes := i.all_icd10_codes]

### Annotate individuals with code:code_type counts

In [None]:
# Keep only codes that matched a concept, then normalise the concept names to
# lower snake case so they make usable column names.
combined <- combined[!is.na(concept)] # discard non-annotated codes
combined[, concept := tolower(gsub(" ", "_", concept))] # rename

# One row per participant, one column per concept + code_type combination,
# holding the ";"-joined codes ("" when the participant has none).
d <- dcast(combined, eid ~ concept + code_type, value.var = "code",
           fun.aggregate = function(x) paste(x, collapse = ";"))

# Flag ("1" / "") rows where any column whose name starts with `prefix` holds a
# non-empty, non-NA value. Vectorised over columns rather than applying a
# function to every row, which avoids matrix coercion and a per-row R call.
flag_any_code <- function(dt, prefix) {
    code_cols <- grep(paste0("^", prefix), names(dt), value = TRUE)
    if (length(code_cols) == 0L) {
        # no code of this concept was observed at all: nobody is flagged
        warning("no columns found for concept '", prefix, "'", call. = FALSE)
        return(rep("", nrow(dt)))
    }
    has_code <- Reduce(`|`, lapply(code_cols, function(col) {
        !is.na(dt[[col]]) & dt[[col]] != ""
    }))
    ifelse(has_code, "1", "")
}

d[, heart_failure := flag_any_code(d, "heart_failure")]
d[, loop_diuretic := flag_any_code(d, "loop_diuretic")]

### Combine with full cohort

In [None]:
# Demographic columns that lead the output; everything else in `d` is a
# phenotype measure column.
base_cols <- c("eid", "age", "sex", "ethnicity", "ethnicity_group", "genetic_sex",
               "genetic_ethnicity", paste0("pc", 1:5), "all_icd10_codes")
measure_cols <- setdiff(names(d), "eid")

# Right-join onto the full cohort so participants with no matching code are
# kept. (The original referenced `data_files`, which is never defined in this
# notebook; the tables live in `data`.)
cohort <- d[data$demog[, base_cols, with = FALSE], on = "eid"]

# participants absent from `d` have NA measures; use "" like everyone else
cohort[, (measure_cols) := lapply(.SD, function(x) fifelse(is.na(x), "", x)), .SDcols = measure_cols]
setcolorder(cohort, base_cols)
head(cohort)

# Data-availability flags: "1" if the participant has at least one row in the
# corresponding source table, "" otherwise. Initialising first guarantees the
# column exists even when nobody matches.
# NOTE(review): the undefined `data_files$gp` / `$med` / `$diag` are taken to
# mean data$gp_clinical / data$gp_scripts / data$hes_diag - confirm.
cohort[, c("any_gp", "any_gp_med", "any_hes") := ""]
cohort[unique(data$gp_clinical[, "eid"]), any_gp     := "1", on = "eid"]
cohort[unique(data$gp_scripts[, "eid"]),  any_gp_med := "1", on = "eid"]
cohort[unique(data$hes_diag[, "eid"]),    any_hes    := "1", on = "eid"]

# Summary table: cohort size plus, per measure column, the number of
# participants with a non-empty value.
measure_cols <- setdiff(names(cohort), base_cols)
n_non_empty <- vapply(measure_cols,
                      function(col) sum(!is.na(cohort[[col]]) & cohort[[col]] != ""),
                      integer(1))
summary <- data.table(name = c("total", measure_cols),
                      N    = c(nrow(cohort), unname(n_non_empty)))
head(summary, 100)

fwrite(summary,
       file = "hermes_furosemide_phenotype_summary.tsv",
       sep  = "\t")

# write out
fwrite(cohort,
       file = "hermes_furosemide_phenotypes.tsv.gz",
       sep  = "\t")

In [None]:
### Copy output to project

In [None]:
# Upload both output files to the project folder with the `dx` CLI and echo
# whatever the command printed.
upload_cmd <- "dx upload hermes_furosemide_phenotype_summary.tsv hermes_furosemide_phenotypes.tsv.gz --destination hermes_furosemide_data"
o <- system(upload_cmd, intern = TRUE)
cat(o, sep = "\n")