## HERMES - furosemide phenotype

#### Clone HeRmes repository

In [None]:
#system("git clone https://github.com/nicksunderland/heRmes.git")

#### Pushing changes to Github

In [None]:
# in ther terminal run
# cd /opt/notebooks
# dx download git_push.sh
# run bash git_push.sh 

#### Project & record ID

In [None]:
projectid <- "project-GvZyZ20J81vgPJGbJy8pgpyq"
recordid  <- "record-Gvb0Bg0Jfxfv0q8Fb2pXqKjg"

#### Libraries

In [None]:
library(glue)
library(data.table)
library(yaml)
if (!requireNamespace("bit64", quietly = TRUE)) {
  install.packages("bit64")
}
suppressMessages(library(bit64))
source("/opt/notebooks/heRmes/R/ukbb_extraction_utils.R")

## Read in extracted data

In [43]:
# get the extract config
config <- read_yaml("/opt/notebooks/heRmes/scripts/ukbb_extraction_config.yml")

# take the required tables for this phenotyping
config <- config[c("participant", "hesin", "hes_diag", "hes_oper", "gp_clinical", "gp_scripts", "olink_instance_0", "olink_instance_2", "olink_instance_3")]

# function to 

# read data 
data <- list()
for (i in seq_along(config)) {
    
    file_path <- file.path("/mnt/project", config[[i]][["output"]])
    
    if (!file.exists(file_path)) {
        cat(glue("Error:\nFile {file_path} not found, ",
                 "check the Monitor tab for the status of the Table-exporter ",
                 "and the 'hermes_data' folder. If this has finished try ",
                 "launching another Notebook session/instance (I'm not sure why ",
                 "the mounted /mnt/project/ file structure doesn't refresh when ",
                 "files are added externally."), sep="\n")
        stop("file not found error")
    }
    
    n <- names(config)[i]
    cat(glue('...{n}: {file_path}\n'), sep="\n")
    flush.console()
    data[[n]] <- fread(file_path)
    rename_ukbb_cols(data[[n]], col_config=config[[i]][["columns"]])
    
}

lapply(data, head, n = 5)

...participant: /mnt/project/hermes3_data/data_participant.tsv
...hesin: /mnt/project/hermes3_data/data_hesin.tsv
...hes_diag: /mnt/project/hermes3_data/data_hesin_diag.tsv
...hes_oper: /mnt/project/hermes3_data/data_hesin_oper.tsv
...gp_clinical: /mnt/project/hermes3_data/data_gp_clinical.tsv
...gp_scripts: /mnt/project/hermes3_data/data_gp_scripts.tsv




...olink_instance_0: /mnt/project/hermes3_data/data_olink_instance_0.tsv
...olink_instance_2: /mnt/project/hermes3_data/data_olink_instance_2.tsv
...olink_instance_3: /mnt/project/hermes3_data/data_olink_instance_3.tsv


eid,reason_lost_fu,sex,age,sample_collection_date_1,sample_collection_date_2,sample_collection_date_3,sample_collection_date_4,ethnicity_1,ethnicity_2,⋯,self_rep_med_183,self_rep_med_184,self_rep_med_185,self_rep_med_186,self_rep_med_187,self_rep_med_188,self_rep_med_189,self_rep_med_190,self_rep_med_191,self_rep_med_192
<int>,<int>,<int>,<int>,<dttm>,<dttm>,<dttm>,<dttm>,<int>,<int>,⋯,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>
1000074,,0,42,2009-03-20 16:49:16,,,,1001,,⋯,,,,,,,,,,
1000194,,1,60,2009-06-27 11:07:49,,2019-07-03 15:17:56,,1001,,⋯,,,,,,,,,,
1000258,,1,56,2008-05-02 18:19:27,,,,1001,,⋯,,,,,,,,,,
1000280,,1,44,2008-04-28 18:36:55,,2016-09-02 10:37:12,,1001,,⋯,,,,,,,,,,
1000299,,0,45,2009-06-13 15:54:13,,,,1001,,⋯,,,,,,,,,,

eid,ins_index,epistart,admidate
<int>,<int>,<IDate>,<IDate>
3026745,17,2009-04-23,2009-04-23
5469223,105,2013-06-18,2013-06-18
2099936,32,2021-02-18,2021-02-18
5152448,0,1997-05-07,1997-05-07
5944709,19,2017-12-13,2017-12-12

eid,ins_index,diag_icd9,diag_icd10
<int>,<int>,<chr>,<chr>
2097360,0,,D140
2622045,9,,Z800
4723574,4,,N183
5697013,17,,Z888
5858023,42,,M059

eid,ins_index,oper3,oper4
<int>,<int>,<int>,<chr>
2940276,4,,W822
3772102,12,,K634
5994223,4,,E492
4168836,0,,Y819
4239222,0,,Y767

eid,data_provider,date,read_2,read_3,value1,value2,value3
<int>,<int>,<IDate>,<chr>,<chr>,<chr>,<chr>,<chr>
1545611,3,2001-07-11,,XE2JU,,,
3261273,3,2005-02-08,,XaF6J,,,
5677515,3,2015-05-12,,42N..,0.4,,
4823906,3,2014-03-26,,XaFsp,,,
2656445,3,2008-04-16,,2469.,174.0,,

eid,data_provider,date,read_2,bnf_code,dmd_code,drug_name,quantity
<int>,<int>,<IDate>,<chr>,<chr>,<int64>,<chr>,<chr>
2894391,3,2001-11-08,,04.07.02.00.00,,Kapake 30mg/500mg tablets (Galen Ltd),100 tablet(s)
1469022,4,2011-12-28,blf2.,,,,
4507779,3,2015-11-24,,03.01.01.03.00,,Ventolin 100micrograms/dose Evohaler (GlaxoSmithKline UK Ltd),400 dose
1548822,2,2006-12-29,,0501030I0AAABAB,,DOXYCYCLINE CAPSULES 100MG,8.000
5381837,1,2016-06-14,bu51.00,,319799004.0,Clopidogrel 75mg tablets,28.000

eid,ntprobnp,glp1r
<int>,<dbl>,<dbl>
4751267,-0.2371,0.1078
2456178,-0.46295,
2375961,0.5382,-0.1656
1594698,-1.3814,-0.0243
2456602,-1.8147,-0.0173

eid,ntprobnp
<int>,<dbl>
5436027,-0.8305
1463364,2.1767
5373407,-0.9827
1611807,1.0228
3299269,2.1778

eid,ntprobnp
<int>,<dbl>
1463364,0.0978
5373407,0.1798
1611807,0.5227
3299269,1.2781
4911943,3.802


## Data processing

In [44]:
codes <- fread(file.path("/opt/notebooks", "heRmes", "inst", "extdata", "hermes_furosemide_codes", "hermes_furosemide_codes.tsv"))
codes[, code := sub("^'(.+?)'$", "\\1", code)]
head(codes)

concept,code,code_type,description
<chr>,<chr>,<chr>,<chr>
Heart Failure,1076,ukbb_self_reported_illness,heart failure/pulmonary odema
Heart Failure,0205052AEAAAAAA,bnf,Sacubitril/Valsartan_Tab 49mg/51mg
Heart Failure,0205052AEAAABAB,bnf,Sacubitril/Valsartan_Tab 97mg/103mg
Heart Failure,0205052AEAAACAC,bnf,Sacubitril/Valsartan_Tab 24mg/26mg
Heart Failure,0205052AEBBAAAA,bnf,Entresto_Tab 49mg/51mg
Heart Failure,0205052AEBBABAB,bnf,Entresto_Tab 97mg/103mg


### Clean up the cohort data

In [45]:
ethnicity_codes <- list(
  white                 = 1,
  british               = 1001,
  white_black_caribbean = 2001,
  indian                = 3001,
  caribbean             = 4001,
  mixed                 = 2,
  irish                 = 1002,
  white_black_african   = 2002,
  pakistani             = 3002,
  african               = 4002,
  asian_or_asian_british= 3,
  any_other_white       = 1003,
  white_asian           = 2003,
  bangladeshi           = 3003,
  any_other_black       = 4003,
  black_or_black_british= 4,
  any_other_mixed       = 2004,
  any_other_asian       = 3004,
  chinese               = 5,
  other_ethnic_group    = 6)

data$participant[, ethnicity := fcoalesce(.SD), .SDcols = names(data$participant)[grepl("^ethnicity_[0-9]$", names(data$participant))]]

data$demog <- data$participant[, 
    list(eid               = eid,
         reason_lost_fu    = reason_lost_fu,
         age               = as.integer(age),
         sex               = factor(sex, levels = 0:1, labels = c("female", "male")),
         ethnicity         = factor(ethnicity, levels = unlist(ethnicity_codes), labels = names(ethnicity_codes)),
         ethnicity_group   = factor(sub("([0-9])00[0-9]", "\\1", ethnicity), levels = unlist(ethnicity_codes), labels = names(ethnicity_codes)),
         genetic_sex       = factor(genetic_sex, levels = 0:1, labels = c("female", "male")),
         genetic_ethnicity = factor(genetic_ethnicity, levels = 1, labels = c("caucasian")), 
         pc1               = pc1,
         pc2               = pc2,
         pc3               = pc3,
         pc4               = pc4,
         pc5               = pc5)]

### Self-report illness codes to long

In [46]:
self_rep_code_cols <- grep("self_rep_ill_[0-9]+",      names(data$participant), value = TRUE)
self_rep_year_cols <- grep("self_rep_ill_year_[0-9]+", names(data$participant), value = TRUE)
data$participant[, (self_rep_code_cols) := lapply(.SD, as.character), .SDcols = self_rep_code_cols]
data$participant[, (self_rep_year_cols) := lapply(.SD, as.numeric),   .SDcols = self_rep_year_cols]
data$self_illness <- data.table::melt(data$participant,
                                      id.vars = "eid",
                                      measure = patterns("self_rep_ill_[0-9]+", "self_rep_ill_year_[0-9]+"),
                                      variable.name = "element",
                                      value.name = c("code", "year"),
                                      na.rm = TRUE)
data$self_illness <- data$self_illness[year != -1 & year != -3] # unknown / prefer not to answer
data$self_illness[, `:=`(date      = lubridate::ymd(paste0(as.character(floor(year)), "-01-01")) + lubridate::days(as.integer(365.25 * (year - floor(year)))),
                         year      = NULL,
                         element   = NULL,
                         code      = as.character(code),
                         code_type = "ukbb_self_reported_illness")]

# check self report illness table
stopifnot("unable to parse dates for self-reported illness codes" = all(!is.na(data$self_illness$date)))
stopifnot("are you sure something happened before 1900?" = all(data$self_illness$date > as.Date("1900-01-01")))

### Self-report medication codes to long

In [47]:
self_med_code_cols <- grep("self_rep_med_[0-9]+", names(data$participant), value = TRUE)
data$participant[, (self_med_code_cols) := lapply(.SD, as.character), .SDcols = self_med_code_cols]
data$smed <- data.table::melt(data$participant,
                              id.vars = "eid",
                              measure = patterns("self_rep_med_[0-9]+"),
                              variable.name = "element",
                              value.name = c("code"),
                              na.rm = TRUE)
data$smed[, `:=`(date      = as.Date(NA_real_),
                 element   = NULL,
                 code      = as.character(code),
                 code_type = "ukbb_self_reported_medication")]

### OLINK codes to long

In [48]:
data$olink <- rbindlist(list(
    data$olink_instance_0[data$participant, date := i.sample_collection_date_1, on="eid"] |> melt(id.vars=c("eid","date"), measure.vars=c("ntprobnp","glp1r"), value.name="value", variable.name="measure"), 
    data$olink_instance_2[data$participant, date := i.sample_collection_date_3, on="eid"] |> melt(id.vars=c("eid","date"), measure.vars=c("ntprobnp"), value.name="value", variable.name="measure"), 
    data$olink_instance_3[data$participant, date := i.sample_collection_date_4, on="eid"] |> melt(id.vars=c("eid","date"), measure.vars=c("ntprobnp"), value.name="value", variable.name="measure")
)) 
data$olink <- data$olink[, .(eid, date=as.Date(date), code=paste(measure, value, sep="="), code_type = "olink")] 

eid,date,code,code_type
<int>,<date>,<chr>,<chr>
4751267,2008-01-10,ntprobnp=-0.2371,olink
2456178,2009-01-17,ntprobnp=-0.46295,olink
2375961,2009-03-19,ntprobnp=0.5382,olink
1594698,2008-09-16,ntprobnp=-1.3814,olink
2456602,2008-09-16,ntprobnp=-1.8147,olink
3431545,2010-07-06,ntprobnp=0.1996,olink


### Inpatient diagnosis codes

In [49]:
data$hesin[is.na(epistart) | epistart == "", epistart := admidate]
data$hes_diag[data$hesin, date := as.Date(i.epistart), on = c("eid", "ins_index")]
data$hes_diag[diag_icd9 == "", diag_icd9 := NA_character_]
data$hes_diag[diag_icd10 == "", diag_icd10 := NA_character_]
data$hes_diag <- data.table::melt(data$hes_diag,
                                  id.vars = c("eid", "date"),
                                  measure.vars  = c("diag_icd9", "diag_icd10"),
                                  variable.name = "code_type",
                                  value.name = "code",
                                  na.rm = TRUE)
data$hes_diag[, code_type := data.table::fcase(code_type == "diag_icd9", "icd9",
                                               code_type == "diag_icd10", "icd10")]

### GP activity data

In [50]:
data$gp_clinical[read_2 == "", read_2 := NA_character_]
data$gp_clinical[read_3 == "", read_3 := NA_character_]
data$gp_clinical <- data.table::melt(data$gp_clinical,
                                     id.vars = c("eid", "date"),
                                     measure.vars  = c("read_2", "read_3"),
                                     variable.name = "code_type",
                                     value.name = "code",
                                     na.rm = TRUE)
data$gp_clinical[, code_type := data.table::fcase(code_type == "read_2", "read2",
                                                  code_type == "read_3", "read3")]

### GP medication data

In [51]:
data$gp_scripts[read_2 == "", read_2 := NA_character_]
data$gp_scripts[bnf_code == "", bnf_code := NA_character_]
data$gp_scripts[, dmd_code := as.character(dmd_code)]
data$gp_scripts <- data.table::melt(data$gp_scripts,
                                    id.vars = c("eid", "date"),
                                    measure.vars  = c("read_2", "bnf_code", "dmd_code"),
                                    variable.name = "code_type",
                                    value.name = "code",
                                    na.rm = TRUE)
data$gp_scripts[, code_type := data.table::fcase(code_type == "read_2",   "read_med",
                                                 code_type == "bnf_code", "bnf",
                                                 code_type == "dmd_code", "dmd")]

### Combine all codes and annotate concepts

In [52]:
combined <- rbind(data$self_illness, data$smed, data$hes_diag, data$gp_clinical, data$gp_scripts, data$olink)
combined <- codes[combined, on = c("code" = "code", "code_type" = "code_type"), allow.cartesian = TRUE]
head(combined)

concept,code,code_type,description,eid,date
<chr>,<chr>,<chr>,<chr>,<int>,<date>
,1351,ukbb_self_reported_illness,,1000074,2006-07-02
,1086,ukbb_self_reported_illness,,1000194,1962-07-02
,1065,ukbb_self_reported_illness,,1000258,2005-03-15
,1436,ukbb_self_reported_illness,,1000280,1992-05-26
,1154,ukbb_self_reported_illness,,1000299,2007-07-02
,1226,ukbb_self_reported_illness,,1000314,1995-07-02


### Save all ICD-10 codes

In [53]:
library(data.table)
all_icd10_codes <- combined[code_type=="icd10"][, .(all_icd10_codes = paste0(unique(code),collapse=";")), by=eid]
data$demog[all_icd10_codes, all_icd10_codes := i.all_icd10_codes, on="eid"]

### Annotate individuals with code:code_type counts

In [54]:
combined[code_type=="olink", c("concept", "code") := tstrsplit(code, "=", fixed = TRUE)]
combined <- combined[!is.na(concept)] # discard non-annotated codes
combined[, concept := tolower(gsub(" ","_",concept))] # rename
combined[, code := paste0(date,"#",code)]
d <- dcast(combined, eid ~ concept + code_type, value.var = "code",  
           fun.aggregate = function(x) paste(x, collapse = ";"))
d[, heart_failure := ifelse(apply(.SD, 1, function(x) any(x != "" & !is.na(x))), "1", ""), .SDcols = names(d)[grepl("^heart_failure", names(d))]]
d[, loop_diuretic := ifelse(apply(.SD, 1, function(x) any(x != "" & !is.na(x))), "1", ""), .SDcols = names(d)[grepl("^loop_diuretic", names(d))]]

### Combine with full cohort

In [55]:
base_cols <- c("eid", "age", "sex", "ethnicity", "ethnicity_group","genetic_sex", "genetic_ethnicity", paste0("pc",1:5), "all_icd10_codes")
measure_cols <- names(d)[names(d) != "eid"]
cohort <- d[data$demog[,mget(base_cols)], on="eid"]
#head(cohort)
cohort[, (measure_cols) := lapply(.SD, function(x) fifelse(is.na(x),"",x)), .SDcols=measure_cols]
setcolorder(cohort, base_cols)
head(cohort)
                                  
cohort[data$gp_clinical[,"eid"], any_gp := "1", on="eid"][is.na(any_gp), any_gp := ""]
cohort[data$gp_scripts[,"eid"], any_gp_med := "1", on="eid"][is.na(any_gp_med), any_gp_med := ""]
cohort[data$hes_diag[,"eid"], any_hes := "1", on="eid"][is.na(any_hes), any_hes := ""]

measure_cols <- names(cohort)[!names(cohort) %in% base_cols]
summary <- data.table (name = c("total", measure_cols), N = c(nrow(cohort), cohort[, .(sapply(.SD, function(x) sum(!is.na(x) & x!=""))), .SDcols = measure_cols]$V1))
head(summary,100)                                                          

fwrite(summary,
       file = "hermes_furosemide_phenotype_summary.tsv",
       sep  = "\t")
                                                                                              
# write out
fwrite(cohort,
       file = "hermes_furosemide_phenotypes.tsv.gz",
       sep  = "\t")

eid,age,sex,ethnicity,ethnicity_group,genetic_sex,genetic_ethnicity,pc1,pc2,pc3,⋯,heart_failure_read3,heart_failure_read_med,heart_failure_ukbb_self_reported_illness,loop_diuretic_bnf,loop_diuretic_dmd,loop_diuretic_read_med,loop_diuretic_ukbb_self_reported_medication,ntprobnp_olink,heart_failure,loop_diuretic
<int>,<int>,<fct>,<fct>,<fct>,<fct>,<fct>,<dbl>,<dbl>,<dbl>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1000074,42,female,british,white,female,caucasian,-11.2546,3.86747,1.69413,⋯,,,,,,,,,,
1000194,60,male,british,white,male,caucasian,-11.9589,2.58523,-2.53233,⋯,,,,,,,,,,
1000258,56,male,british,white,male,,-12.7644,0.704641,0.550128,⋯,,,,,,,,,,
1000280,44,male,british,white,male,caucasian,-12.3484,4.46579,-1.5814,⋯,,,,,,,,,,
1000299,45,female,british,white,female,caucasian,-12.3153,2.79696,0.408848,⋯,,,,,,,,,,
1000314,56,female,british,white,female,caucasian,-11.3328,2.67884,-0.373868,⋯,,,,,,,,,,


name,N
<chr>,<int>
total,502134
glp1r_olink,53013
heart_failure_bnf,1573
heart_failure_icd10,20147
heart_failure_icd9,20
heart_failure_read2,1014
heart_failure_read3,1563
heart_failure_read_med,1
heart_failure_ukbb_self_reported_illness,398
loop_diuretic_bnf,13895


In [None]:
### Copy output to project

In [None]:
o <- system("dx upload hermes_furosemide_phenotype_summary.tsv hermes_furosemide_phenotypes.tsv.gz --destination hermes_furosemide_data", intern = TRUE)
cat(o, sep = "\n")