## HERMES - furosemide phenotype

#### Clone HeRmes repository

In [None]:
#system("git clone https://github.com/nicksunderland/heRmes.git")

#### Pushing changes to Github

In [None]:
# in the terminal run
# cd /opt/notebooks
# dx download git_push.sh
# run bash git_push.sh 

#### Project & record ID

In [1]:
# Project and dataset-record identifiers. They are joined as "<projectid>:<recordid>"
# further down to form the dataset reference handed to `dx extract_dataset`.
projectid <- "project-GvZyZ20J81vgPJGbJy8pgpyq"
recordid  <- "record-Gvb0Bg0Jfxfv0q8Fb2pXqKjg"

#### Libraries

In [17]:
library(glue)
library(data.table)
library(yaml)
source("/opt/notebooks/heRmes/R/ukbb_extraction_utils.R")

## Extract data

#### Download data dictionary

In [13]:
# Work from the notebook root: the dictionary files are looked up (and later
# outputs are written) with paths relative to the working directory.
setwd("/opt/notebooks")

# Dataset reference in "<project>:<record>" form.
dataset <- glue("{projectid}:{recordid}")

# Dump the dataset's dictionary files (`-ddd`) into the working directory.
cmd <- glue("dx extract_dataset {dataset} -ddd")
status <- system(cmd)
# system() returns the command's exit status; fail here rather than letting a
# failed download surface later as a confusing "file not found".
if (status != 0L) {
    stop("`", cmd, "` failed with exit status ", status, call. = FALSE)
}

# Locate the dictionary files and keep the data dictionary for later filtering.
dict_files <- list.files(pattern="codings|data_dictionary|entity_dictionary")
data_dict_file <- dict_files[grepl("data_dictionary", dict_files)]
if (length(data_dict_file) == 0L) {
    stop("no data_dictionary file found in ", getwd(),
         " after running `", cmd, "`", call. = FALSE)
}
if (length(data_dict_file) > 1L) {
    # NOTE(review): downstream helpers presumably expect a single path - confirm.
    warning("more than one data_dictionary file found: ",
            paste(data_dict_file, collapse = ", "), call. = FALSE)
}

#### Read the extraction config file

In [20]:
# Load the extraction config; each top-level entry describes one table to
# extract and supplies the `entity` and `columns` used below.
config <- read_yaml("/opt/notebooks/heRmes/scripts/extraction_config.yml")

# Build one filtered slice of the data dictionary per configured table.
extraction_template <- lapply(
    config,
    function(tbl) {
        filter_data_dict(data_dict_file, tbl$entity, tbl$columns)
    }
)

# Preview the first three rows of every template.
lapply(extraction_template, head, n = 3)

item,entity,name,type,primary_key_type,coding_name,concept,description,folder_path,is_multi_select,is_sparse_coding,linkout,longitudinal_axis_type,referenced_entity_field,relationship,title,units
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>
eid,participant,eid,string,global,,,,Participant Information,,,,,,,Participant ID,
reason_lost_fu,participant,p190,integer,,data_coding_1965,,,Population characteristics > Ongoing characteristics,,,http://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=190,,,,Reason lost to follow-up,
sex,participant,p31,integer,,data_coding_9,,,Population characteristics > Baseline characteristics,,,http://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=31,,,,Sex,

item,entity,name,type,primary_key_type,coding_name,concept,description,folder_path,is_multi_select,is_sparse_coding,linkout,longitudinal_axis_type,referenced_entity_field,relationship,title,units
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>
eid,hesin,eid,string,,,,,,,,,,participant:eid,many_to_one,Participant ID,
ins_index,hesin,ins_index,integer,,,,,,,,,,,,Instance index,
epistart,hesin,epistart,date,,,,,,,,,,,,Episode start date,

item,entity,name,type,primary_key_type,coding_name,concept,description,folder_path,is_multi_select,is_sparse_coding,linkout,longitudinal_axis_type,referenced_entity_field,relationship,title,units
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>
eid,hesin_diag,eid,string,,,,,,,,,,,,Participant ID,
ins_index,hesin_diag,ins_index,integer,,,,,,,,,,,,Instance index,
diag_icd9,hesin_diag,diag_icd9,string,,data_coding_87,,,,,,,,,,Diagnoses - ICD9,

item,entity,name,type,primary_key_type,coding_name,concept,description,folder_path,is_multi_select,is_sparse_coding,linkout,longitudinal_axis_type,referenced_entity_field,relationship,title,units
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>
eid,hesin_oper,eid,string,,,,,,,,,,,,Participant ID,
ins_index,hesin_oper,ins_index,integer,,,,,,,,,,,,Instance index,
oper3,hesin_oper,oper3,string,,data_coding_259,,,,,,,,,,Operative procedures - OPCS3,

item,entity,name,type,primary_key_type,coding_name,concept,description,folder_path,is_multi_select,is_sparse_coding,linkout,longitudinal_axis_type,referenced_entity_field,relationship,title,units
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>
eid,gp_clinical,eid,string,,,,,,,,,,participant:eid,many_to_one,Participant ID,
data_provider,gp_clinical,data_provider,string,,data_coding_626,,,,,,,,,,Data provider,
date,gp_clinical,event_dt,date,,data_coding_819,,,,,yes,,,,,Date clinical code was entered,

item,entity,name,type,primary_key_type,coding_name,concept,description,folder_path,is_multi_select,is_sparse_coding,linkout,longitudinal_axis_type,referenced_entity_field,relationship,title,units
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>
eid,gp_scripts,eid,string,,,,,,,,,,participant:eid,many_to_one,Participant ID,
data_provider,gp_scripts,data_provider,string,,data_coding_626,,,,,,,,,,Data provider,
date,gp_scripts,issue_date,date,,data_coding_819,,,,,yes,,,,,Date prescription was issued,

item,entity,name,type,primary_key_type,coding_name,concept,description,folder_path,is_multi_select,is_sparse_coding,linkout,longitudinal_axis_type,referenced_entity_field,relationship,title,units
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>
eid,olink_instance_0,eid,string,local,,,,,,,,,participant:eid,one_to_one,Participant ID,
ntprobnp,olink_instance_0,ntprobnp,float,,,,,,,,,,,,NTproBNP;N-terminal prohormone of brain natriuretic peptide,
glp1r,olink_instance_0,glp1r,float,,,,,,,,,,,,GLP1R;Glucagon-like peptide 1 receptor,

item,entity,name,type,primary_key_type,coding_name,concept,description,folder_path,is_multi_select,is_sparse_coding,linkout,longitudinal_axis_type,referenced_entity_field,relationship,title,units
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>
eid,olink_instance_2,eid,string,local,,,,,,,,,participant:eid,one_to_one,Participant ID,
ntprobnp,olink_instance_2,ntprobnp,float,,,,,,,,,,,,NTproBNP;N-terminal prohormone of brain natriuretic peptide,

item,entity,name,type,primary_key_type,coding_name,concept,description,folder_path,is_multi_select,is_sparse_coding,linkout,longitudinal_axis_type,referenced_entity_field,relationship,title,units
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>
eid,olink_instance_3,eid,string,local,,,,,,,,,participant:eid,one_to_one,Participant ID,
ntprobnp,olink_instance_3,ntprobnp,float,,,,,,,,,,,,NTproBNP;N-terminal prohormone of brain natriuretic peptide,


#### Run Table-Exporter extraction

In [None]:
# Launch an extraction for every configured table whose output file is not
# already present; tables that were extracted earlier are skipped.
for (table in names(config)) {
    tbl_cfg <- config[[table]]

    if (file.exists(tbl_cfg[["output"]])) {
        next
    }

    extract_data(dataset = dataset,
                 fields  = extraction_template[[table]],
                 entity  = tbl_cfg[["entity"]],
                 output  = tbl_cfg[["output"]])
}

# Per-table extraction: for each table, run extract_data() only when the
# expected output file does not exist yet.
# NOTE(review): `data_file_paths` and the `*_data_dict` objects used below are not
# assigned anywhere in the visible notebook; presumably this is left over from an
# earlier version of the step (the config-driven loop above covers the same
# ground) - confirm they are defined elsewhere or retire this block.

# participant-level tables (demographics, self-reported illness / medication)
if (!file.exists(data_file_paths$demog)) {
    extract_data(dataset=dataset, fields=participant_data_dict$name, entity="participant", output = "data_participant")
}
if (!file.exists(data_file_paths$self)) {
    extract_data(dataset=dataset, fields=self_rep_data_dict$name,    entity="participant", output = "data_selfreportedillness")
}
if (!file.exists(data_file_paths$smed)) {
    extract_data(dataset=dataset, fields=self_med_data_dict$name,    entity="participant", output = "data_selfreportedmedication")
}
# hospital episode tables (episodes, diagnoses, operations)
if (!file.exists(data_file_paths$hesin)) {
    extract_data(dataset=dataset, fields=hes_data_dict$name,         entity="hesin",       output = "data_hesin")
}
if (!file.exists(data_file_paths$diag)) {
    extract_data(dataset=dataset, fields=hesdiag_data_dict$name,     entity="hesin_diag",  output = "data_hesin_diag")
}
if (!file.exists(data_file_paths$oper)) {
    extract_data(dataset=dataset, fields=hesoper_data_dict$name,     entity="hesin_oper",  output = "data_hesin_oper")
}
# primary-care tables (clinical events, prescriptions)
if (!file.exists(data_file_paths$gp)) {
    extract_data(dataset=dataset, fields=gp_clinical_data_dict$name, entity="gp_clinical", output = "data_gp_clinical")
}
if (!file.exists(data_file_paths$med)) {
    extract_data(dataset=dataset, fields=gp_medication_data_dict$name, entity="gp_scripts", output = "data_gp_medication")
}

## Read in extracted data

In [None]:
# Read every extracted table into `data_files`, keyed by the name of its entry
# in `data_file_paths`. Stops at the first file that cannot be found.
data_files <- list()

for (i in seq_along(data_file_paths)) {
    f <- data_file_paths[[i]]
    n <- names(data_file_paths)[i]

    if (file.exists(f)) {
        # progress line, then read the table (quote="" disables quote handling)
        cat(glue('...{n}: {f}\n'), sep="\n")
        flush.console()
        data_files[[n]] <- fread(f, quote="")
    } else {
        # print guidance for the user before raising the error
        cat(glue("Error:\nFile {basename(data_file_paths[[i]])} not found, ",
                 "check the Monitor tab for the status of the Table-exporter ",
                 "and the 'hermes_data' folder. If this has finished try ",
                 "launching another Notebook session/instance (I'm not sure why ",
                 "the mounted /mnt/project/ file structure doesn't refresh when ",
                 "files are added externally."))
        stop("file not found error")
    }
}

#lapply(data_files, head, n = 5)

## Rename columns

In [None]:
#' Rename extracted columns according to a code structure
#'
#' @param d A data.table; its columns are renamed by reference via `setnames()`.
#' @param code_struc A named list. Each element name is the new column name and
#'   each element is a list with `name` (the source column name or prefix) and
#'   `search`, one of:
#'   * `"matches"`    - the single column called `name` is renamed to the
#'     element name;
#'   * `"startswith"` - every column whose name starts with `name` is renamed
#'     to `<element name>_1`, `<element name>_2`, ... in column order.
#'   Entries with any other `search` value are ignored.
#' @return `d`, with the renamed columns.
rename_cols <- function(d, code_struc) {
    for (col in names(code_struc)) {
        spec <- code_struc[[col]]

        if (spec$search == "matches") {
            setnames(d, spec$name, col)
        } else if (spec$search == "startswith") {
            regex   <- paste0("^", spec$name)
            matches <- grep(regex, names(d), value = TRUE)

            # No column carries this prefix: `1:length(matches)` would expand to
            # c(1, 0) and hand setnames() two new names for zero old ones.
            if (length(matches) == 0L) {
                warning("rename_cols: no column starts with '", spec$name,
                        "'; '", col, "' was not created", call. = FALSE)
                next
            }

            new_names <- paste0(col, "_", seq_along(matches))
            setnames(d, matches, new_names)
        }
    }
    return(d)
}

# Rename the columns of each extracted table using its own code structure.
# NOTE(review): the `code_struc` objects passed here are not assigned in the
# visible notebook - confirm where they come from.
data_files$demog <- rename_cols(d = data_files$demog, code_struc = participant_codes)
data_files$self  <- rename_cols(d = data_files$self,  code_struc = self_illness_codes)
data_files$smed  <- rename_cols(d = data_files$smed,  code_struc = self_medication_codes)
data_files$hesin <- rename_cols(d = data_files$hesin, code_struc = hesin_to_extract)
data_files$diag  <- rename_cols(d = data_files$diag,  code_struc = hesdiag_to_extract)
data_files$oper  <- rename_cols(d = data_files$oper,  code_struc = hesproc_to_extract)
data_files$gp    <- rename_cols(d = data_files$gp,    code_struc = gp_clinical_codes)
data_files$med   <- rename_cols(d = data_files$med,   code_struc = gp_medication_codes)

#lapply(data_files, head, n = 1)

## Data processing

In [None]:
# Load the furosemide phenotype code list shipped inside the cloned repository
# (path is relative to the working directory).
codes_path <- file.path("heRmes", "inst", "extdata", "hermes_furosemide_codes", "hermes_furosemide_codes.tsv")
codes <- fread(codes_path)

# Strip a wrapping pair of single quotes from codes that have one.
codes[, code := sub(pattern = "^'(.+?)'$", replacement = "\\1", x = code)]
head(codes)

### Clean up the cohort data

In [None]:
# Lookup of ethnicity label -> numeric code. The values are used as factor
# `levels` and the names as factor `labels` when the demographics table is
# recoded, so the order of the entries fixes the order of the factor levels.
# Single-digit codes are the top-level groups; four-digit codes of the form
# X00Y are collapsed to their leading digit X to derive the group.
ethnicity_codes <- list(
  white                 = 1,
  british               = 1001,
  white_black_caribbean = 2001,
  indian                = 3001,
  caribbean             = 4001,
  mixed                 = 2,
  irish                 = 1002,
  white_black_african   = 2002,
  pakistani             = 3002,
  african               = 4002,
  asian_or_asian_british= 3,
  any_other_white       = 1003,
  white_asian           = 2003,
  bangladeshi           = 3003,
  any_other_black       = 4003,
  black_or_black_british= 4,
  any_other_mixed       = 2004,
  any_other_asian       = 3004,
  chinese               = 5,
  other_ethnic_group    = 6)

# Collapse the numbered ethnicity columns (ethnicity_<digit>) into a single
# column holding the first non-missing value per participant.
ethnicity_cols <- grep("^ethnicity_[0-9]$", names(data_files$demog), value = TRUE)
data_files$demog[, ethnicity := fcoalesce(.SD), .SDcols = ethnicity_cols]

# Keep the analysis columns only, recoding the coded fields as labelled factors.
# `ethnicity_group` maps four-digit X00Y codes onto their top-level group X.
ethnicity_levels <- unlist(ethnicity_codes)
ethnicity_labels <- names(ethnicity_codes)
sex_labels       <- c("female", "male")

data_files$demog <- data_files$demog[, .(
    eid               = eid,
    reason_lost_fu    = reason_lost_fu,
    age               = as.integer(age),
    sex               = factor(sex, levels = c(0L, 1L), labels = sex_labels),
    ethnicity         = factor(ethnicity,
                               levels = ethnicity_levels,
                               labels = ethnicity_labels),
    ethnicity_group   = factor(sub("([0-9])00[0-9]", "\\1", ethnicity),
                               levels = ethnicity_levels,
                               labels = ethnicity_labels),
    genetic_sex       = factor(genetic_sex, levels = c(0L, 1L), labels = sex_labels),
    genetic_ethnicity = factor(genetic_ethnicity, levels = 1, labels = "caucasian"),
    pc1               = pc1,
    pc2               = pc2,
    pc3               = pc3,
    pc4               = pc4,
    pc5               = pc5
)]

# check
# The demographics table built just above keeps `age` but has no `dob` column,
# so checks written against `$dob` see NULL and always pass. Validate `age`,
# the column that actually exists, against the lower bound (37) named in the
# original message.
stopifnot("Failed to parse some ages" = all(!is.na(data_files$demog$age)))
stopifnot("some ages indicate cohort age <37, is this right?" = all(data_files$demog$age >= 37L))

### Self-report illness codes to long

In [None]:
# Identify the self-reported illness code columns and their year columns, and
# give each set a uniform type so the melt below does not mix types.
self_col_names     <- names(data_files$self)
self_rep_code_cols <- self_col_names[grepl("self_rep_ill_[0-9]+", self_col_names)]
self_rep_year_cols <- self_col_names[grepl("self_rep_ill_year_[0-9]+", self_col_names)]
data_files$self[, (self_rep_code_cols) := lapply(.SD, as.character), .SDcols = self_rep_code_cols]
data_files$self[, (self_rep_year_cols) := lapply(.SD, as.numeric),   .SDcols = self_rep_year_cols]

# Wide -> long: one row per participant and reported illness, pairing each code
# column with its year column.
data_files$self_illness <- data.table::melt(
    data_files$self,
    id.vars       = "eid",
    measure.vars  = patterns("self_rep_ill_[0-9]+", "self_rep_ill_year_[0-9]+"),
    variable.name = "element",
    value.name    = c("code", "year"),
    na.rm         = TRUE
)

# unknown / prefer not to answer
data_files$self_illness <- data_files$self_illness[!(year == -1 | year == -3)]

# Turn the (possibly fractional) year into a date: 1 January of the whole year
# plus the fractional part expressed in days.
data_files$self_illness[, date := {
    whole_year <- floor(year)
    lubridate::ymd(paste0(as.character(whole_year), "-01-01")) +
        lubridate::days(as.integer(365.25 * (year - whole_year)))
}]
data_files$self_illness[, c("year", "element") := NULL]
data_files$self_illness[, code := as.character(code)]
data_files$self_illness[, code_type := "ukbb_self_reported_illness"]

# check self report illness table
stopifnot(
    "unable to parse dates for self-reported illness codes" = all(!is.na(data_files$self_illness$date)),
    "are you sure something happened before 1900?" = all(data_files$self_illness$date > as.Date("1900-01-01"))
)

### Self-report medication codes to long

In [None]:
# Identify the self-reported medication code columns and make them character.
smed_col_names     <- names(data_files$smed)
self_med_code_cols <- smed_col_names[grepl("self_rep_med_[0-9]+", smed_col_names)]
data_files$smed[, (self_med_code_cols) := lapply(.SD, as.character), .SDcols = self_med_code_cols]

# Wide -> long: one row per participant and reported medication.
data_files$smed <- data.table::melt(
    data_files$smed,
    id.vars       = "eid",
    measure.vars  = patterns("self_rep_med_[0-9]+"),
    variable.name = "element",
    value.name    = "code",
    na.rm         = TRUE
)

# No date is available for these records, so `date` is a missing Date; the
# remaining columns mirror the layout of the other long tables.
data_files$smed[, date := as.Date(NA_real_)]
data_files$smed[, element := NULL]
data_files$smed[, code := as.character(code)]
data_files$smed[, code_type := "ukbb_self_reported_medication"]

### Inpatient diagnosis codes

In [None]:
# Source column -> code_type label for the two diagnosis coding systems.
diag_code_types <- c(diag_icd9 = "icd9", diag_icd10 = "icd10")

# Fall back to the admission date where the episode start date is missing,
# then date every diagnosis with the start of its episode.
data_files$hesin[is.na(epistart) | epistart == "", epistart := admidate]
data_files$diag[data_files$hesin, date := as.Date(i.epistart), on = c("eid", "ins_index")]

# Treat empty strings as missing so the melt below drops them.
for (diag_col in names(diag_code_types)) {
    data_files$diag[get(diag_col) == "", (diag_col) := NA_character_]
}

# Wide -> long: one row per participant, date and diagnosis code.
data_files$diag <- data.table::melt(
    data_files$diag,
    id.vars       = c("eid", "date"),
    measure.vars  = names(diag_code_types),
    variable.name = "code_type",
    value.name    = "code",
    na.rm         = TRUE
)
data_files$diag[, code_type := unname(diag_code_types[as.character(code_type)])]

### GP activity data

In [None]:
# Source column -> code_type label for the two Read code versions.
gp_code_types <- c(read_2 = "read2", read_3 = "read3")

# Treat empty strings as missing so the melt below drops them.
for (gp_col in names(gp_code_types)) {
    data_files$gp[get(gp_col) == "", (gp_col) := NA_character_]
}

# Wide -> long: one row per participant, date and clinical code.
data_files$gp <- data.table::melt(
    data_files$gp,
    id.vars       = c("eid", "date"),
    measure.vars  = names(gp_code_types),
    variable.name = "code_type",
    value.name    = "code",
    na.rm         = TRUE
)
data_files$gp[, code_type := unname(gp_code_types[as.character(code_type)])]

### GP medication data

In [None]:
# Source column -> code_type label for the prescription coding systems.
med_code_types <- c(read_2 = "read_med", bnf_code = "bnf", dmd_code = "dmd")

# Treat empty strings as missing in the two text code columns, and make the
# dm+d code character so all three code columns share a type.
for (med_col in c("read_2", "bnf_code")) {
    data_files$med[get(med_col) == "", (med_col) := NA_character_]
}
data_files$med[, dmd_code := as.character(dmd_code)]

# Wide -> long: one row per participant, date and prescription code.
data_files$med <- data.table::melt(
    data_files$med,
    id.vars       = c("eid", "date"),
    measure.vars  = names(med_code_types),
    variable.name = "code_type",
    value.name    = "code",
    na.rm         = TRUE
)
data_files$med[, code_type := unname(med_code_types[as.character(code_type)])]

### Combine all codes and annotate concepts

In [None]:
# Stack the long-format code tables (columns matched by name, as rbind() does
# for data.tables) ...
long_tables <- list(data_files$self_illness,
                    data_files$smed,
                    data_files$diag,
                    data_files$gp,
                    data_files$med)
combined <- rbindlist(long_tables, use.names = TRUE)

# ... and attach the code-list annotation to every record; records whose
# code / code_type pair is not in `codes` are kept with missing annotation.
combined <- codes[combined, on = c("code", "code_type"), allow.cartesian = TRUE]

### Save all ICD-10 codes

In [None]:
# One row per participant: every distinct ICD-10 code recorded for them,
# joined into a single ";"-separated string.
all_icd10_codes <- combined[code_type == "icd10",
                            .(all_icd10_codes = paste(unique(code), collapse = ";")),
                            by = eid]

# Attach that string to the demographics table (missing where none recorded).
data_files$demog[all_icd10_codes,
                 all_icd10_codes := i.all_icd10_codes,
                 on = "eid"]

### Annotate individuals with code:code_type counts

In [None]:
combined <- combined[!is.na(concept)] # discard non-annotated codes
combined[, concept := tolower(gsub(" ","_",concept))] # rename

# Wide table: one row per participant and one column per concept / code_type
# pair, holding that participant's matching codes joined with ";".
d <- dcast(combined, eid ~ concept + code_type, value.var = "code",
           fun.aggregate = function(x) paste(x, collapse = ";"))

# Flag ("1", otherwise "") the rows of `dt` that have a non-empty, non-missing
# value in at least one column whose name matches `pattern`.
# Works column-by-column instead of row-wise apply(): no matrix coercion of the
# table, no per-row function call, and a pattern that matches no column simply
# yields "" for every row.
flag_concept <- function(dt, pattern) {
    hit <- rep(FALSE, nrow(dt))
    for (col in grep(pattern, names(dt), value = TRUE)) {
        vals <- dt[[col]]
        hit  <- hit | (!is.na(vals) & vals != "")
    }
    ifelse(hit, "1", "")
}

d[, heart_failure := flag_concept(d, "^heart_failure")]
d[, loop_diuretic := flag_concept(d, "^loop_diuretic")]

### Combine with full cohort

In [None]:
# Columns taken from the demographics table; every other column of `d` is a
# phenotype measure.
base_cols <- c("eid", "age", "sex", "ethnicity", "ethnicity_group",
               "genetic_sex", "genetic_ethnicity", paste0("pc", 1:5),
               "all_icd10_codes")
measure_cols <- setdiff(names(d), "eid")

# Keep every participant in the demographics table, with or without codes.
cohort <- d[data_files$demog[, mget(base_cols)], on = "eid"]
#head(cohort)

# Participants without an entry in `d` get "" rather than NA in the measures.
for (measure in measure_cols) {
    set(cohort, i = which(is.na(cohort[[measure]])), j = measure, value = "")
}
setcolorder(cohort, base_cols)
head(cohort)

# Flag ("1" / "") participants that have at least one record in each source.
linked_sources <- list(any_gp     = data_files$gp,
                       any_gp_med = data_files$med,
                       any_hes    = data_files$diag)
for (flag in names(linked_sources)) {
    cohort[linked_sources[[flag]][, "eid"], (flag) := "1", on = "eid"]
    cohort[is.na(get(flag)), (flag) := ""]
}

# Summary: cohort size, then the number of participants with a non-empty value
# in each measure / flag column.
measure_cols <- setdiff(names(cohort), base_cols)
n_non_empty <- vapply(
    measure_cols,
    function(col) {
        vals <- cohort[[col]]
        sum(!is.na(vals) & vals != "")
    },
    integer(1),
    USE.NAMES = FALSE
)
summary <- data.table(name = c("total", measure_cols),
                      N    = c(nrow(cohort), n_non_empty))
head(summary, 100)

fwrite(summary, file = "hermes_furosemide_phenotype_summary.tsv", sep = "\t")

# write out
fwrite(cohort, file = "hermes_furosemide_phenotypes.tsv.gz", sep = "\t")

In [None]:
### Copy output to project

In [None]:
# Upload the summary and phenotype files to the project folder, capturing the
# command's output so it can be echoed into the notebook.
upload_cmd <- "dx upload hermes_furosemide_phenotype_summary.tsv hermes_furosemide_phenotypes.tsv.gz --destination hermes_furosemide_data"
o <- system(upload_cmd, intern = TRUE)
cat(o, sep = "\n")