# HERMES 3.0 UK Biobank Phenotyping

### Libraries

In [87]:
system("pip3 show dxpy", intern = TRUE)
library(glue)
library(data.table)

### Project & dataset IDs

In [88]:
projectid <- "project-GvZyZ20J81vgPJGbJy8pgpyq"
rid       <- "record-Gvb0Bg0Jfxfv0q8Fb2pXqKjg"
dataset   <- glue("{projectid}:{rid}")
cat("Project record ID:", dataset, "\n")

Project record ID: project-GvZyZ20J81vgPJGbJy8pgpyq:record-Gvb0Bg0Jfxfv0q8Fb2pXqKjg 


## Data extraction

### Download data dictionary

In [89]:
setwd("/opt/notebooks")
cmd <- glue("dx extract_dataset {dataset} -ddd")
system(cmd)
dict_files <- list.files(pattern="codings|data_dictionary|entity_dictionary")
data_dict_file <- dict_files[grepl("data_dictionary", dict_files)]

#### Data dictionary filter function

In [None]:
extract_data <- function(dataset, fields, output) {
    cmd <- glue::glue("dx extract_dataset {dataset} --fields {paste(fields, collapse=',')} --delimiter ',' -o {output}")
    system(cmd, intern = FALSE, wait = TRUE, ignore.stdout = FALSE, ignore.stderr = FALSE)
}

filter_data_dict <- function(dict_path, codes_str) {
    
    d <- lapply(codes_str, function(x) {
        
        if (x$search=="matches") {
            data_dict[entity==x$entity & name==x$name]
        } else if (x$search=="startswith") {
            data_dict[entity==x$entity & grepl(paste0("^", x$name), name)]
        }
        
    }) |> rbindlist(idcol = "item")
    
    d[, field_id := paste(entity, name, sep=".")]
    
    return(d)
}

### Extract participant data

In [104]:
participant_codes= list(eid                = list(name="eid",       entity="participant", search="matches"),
                        reason_lost_fu     = list(name="p190",      entity="participant", search="matches"),
                        sex                = list(name="p31",       entity="participant", search="matches"),
                        age                = list(name="p21022",    entity="participant", search="matches"),
                        ethnicity          = list(name="p21000",    entity="participant", search="matches"),
                        genetic_sex        = list(name="p22001",    entity="participant", search="matches"),
                        genetic_ethnicity  = list(name="p22006",    entity="participant", search="matches"),
                        pc1                = list(name="p22009_a1", entity="participant", search="matches"),
                        pc2                = list(name="p22009_a2", entity="participant", search="matches"),
                        pc3                = list(name="p22009_a3", entity="participant", search="matches"),
                        pc4                = list(name="p22009_a4", entity="participant", search="matches"),
                        pc5                = list(name="p22009_a5", entity="participant", search="matches"))

                        # self_rep_ill       = list(name="p20002",    entity="participant", search="startswith"),
                        # self_rep_ill_year  = list(name="p20008",    entity="participant", search="startswith"),
                        # self_rep_proc      = list(name="p20004",    entity="participant", search="startswith"),
                        # self_rep_proc_year = list(name="p20010",    entity="participant", search="startswith"))

filt_data_dict = filter_data_dict(data_dict_file, participant_codes)
head(filt_data_dict, 3)

item,entity,name,type,primary_key_type,coding_name,concept,description,folder_path,is_multi_select,is_sparse_coding,linkout,longitudinal_axis_type,referenced_entity_field,relationship,title,units,field_id
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>,<chr>
eid,participant,eid,string,global,,,,Participant Information,,,,,,,Participant ID,,participant.eid
reason_lost_fu,participant,p190,integer,,data_coding_1965,,,Population characteristics > Ongoing characteristics,,,http://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=190,,,,Reason lost to follow-up,,participant.p190
sex,participant,p31,integer,,data_coding_9,,,Population characteristics > Baseline characteristics,,,http://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=31,,,,Sex,,participant.p31


In [105]:
extract_data(dataset, fields = filt_data_dict$field_id, output = "data_participant.csv")

### Extract self-report illness data

In [None]:
self_report_codes= list(# self_rep_ill       = list(name="p20002",    entity="participant", search="startswith"),
                        # self_rep_ill_year  = list(name="p20008",    entity="participant", search="startswith"),
                        # self_rep_proc      = list(name="p20004",    entity="participant", search="startswith"),
                        # self_rep_proc_year = list(name="p20010",    entity="participant", search="startswith"))

filt_data_dict = filter_data_dict(data_dict_file, participant_codes)
head(filt_data_dict, 3)

### Extract HES inpatient data

In [83]:
hesin_to_extract = list(eid                = list(name="eid",       entity="hesin", search="matches"),
                        ins_index          = list(name="ins_index", entity="hesin", search="matches"),
                        epistart           = list(name="epistart",  entity="hesin", search="matches"),
                        admidate           = list(name="admidate",  entity="hesin", search="matches"))

filt_data_dict = filter_data_dict(data_dict_file, hesin_to_extract)
head(filt_data_dict, 4)

item,entity,name,type,primary_key_type,coding_name,concept,description,folder_path,is_multi_select,is_sparse_coding,linkout,longitudinal_axis_type,referenced_entity_field,relationship,title,units,field_id
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>,<chr>
eid,hesin,eid,string,,,,,,,,,,participant:eid,many_to_one,Participant ID,,hesin.eid
ins_index,hesin,ins_index,integer,,,,,,,,,,,,Instance index,,hesin.ins_index
epistart,hesin,epistart,date,,,,,,,,,,,,Episode start date,,hesin.epistart
admidate,hesin,admidate,date,,,,,,,,,,,,Date of admission to hospital,,hesin.admidate


In [97]:
extract_data(dataset, fields = filt_data_dict$field_id, output = "data_hesin.csv")

### Extract HES diagnoses

In [34]:
hesdiag_to_extract=list(eid                = list(name="eid",       entity="hesin_diag", search="matches"),
                        ins_index          = list(name="ins_index", entity="hesin_diag", search="matches"),
                        epistart           = list(name="epistart",  entity="hesin_diag", search="matches"),
                        admidate           = list(name="admidate",  entity="hesin_diag", search="matches"))

filt_data_dict = filter_data_dict(data_dict_file, hesdiag_to_extract)
head(filt_data_dict, 4)

item,entity,name,type,primary_key_type,coding_name,concept,description,folder_path,is_multi_select,is_sparse_coding,linkout,longitudinal_axis_type,referenced_entity_field,relationship,title,units,field_id
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>,<chr>
eid,hesin_diag,eid,string,,,,,,,,,,,,Participant ID,,hesin_diag.eid
ins_index,hesin_diag,ins_index,integer,,,,,,,,,,,,Instance index,,hesin_diag.ins_index
epistart,hesin_diag,diag_icd9,string,,data_coding_87,,,,,,,,,,Diagnoses - ICD9,,hesin_diag.diag_icd9
admidate,hesin_diag,diag_icd10,string,,data_coding_19,,,,,,,,,,Diagnoses - ICD10,,hesin_diag.diag_icd10


In [35]:
extract_data(dataset, fields = filt_data_dict$field_id, output = "data_hesin_diag.csv")

### Extract HES procedures

In [37]:
hesproc_to_extract=list(eid                = list(name="eid",       entity="hesin_oper", search="matches"),
                        ins_index          = list(name="ins_index", entity="hesin_oper", search="matches"),
                        epistart           = list(name="oper3",     entity="hesin_oper", search="matches"),
                        admidate           = list(name="oper4",     entity="hesin_oper", search="matches"))
filt_data_dict = filter_data_dict(data_dict_file, hesproc_to_extract)
head(filt_data_dict, 4)

item,entity,name,type,primary_key_type,coding_name,concept,description,folder_path,is_multi_select,is_sparse_coding,linkout,longitudinal_axis_type,referenced_entity_field,relationship,title,units,field_id
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>,<chr>
eid,hesin_oper,eid,string,,,,,,,,,,,,Participant ID,,hesin_oper.eid
ins_index,hesin_oper,ins_index,integer,,,,,,,,,,,,Instance index,,hesin_oper.ins_index
epistart,hesin_oper,oper3,string,,data_coding_259,,,,,,,,,,Operative procedures - OPCS3,,hesin_oper.oper3
admidate,hesin_oper,oper4,string,,data_coding_240,,,,,,,,,,Operative procedures - OPCS4,,hesin_oper.oper4


In [38]:
extract_data(dataset, fields = filt_data_dict$field_id, output = "data_hesin_oper.csv")

## Data processing

### Read in the heRmes codes
ICD-9/10 coding is provided but we need to add the self reported codes from the UK-BB too.

In [None]:
codes <- fread(file.path("heRmes", "inst", "extdata", "hermes_3_codes", "hermes_3_codes.tsv"))
self_reported_codes <- list(
  list(name = "Heart Failure",                      code = "1076", code_type = "ukbb_self_reported_illness"),
  list(name = "Myocardial infarction",              code = "1075", code_type = "ukbb_self_reported_illness"),
  list(name = "Hypertrophic cardiomyopathy",        code = "1588", code_type = "ukbb_self_reported_illness"),
  list(name = "Coronary artery bypass grafting",    code = "1095", code_type = "ukbb_self_reported_procedure"),
  list(name = "Percutaneous coronary intervention", code = "1070", code_type = "ukbb_self_reported_procedure")
)
codes <- rbind(codes,
               data.table(Concept     = paste0(sapply(self_reported_codes, function(x) x$name), " Self Reported"),
                          Code        = sapply(self_reported_codes, function(x) x$code),
                          Source      = sapply(self_reported_codes, function(x) x$code_type),
                          Description = sapply(self_reported_codes, function(x) x$name)))
codes[, `:=`(code      = Code,
             code_type = fcase(Source=="ICD10", "icd10",
                               Source=="ICD9",  "icd9",
                               Source=="OPCS4", "opcs4",
                               Source=="ukbb_self_reported_illness", "ukbb_self_reported_illness",
                               Source=="ukbb_self_reported_procedure", "ukbb_self_reported_procedure"))]
codes <- codes[!is.na(code_type)]                                               

### Read in the extracted data

In [84]:
rename_cols <- function(d, code_struc) {
    for (col in names(code_struc)) {
        if (code_struc[[col]]$search=="matches") {
            setnames(d, paste(code_struc[[col]]$entity, code_struc[[col]]$name, sep="."), col)
        } else if (code_struc[[col]]$search=="matches") {
            matches   <- names(d)[grepl(paste0("^", code_struc[[col]]$entity, ".", code_struc[[col]]$name), names(d))]
            new_names <- paste0(col, "_", 1:length(matches))
            setnames(d, matches, new_names)
        }
    }
    return(d)
}

In [106]:
demog <- fread("data_participant.csv")
# hesin <- fread("data_hesin.csv")
# diag  <- fread("data_hesin_diag.csv")
# oper  <- fread("data_hesin_oper.csv")

demog <- rename_cols(demog, participant_codes)
# hesin <- rename_cols(hesin, hesin_to_extract)
# diag  <- rename_cols(diag,  hesdiag_to_extract)
# oper  <- rename_cols(oper,  hesproc_to_extract)

head(demog, 1)
# head(hesin, 1)
# head(diag,  1)
# head(oper,  1)

eid,reason_lost_fu,sex,age,genetic_sex,genetic_ethnicity,pc1,pc2,pc3,pc4,pc5
<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1000061,,1,57,1,1,-13.7016,2.07305,-1.47433,2.385,4.26327


### Clean up the cohort data

In [86]:
ethnicity_codes <- list(
  white                 = 1,
  british               = 1001,
  white_black_caribbean =	2001,
  indian                = 3001,
  caribbean             = 4001,
  mixed                 = 2,
  irish                 =	1002,
  white_black_african	  = 2002,
  pakistani             = 3002,
  african	              = 4002,
  asian_or_asian_british=	3,
  any_other_white       =	1003,
  white_asian           =	2003,
  bangladeshi           =	3003,
  any_other_black       =	4003,
  black_or_black_british=	4,
  any_other_mixed       =	2004,
  any_other_asian       =	3004,
  chinese               =	5,
  other_ethnic_group    = 6)

cohort <- demog[, list(eid               = eid,
                       age               = as.integer(age),
                       sex               = factor(sex, levels = 0:1, labels = c("female", "male")),
                       ethnicity         = factor(ethnicity, levels = unlist(ethnicity_codes), labels = names(ethnicity_codes)),
                       ethnicity_group   = factor(sub("([0-9])00[0-9]", "\\1", ethnicity), levels = unlist(ethnicity_codes), labels = names(ethnicity_codes)),
                       genetic_sex       = factor(genetic_sex, levels = 0:1, labels = c("female", "male")),
                       genetic_ethnicity = factor(genetic_ethnicity, levels = 1, labels = c("caucasian")))]
cohort[demog, paste0("pc", 1:5) := mget(paste0("pc", 1:5)), on = "eid"]

# check
stopifnot("Failed to parse some date of births" = all(!is.na(cohort$dob)))
stopifnot("some ages / dob indicate cohort age <37, is this right?" = all(cohort$dob <= as.Date("1972-01-01")))

ERROR: Error in eval(jsub, SDenv, parent.frame()): object 'ethnicity' not found


### Self-report illness codes to long

In [None]:
self_rep_code_regex <- "20002-[0-9]+\\.[0-9]+"
self_rep_year_regex <- "20008-[0-9]+\\.[0-9]+"
self_rep_code_cols <- grep(self_rep_code_regex, names(demog), value = TRUE)
self_rep_year_cols <- grep(self_rep_year_regex, names(demog), value = TRUE)
demog[, (self_rep_code_cols) := lapply(.SD, as.character), .SDcols = self_rep_code_cols]
demog[, (self_rep_year_cols) := lapply(.SD, as.numeric),   .SDcols = self_rep_year_cols]
self_rep_illness <- data.table::melt(demog,
                                     id.vars = "eid",
                                     measure = patterns(self_rep_code_regex, self_rep_year_regex),
                                     variable.name = "element",
                                     value.name = c("code", "year"),
                                     na.rm = TRUE)
self_rep_illness <- self_rep_illness[year != -1 & year != -3] # unknown / prefer not to answer
self_rep_illness[, `:=`(date      = lubridate::ymd(paste0(as.character(floor(year)), "-01-01")) + lubridate::days(as.integer(365.25 * (year - floor(year)))),
                        year      = NULL,
                        element   = NULL,
                        code      = as.character(code),
                        code_type = "ukbb_self_reported_illness")]

# check self report illness table
stopifnot("unable to parse dates for self-reported illness codes" = all(!is.na(self_rep_illness$date)))
stopifnot("are you sure something happened before 1900?" = all(self_rep_illness$date > as.Date("1900-01-01")))

### Self-report procedure codes to long

In [None]:
self_rep_proc_code_regex <- "20004-[0-9]+\\.[0-9]+"
self_rep_proc_year_regex <- "20010-[0-9]+\\.[0-9]+"
self_rep_proc_code_cols <- grep(self_rep_proc_code_regex, names(self_oper), value = TRUE)
self_rep_proc_year_cols <- grep(self_rep_proc_year_regex, names(self_oper), value = TRUE)
self_oper[, (self_rep_proc_code_cols) := lapply(.SD, as.character), .SDcols = self_rep_proc_code_cols]
self_oper[, (self_rep_proc_year_cols) := lapply(.SD, as.numeric),   .SDcols = self_rep_proc_year_cols]
self_rep_oper <- data.table::melt(self_oper,
                                  id.vars = "eid",
                                  measure = patterns(self_rep_proc_code_regex, self_rep_proc_year_regex),
                                  variable.name = "element",
                                  value.name = c("code", "year"),
                                  na.rm = TRUE)
self_rep_oper <- self_rep_oper[year != -1 & year != -3] # unknown / prefer not to answer
self_rep_oper[, `:=`(date      = lubridate::ymd(paste0(as.character(floor(year)), "-01-01")) + lubridate::days(as.integer(365.25 * (year - floor(year)))),
                     year      = NULL,
                     element   = NULL,
                     code      = as.character(code),
                     code_type = "ukbb_self_reported_procedure")]

# check self report illness table
stopifnot("unable to parse dates for self-reported procedure codes" = all(!is.na(self_rep_oper$date)))
stopifnot("are you sure something happened before 1900?" = all(self_rep_oper$date > as.Date("1900-01-01")))

### Inpatient diagnosis codes

In [None]:
hesin[is.na(epistart) | epistart == "", epistart := admidate]
diag[hesin, date := as.Date(i.epistart), on = c("eid(participant - eid)", "ins_index")]
diag[, eid := `eid(participant - eid)`]
diag[diag_icd9 == "", diag_icd9 := NA_character_]
diag[diag_icd10 == "", diag_icd10 := NA_character_]
diag <- data.table::melt(diag,
                         id.vars = c("eid", "date"),
                         measure.vars  = c("diag_icd9", "diag_icd10"),
                         variable.name = "code_type",
                         value.name = "code",
                         na.rm = TRUE)
diag[, code_type := data.table::fcase(code_type == "diag_icd9", "icd9",
                                      code_type == "diag_icd10", "icd10")]

### Inpatient procedure codes

In [None]:
oper[hesin, date := as.Date(i.epistart), on = c("eid(participant - eid)", "ins_index")]
oper[, eid := `eid(participant - eid)`]
oper[oper3 == "", oper3 := NA_character_]
oper[oper4 == "", oper4 := NA_character_]
oper <- data.table::melt(oper,
                         id.vars = c("eid", "date"),
                         measure.vars  = c("oper3", "oper4"),
                         variable.name = "code_type",
                         value.name = "code",
                         na.rm = TRUE)
oper[, code_type := data.table::fcase(code_type == "oper3", "opcs3",
                                      code_type == "oper4", "opcs4")]

### Combine all codes
Keep only unique codes per individuals at the code's first occurance.

In [None]:
combined <- rbind(self_rep_illness, self_rep_oper, diag, oper)
combined <- codes[combined, on = c("code" = "code", "code_type" = "code_type"), allow.cartesian = TRUE]
combined <- combined[!is.na(Concept)]
combined <- combined[combined[, .I[which.min(date)], by = c("eid", "Concept")]$V1]

### Annotate the cohort with the codes

In [None]:
concepts <- unique(codes$Concept)
for (g in concepts) {

  col_name <- tolower(gsub(" ", "_", gsub("[()]","",g)))
  cohort[combined[Concept == g], paste0(col_name, c("", "_first_date")) := list(TRUE, as.Date(i.date)), on = "eid"]
  cohort[is.na(get(col_name)), (col_name) := FALSE]

}

### Remove withdrawals

In [None]:
cohort <- cohort[withdraw, withdrawal := TRUE, on = c("eid" = "V1")]
cohort[is.na(withdrawal), withdrawal := FALSE]