# HERMES 3.0 UK Biobank Phenotyping

### Libraries

In [2]:
system("pip3 show dxpy", intern = TRUE)
library(glue)
library(data.table)

### Project & dataset IDs

In [3]:
projectid <- "project-GvZyZ20J81vgPJGbJy8pgpyq"
rid       <- "record-Gvb0Bg0Jfxfv0q8Fb2pXqKjg"
dataset   <- glue("{projectid}:{rid}")
cat("Project record ID:", dataset, "\n")

## Data extraction

### Download data dictionary

In [23]:
setwd("/opt/notebooks")
cmd <- glue("dx extract_dataset {dataset} -ddd")
system(cmd)
dict_files <- list.files(pattern="codings|data_dictionary|entity_dictionary")
data_dict_file <- dict_files[grepl("data_dictionary", dict_files)]

#### Data dictionary filter function

In [24]:
filter_data_dict <- function(dict_path, codes_str) {
    
    d <- lapply(codes_str, function(x) {
        
        if (x$search=="matches") {
            data_dict[entity==x$entity & name==x$name]
        } else if (x$search=="startswith") {
            data_dict[entity==x$entity & grepl(paste0("^", x$name), name)]
        }
        
    }) |> rbindlist(idcol = "item")
    
    d[, field_id := paste(entity, name, sep=".")]
    
    return(d)
}

### Extract participant data

In [25]:
codes_to_extract = list(eid                = list(name="eid",       entity="participant", search="matches"),
                        reason_lost_fu     = list(name="p190",      entity="participant", search="matches"),
                        sex                = list(name="p31",       entity="participant", search="matches"),
                        age                = list(name="p21022",    entity="participant", search="matches"),
                        ethnicity          = list(name="p21000",    entity="participant", search="matches"),
                        genetic_sex        = list(name="p22001",    entity="participant", search="matches"),
                        genetic_ethnicity  = list(name="p22006",    entity="participant", search="matches"),
                        pc1                = list(name="p22009_a1", entity="participant", search="matches"),
                        pc2                = list(name="p22009_a2", entity="participant", search="matches"),
                        pc3                = list(name="p22009_a3", entity="participant", search="matches"),
                        pc4                = list(name="p22009_a4", entity="participant", search="matches"),
                        pc5                = list(name="p22009_a5", entity="participant", search="matches"),
                        self_rep_ill       = list(name="p20002",    entity="participant", search="startswith"),
                        self_rep_ill_year  = list(name="p20008",    entity="participant", search="startswith"),
                        self_rep_proc      = list(name="p20004",    entity="participant", search="startswith"),
                        self_rep_proc_year = list(name="p20010",    entity="participant", search="startswith"))

filt_data_dict = filter_data_dict(data_dict_file, codes_to_extract)
head(filt_data_dict, 3)

item,entity,name,type,primary_key_type,coding_name,concept,description,folder_path,is_multi_select,is_sparse_coding,linkout,longitudinal_axis_type,referenced_entity_field,relationship,title,units,field_id
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>,<chr>
eid,participant,eid,string,global,,,,Participant Information,,,,,,,Participant ID,,participant.eid
sex,participant,p31,integer,,data_coding_9,,,Population characteristics > Baseline characteristics,,,http://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=31,,,,Sex,,participant.p31
age,participant,p21022,integer,,,,,Population characteristics > Baseline characteristics,,,http://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=21022,,,,Age at recruitment,years,participant.p21022


In [15]:
cmd <- glue("dx extract_dataset {dataset} --fields {paste(filt_data_dict$field_id[1:5], collapse=',')} --delimiter ',' -o data_participant.csv")
system(cmd)

In [None]:
data_df <- fread("cohort_data.csv")
head(data_df, 3)

### Extract HES inpatient data

In [30]:
codes_to_extract = list(eid                = list(name="eid",       entity="hesin", search="matches"),
                        ins_index          = list(name="ins_index", entity="hesin", search="matches"),
                        epistart           = list(name="epistart",  entity="hesin", search="matches"),
                        admidate           = list(name="admidate",  entity="hesin", search="matches"))

filt_data_dict = filter_data_dict(data_dict_file, codes_to_extract)
head(filt_data_dict, 4)

item,entity,name,type,primary_key_type,coding_name,concept,description,folder_path,is_multi_select,is_sparse_coding,linkout,longitudinal_axis_type,referenced_entity_field,relationship,title,units,field_id
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>,<chr>
eid,hesin,eid,string,,,,,,,,,,participant:eid,many_to_one,Participant ID,,hesin.eid
ins_index,hesin,ins_index,integer,,,,,,,,,,,,Instance index,,hesin.ins_index
epistart,hesin,epistart,date,,,,,,,,,,,,Episode start date,,hesin.epistart
admidate,hesin,admidate,date,,,,,,,,,,,,Date of admission to hospital,,hesin.admidate


In [31]:
cmd <- glue("dx extract_dataset {dataset} --fields {paste(filt_data_dict$field_id, collapse=',')} --delimiter ',' -o data_hesin.csv")
system(cmd)

### Extract HES diagnoses

In [34]:
codes_to_extract = list(eid                = list(name="eid",       entity="hesin_diag", search="matches"),
                        ins_index          = list(name="ins_index", entity="hesin_diag", search="matches"),
                        epistart           = list(name="epistart",  entity="hesin_diag", search="matches"),
                        admidate           = list(name="admidate",  entity="hesin_diag", search="matches"))

filt_data_dict = filter_data_dict(data_dict_file, codes_to_extract)
head(filt_data_dict, 4)

item,entity,name,type,primary_key_type,coding_name,concept,description,folder_path,is_multi_select,is_sparse_coding,linkout,longitudinal_axis_type,referenced_entity_field,relationship,title,units,field_id
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>,<chr>
eid,hesin_diag,eid,string,,,,,,,,,,,,Participant ID,,hesin_diag.eid
ins_index,hesin_diag,ins_index,integer,,,,,,,,,,,,Instance index,,hesin_diag.ins_index
epistart,hesin_diag,diag_icd9,string,,data_coding_87,,,,,,,,,,Diagnoses - ICD9,,hesin_diag.diag_icd9
admidate,hesin_diag,diag_icd10,string,,data_coding_19,,,,,,,,,,Diagnoses - ICD10,,hesin_diag.diag_icd10


In [35]:
cmd <- glue("dx extract_dataset {dataset} --fields {paste(filt_data_dict$field_id, collapse=',')} --delimiter ',' -o data_hesin_diag.csv")
system(cmd)

### Extract HES procedures

In [None]:
codes_to_extract = list(eid                = list(name="eid",       entity="hesin_oper", search="matches"),
                        ins_index          = list(name="ins_index", entity="hesin_oper", search="matches"),
                        epistart           = list(name="oper3",     entity="hesin_oper", search="matches"),
                        admidate           = list(name="oper4",     entity="hesin_oper", search="matches"))

In [None]:
cmd <- glue("dx extract_dataset {dataset} --fields {paste(filt_data_dict$field_id, collapse=',')} --delimiter ',' -o data_hesin_oper.csv")
system(cmd)

## Data processing