## HERMES - furosemide phenotype

#### Clone HeRmes repository

In [5]:
#system("git clone https://github.com/nicksunderland/heRmes.git")

#### Pushing changes to Github

In [6]:
# in the terminal run
# cd /opt/notebooks
# dx download git_push.sh
# run bash git_push.sh 

#### Project & record ID

In [7]:
# Project and dataset record identifiers; they are joined as
# "{projectid}:{recordid}" to form the dataset id passed to the dx CLI.
projectid <- "project-GvZyZ20J81vgPJGbJy8pgpyq"
recordid  <- "record-Gvb0Bg0Jfxfv0q8Fb2pXqKjg"

#### Libraries

In [8]:
library(glue)
library(data.table)

## Extract GP data

#### Download data dictionary

In [9]:
# Work from the notebook directory; list.files() below searches the working directory
setwd("/opt/notebooks")

# Dataset id in "<project>:<record>" form
dataset <- glue("{projectid}:{recordid}")

# Ask the dx CLI to write out the dictionary files for this dataset
cmd <- glue("dx extract_dataset {dataset} -ddd")
system(cmd)

# Collect the dictionary files, then keep only the data dictionary path
dict_files     <- list.files(pattern = "codings|data_dictionary|entity_dictionary")
data_dict_file <- grep("data_dictionary", dict_files, value = TRUE)

#### Data dictionary filter function

In [10]:
#' @title filter_data_dict
#'
#' @description Read the data dictionary and keep only the rows describing the requested fields.
#'
#' @param dict_path, str, path to the dataset.data_dictionary.csv
#' @param codes_struc, list, named list of lists representing UKBB column name, table entity, and search strategy
#'   list(name=, entity=, search=). name must be a valid column name in the data_dictionary, entity a valid entity
#'   in the entity dictionary, and search either "matches" for exact matches, or "startswith" to match cases of
#'   multiple instances (repeated measures usually). For "startswith" the name is treated as a literal prefix,
#'   not as a regular expression.
#'
#' @returns a filtered subset of the data_dictionary with a leading `item` column holding the name of the
#'   `codes_struc` element that selected each row. Raises an error if a search strategy is not recognised or
#'   if a requested field has no rows in the data dictionary.
#'
filter_data_dict <- function(dict_path, codes_struc) {

    data_dict <- fread(dict_path)

    d <- lapply(codes_struc, function(x) {

        # Local copies, named so they cannot be mistaken for data dictionary
        # columns when evaluated inside data_dict[...]
        target_entity <- x$entity
        target_name   <- x$name

        if (x$search == "matches") {
            d0 <- data_dict[entity == target_entity & name == target_name]
        } else if (x$search == "startswith") {
            # literal prefix match; avoids misreading regex metacharacters in the field name
            d0 <- data_dict[entity == target_entity & startsWith(name, target_name)]
        } else {
            # previously an unknown strategy fell through to a misleading "not found" error
            stop("Unknown search strategy [", x$search, "] for code [", x$name,
                 "]; expected \"matches\" or \"startswith\"", call. = FALSE)
        }

        if (nrow(d0) == 0) {
            stop("Code [", x$name, "] not found in data dictionary for entity [",
                 x$entity, "]", call. = FALSE)
        }

        d0

    }) |> rbindlist(idcol = "item")

    return(d)
}

#### Data extraction function

In [11]:
#' @title extract_data
#'
#' @description Build and run a `dx run table-exporter` command for a set of fields from one entity.
#'
#' @param dataset, str, dataset id in the form "{projectid}:{recordid}"
#' @param fields, str, vector of UK-BB format column names e.g. p31; each one becomes its own -ifield_names flag
#' @param entity, str, string of length one - the entity the fields are extracted from e.g. participants
#' @param output, str, base name for the output file, no extension
#'
#' @returns NULL; called for its side effect of launching a table-exporter job with destination hermes3_data/.
#'   Whatever the dx command prints is echoed to the console.
#'
extract_data <- function(dataset, fields, entity, output) {

    # one -ifield_names="..." flag per requested field, separated by spaces
    field_str <- paste0('-ifield_names="', fields, '"', collapse=" ")

    # command template; the {placeholders} are filled in by glue() from the local variables
    cmd_template <- paste0(
      "dx run table-exporter ",
      "-idataset_or_cohort_or_dashboard={dataset} ",
      "-ioutput={output} ",
      "-ioutput_format=TSV ",
      "-iheader_style=FIELD-NAME ",
      "-icoding_option=RAW ",
      "{field_str} ",
      "-ientity={entity} ",
      "--destination hermes3_data/"
    )
    cmd <- glue(cmd_template)

    # capture the command's output and print it line by line
    job_output <- system(cmd, intern = TRUE)
    cat(job_output, sep = "\n")
}

#### Define GP clinical data

In [12]:
# Fields to take from the gp_clinical entity. The vector names become the names
# of the spec list (and the `item` column of the filtered dictionary); the values
# are the column names looked up in the data dictionary. All use exact matching.
gp_clinical_codes <- lapply(
    c(eid           = "eid",
      data_provider = "data_provider",
      date          = "event_dt",
      read_2        = "read_2",
      read_3        = "read_3",
      value1        = "value1",
      value2        = "value2",
      value3        = "value3"),
    function(field) list(name = field, entity = "gp_clinical", search = "matches")
)

gp_clinical_data_dict <- filter_data_dict(data_dict_file, gp_clinical_codes)
head(gp_clinical_data_dict, 3)

item,entity,name,type,primary_key_type,coding_name,concept,description,folder_path,is_multi_select,is_sparse_coding,linkout,longitudinal_axis_type,referenced_entity_field,relationship,title,units
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>
eid,gp_clinical,eid,string,,,,,,,,,,participant:eid,many_to_one,Participant ID,
data_provider,gp_clinical,data_provider,string,,data_coding_626,,,,,,,,,,Data provider,
date,gp_clinical,event_dt,date,,data_coding_819,,,,,yes,,,,,Date clinical code was entered,


#### Define GP medications data

In [13]:
# Fields to take from the gp_scripts entity. The vector names become the names
# of the spec list (and the `item` column of the filtered dictionary); the values
# are the column names looked up in the data dictionary. All use exact matching.
gp_medication_codes <- lapply(
    c(eid           = "eid",
      data_provider = "data_provider",
      date          = "issue_date",
      read_2        = "read_2",
      bnf_code      = "bnf_code",
      dmd_code      = "dmd_code",
      drug_name     = "drug_name",
      quantity      = "quantity"),
    function(field) list(name = field, entity = "gp_scripts", search = "matches")
)

gp_medication_data_dict <- filter_data_dict(data_dict_file, gp_medication_codes)
head(gp_medication_data_dict, 3)

item,entity,name,type,primary_key_type,coding_name,concept,description,folder_path,is_multi_select,is_sparse_coding,linkout,longitudinal_axis_type,referenced_entity_field,relationship,title,units
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>
eid,gp_scripts,eid,string,,,,,,,,,,participant:eid,many_to_one,Participant ID,
data_provider,gp_scripts,data_provider,string,,data_coding_626,,,,,,,,,,Data provider,
date,gp_scripts,issue_date,date,,data_coding_819,,,,,yes,,,,,Date prescription was issued,


#### Run Table-Exporter extraction

In [16]:
# Where the exported tables are expected to appear on the mounted project
data_file_paths <- list(
    gp_clinical   = "/mnt/project/hermes3_data/data_gp_clinical.tsv",
    gp_medication = "/mnt/project/hermes3_data/data_gp_medication.tsv"
)

# Launch an export only for tables that are not already present

if (!file.exists(data_file_paths[["gp_clinical"]])) {
    extract_data(
        dataset = dataset,
        fields  = gp_clinical_data_dict[["name"]],
        entity  = "gp_clinical",
        output  = "data_gp_clinical"
    )
}

if (!file.exists(data_file_paths[["gp_medication"]])) {
    extract_data(
        dataset = dataset,
        fields  = gp_medication_data_dict[["name"]],
        entity  = "gp_scripts",
        output  = "data_gp_medication"
    )
}

## Read in extracted data

In [None]:
# Read every extracted table into a named list, stopping with an explanatory
# error as soon as one of the expected files is missing.
data_files <- list()

for (i in seq_along(data_file_paths)) {
    f <- data_file_paths[[i]]
    n <- names(data_file_paths)[i]

    if (!file.exists(f)) {
        # single informative error; the folder name matches the paths in data_file_paths
        msg <- glue("File {basename(f)} not found, ",
                    "check the Monitor tab for the status of the Table-exporter ",
                    "and the 'hermes3_data' folder. If this has finished try ",
                    "launching another Notebook session/instance (I'm not sure why ",
                    "the mounted /mnt/project/ file structure doesn't refresh when ",
                    "files are added externally).")
        stop(msg, call. = FALSE)
    }

    # progress line, flushed so it shows before the (potentially slow) read starts
    cat(glue('...{n}: {f}\n'), sep="\n")
    flush.console()
    data_files[[n]] <- fread(f)
}

lapply(data_files, head, n = 5)

...gp_clinical: /mnt/project/hermes3_data/data_gp_clinical.tsv


## Rename columns

In [None]:
#' @title rename_cols
#'
#' @description Rename the columns of an extracted table to the element names of the code structure.
#'
#' @param d, data.table, the extracted table; its column names are changed in place by setnames()
#' @param code_struc, list, named list of lists list(name=, entity=, search=). For search "matches" the column
#'   called `name` is renamed to the element's name. For search "startswith" every column whose name begins with
#'   `name` (literal prefix, not a regular expression) is renamed to "<element name>_1", "<element name>_2", ...
#'   in column order. Elements with any other search value are left untouched.
#'
#' @returns the table `d` with renamed columns. Raises an error if a "startswith" prefix matches no column.
#'
rename_cols <- function(d, code_struc) {
    for (col in names(code_struc)) {
        spec <- code_struc[[col]]
        if (spec$search=="matches") {
            setnames(d, spec$name, col)
        } else if (spec$search=="startswith") {
            matches <- names(d)[startsWith(names(d), spec$name)]
            # 1:length(matches) gave c(1, 0) here and a confusing setnames() length error
            if (length(matches) == 0) {
                stop("No columns starting with [", spec$name, "] found to rename to [", col, "]",
                     call. = FALSE)
            }
            new_names <- paste0(col, "_", seq_along(matches))
            setnames(d, matches, new_names)
        }
    }
    return(d)
}

# Apply the renaming to each extracted table using its matching code structure.
# (The start of these two calls had been lost, leaving lines that did not parse.)
data_files$gp_clinical   <- rename_cols(data_files$gp_clinical,   code_struc=gp_clinical_codes)
data_files$gp_medication <- rename_cols(data_files$gp_medication, code_struc=gp_medication_codes)

lapply(data_files, head, n = 5)