# Benchmarks

## Initialize

In [5]:
#library(Rmisc)
library(dtplyr)
library(tidyverse)
library(glue)
library(arrow)
library(patchwork)
library(data.table)
library("jsonlite")
library(ggthemes)

In [6]:
# Pick the storage root from the host name: hosts whose node name contains
# the literal substring "sc" use the first root, all others the second.
base_path <- if (grepl("sc", Sys.info()[["nodename"]], fixed = TRUE)) {
  "/sc-projects/sc-proj-ukb-cvd"
} else {
  "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
}
print(base_path)

# Project-level directories derived from the storage root.
project_label <- "22_retina_phewas_220603_fullrun"
project_path <- glue("{base_path}/results/projects/{project_label}")
figure_path <- glue("{project_path}/figures")
output_path <- glue("{project_path}/data")

# Directory of the experiment run analysed in this notebook.
experiment <- "220603_fullrun"
experiment_path <- glue("{output_path}/{experiment}")

[1] "/sc-projects/sc-proj-ukb-cvd"


In [7]:
# Endpoint metadata for the experiment. `phecode` is read as character, and
# the ICD10_only / phecode_top / leaf columns are dropped before converting
# the result to a tibble.
endpoints_md <- glue("{experiment_path}/endpoints.csv") %>%
  fread(colClasses = c("phecode" = "character")) %>%
  select(-c(ICD10_only, phecode_top, leaf)) %>%
  as_tibble()

# Sorted vector of all endpoint identifiers.
endpoints <- sort(endpoints_md[["endpoint"]])

In [8]:
# Current local date as "YYYY-MM-DD"; used below to build the benchmark
# file name. Uncomment the override to load the results of a fixed day.
today <- format(Sys.Date(), "%Y-%m-%d")
# today <- "2022-07-05"

In [10]:
# Load the C-index benchmark table written for the date in `today`.
name <- glue("benchmarks_cindex_{today}")
benchmark_endpoints <- glue("{experiment_path}/{name}.feather") %>%
  arrow::read_feather()
# (disabled) %>% left_join(endpoint_defs)

In [13]:
# Keep the two scores being compared, spread them into one column per score,
# and compute the C-index difference (Age+Sex+Retina minus Age+Sex).
benchmark_clean <- benchmark_endpoints %>%
  filter(score %in% c("Age+Sex", "Age+Sex+Retina")) %>%
  pivot_wider(names_from = score, values_from = cindex) %>%
  mutate(delta = .data[["Age+Sex+Retina"]] - .data[["Age+Sex"]])

# Per-endpoint median of both scores and of their difference.
benchmark_agg <- benchmark_clean %>%
  group_by(endpoint) %>%
  summarise(
    across(c(`Age+Sex+Retina`, `Age+Sex`, delta), median)
  )

In [14]:
# Attach the per-endpoint benchmark medians to the endpoint metadata.
# The join key is given explicitly so the join cannot silently switch to a
# different set of shared columns if either table gains a column, and so
# dplyr no longer has to guess (and announce) the key.
# (disabled) mutate(endpoint = as.character(endpoint)) before the join
do_md <- endpoints_md %>%
  left_join(benchmark_agg, by = "endpoint") %>%
  as_tibble()

Joining, by = "endpoint"



In [15]:
## cardio endpoints with interventions

# Raise the notebook's table display limits (repr options).
options(
  repr.matrix.max.rows = 600,
  repr.matrix.max.cols = 200
)

# Endpoints whose phecode matches "438", ordered by descending frequency.
do_md %>%
  filter(str_detect(phecode, "438")) %>%
  arrange(desc(freq))

# Alternative filters / orderings kept for interactive exploration:
#   filter(str_detect(phecode_category, "Cardio"))
#   filter(str_detect(phecode_string, "aneurysm"))
#   filter(!str_detect(phecode, "\\."))
#   filter(freq > 0.001)
#   filter(str_detect(phecode_string, "Embolism"))
#   sample_n(10)
#   filter(delta > 0.05)
#   arrange(desc(delta))
#   arrange(desc(ratio))

V1,endpoint,eligable,n,freq,phecode,phecode_string,phecode_category,sex,Age+Sex+Retina,Age+Sex,delta
<int>,<chr>,<int>,<int>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>
578,phecode_438,61065,945,0.015475313,438.0,Aneurysm or ectasia,Cardio,Both,0.6289684,0.6281651,0.0008188171
579,phecode_438-1,61179,667,0.010902434,438.1,Aortic aneurysm and ectasia,Cardio,Both,0.6524617,0.6516767,0.0028204549
580,phecode_438-11,61217,448,0.007318229,438.11,Abdominal aortic aneurysm,Cardio,Both,0.6279994,0.6271213,0.0003593716
581,phecode_438-12,61240,179,0.002922926,438.12,Thoracic aneurysm,Cardio,Both,0.6965395,0.6993533,0.0024544229


In [16]:
# Distinct phecode categories present in the joined table.
do_md %>%
  pull(phecode_category) %>%
  unique()

In [17]:
# Hand-picked endpoints in the order they should be presented.
# Trailing notes mark endpoints tagged "intervention" by the authors.
endpoint_selection_order <- c(
  # first group
  "Hypertension",                                   # intervention
  "Diabetes mellitus",                              # intervention
  "Atrial fibrillation",                            # intervention
  "Ischemic heart disease",
  "Myocardial infarction [Heart attack]",           # intervention
  "Cerebral infarction [Ischemic stroke]",
  "Heart failure",                                  # intervention
  "Pneumonia",                                      # intervention
  "Chronic obstructive pulmonary disease [COPD]",   # intervention
  "Chronic kidney disease",                         # intervention
  "Cardiac arrest",                                 # intervention
  "All-Cause Death",                                # intervention

  # second group
  "Aortic stenosis",                                # intervention
  "Mitral valve insufficiency",
  "Endocarditis",
  "Pulmonary embolism",                             # intervention
  "Abdominal aortic aneurysm",
  "Rheumatic fever and chronic rheumatic heart diseases",

  # third group
  "Back pain",                                      # intervention
  "Anemia",                                         # intervention
  "Rheumatoid arthritis",                           # new + intervention
  "Psoriasis",                                      # interesting
  "Parkinson's disease (Primary)",
  "Suicide ideation and attempt or self harm"       # intervention
)

# Restrict the joined table to the selected endpoints and order the rows like
# `endpoint_selection_order`. The first sort (numeric phecode) runs before the
# sort on the factor, exactly as in the original two-step ordering.
endpoint_selection <- do_md %>%
  filter(phecode_string %in% endpoint_selection_order) %>%
  mutate(
    phecode_string = factor(phecode_string, levels = endpoint_selection_order)
  ) %>%
  arrange(as.numeric(phecode)) %>%
  arrange(phecode_string)
endpoint_selection

V1,endpoint,eligable,n,freq,phecode,phecode_string,phecode_category,sex,Age+Sex+Retina,Age+Sex,delta
<int>,<chr>,<int>,<int>,<dbl>,<chr>,<fct>,<chr>,<chr>,<dbl>,<dbl>,<dbl>
499,phecode_401,48533,11114,0.228998826,401.0,Hypertension,Cardio,Both,0.5535163,0.5518167,0.0019350253
185,phecode_202,57936,4259,0.073512151,202.0,Diabetes mellitus,Endo,Both,0.580673,0.5697015,0.0098697034
534,phecode_416-21,60476,2707,0.044761558,416.21,Atrial fibrillation,Cardio,Both,0.6022074,0.6004052,0.0012951202
505,phecode_404,58791,4055,0.068973142,404.0,Ischemic heart disease,Cardio,Both,0.6418495,0.6388546,0.0031841236
506,phecode_404-1,60213,1800,0.029893877,404.1,Myocardial infarction [Heart attack],Cardio,Both,0.6721406,0.6682512,0.0045172227
569,phecode_431-11,60849,1157,0.019014281,431.11,Cerebral infarction [Ischemic stroke],Cardio,Both,0.6925608,0.6937889,-0.0017950705
557,phecode_424,60846,1996,0.032804128,424.0,Heart failure,Cardio,Both,0.6613509,0.6548336,0.0066551176
626,phecode_468,59697,3199,0.053587282,468.0,Pneumonia,Resp,Both,0.5895169,0.5841629,0.0055658121
640,phecode_474,60402,2464,0.040793351,474.0,Chronic obstructive pulmonary disease [COPD],Resp,Both,0.6267931,0.6129847,0.0123509432
804,phecode_583,59363,3264,0.054983744,583.0,Chronic kidney disease,Genitourinary,Both,0.5856528,0.585135,0.0014269716


In [18]:
# Print the selected endpoint ids, one single-quoted id per line, ready to be
# pasted elsewhere. writeLines() is used instead of cat(): cat() separates its
# elements with a space, which put a stray leading space on every line after
# the first.
writeLines(paste0("'", endpoint_selection$endpoint, "'"))

'phecode_401'
 'phecode_202'
 'phecode_416-21'
 'phecode_404'
 'phecode_404-1'
 'phecode_431-11'
 'phecode_424'
 'phecode_468'
 'phecode_474'
 'phecode_583'
 'phecode_420'
 'OMOP_4306655'
 'phecode_413-21'
 'phecode_413-11'
 'phecode_410-2'
 'phecode_440-3'
 'phecode_438-11'
 'phecode_400'
 'phecode_718'
 'phecode_164'
 'phecode_705-1'
 'phecode_665'
 'phecode_324-11'
 'phecode_284'


In [19]:
# Distinct phecode categories present in the joined table.
do_md %>%
  pull(phecode_category) %>%
  unique()

In [20]:
# Endpoints whose delta is below 0.02, smallest delta first.
do_md %>%
  filter(delta < 0.02) %>%
  arrange(delta)

# Optional filters kept for interactive exploration:
#   filter(leaf == 0)
#   filter(str_detect(phecode_string, "ardio"))
#   filter(phecode_category == "Resp")

V1,endpoint,eligable,n,freq,phecode,phecode_string,phecode_category,sex,Age+Sex+Retina,Age+Sex,delta
<int>,<chr>,<int>,<int>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>
394,phecode_366-1,61112,113,0.001849064,366.1,Pterygium of eye,Eye,Both,0.3504817,0.4125555,-0.081169190
340,phecode_337-8,61219,169,0.002760581,337.8,Polyneuropathy in diseases classified elsewhere,Neuro,Both,0.6297963,0.7040628,-0.079718580
113,phecode_139-52,61215,117,0.001911296,139.52,Lipoma of intrathoracic organs,Neoplasms,Both,0.3403283,0.4173101,-0.065696462
451,phecode_381-1,61202,132,0.002156792,381.1,Paralytic strabismus [Neurogenic strabismus],Eye,Both,0.5829130,0.6323353,-0.053401728
454,phecode_385,61248,186,0.003036834,385,Abnormal results of function studies of eye,Eye,Both,0.5547616,0.5941478,-0.049246899
736,phecode_525-1,60962,225,0.003690824,525.1,Celiac disease,GI,Both,0.4909854,0.5360803,-0.044395669
138,phecode_162,61227,151,0.002466232,162,Aplastic anemia,Blood,Both,0.5851693,0.6239057,-0.042995815
769,phecode_542-3,61223,153,0.002499061,542.3,Hepatic failure,GI,Both,0.6214744,0.6629501,-0.041705205
1156,phecode_841-5,61219,105,0.001715154,841.5,Allergy to serum and vaccine,Signs/Symptoms,Both,0.5233329,0.5701381,-0.040527439
836,phecode_596-5,61132,127,0.002077472,596.5,Neuromuscular dysfunction of bladder,Genitourinary,Both,0.5747661,0.6198214,-0.040512146
