In [1]:
#library(Rmisc)
library(dtplyr)
library(tidyverse)
library(glue)
library(arrow)
library(patchwork)
library(data.table)
library("jsonlite")
library(ggthemes)

“package ‘dtplyr’ was built under R version 4.0.3”
“package ‘tidyverse’ was built under R version 4.0.3”
── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.7     [32m✔[39m [34mdplyr  [39m 1.0.9
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.1

“package ‘ggplot2’ was built under R version 4.0.5”
“package ‘tibble’ was built under R version 4.0.5”
“package ‘tidyr’ was built under R version 4.0.5”
“package ‘readr’ was built under R version 4.0.5”
“package ‘purrr’ was built under R version 4.0.3”
“package ‘dplyr’ was built under R version 4.0.5”
“package ‘stringr’ was built under R version 4.0.5”
“package ‘forcats’ was built under R version 4.0.3”
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[

In [2]:
# Pick the project root from the host name: hosts whose node name contains
# "sc" use the /sc-projects mount, every other host uses the shared
# analysis mount.
base_path <- if (grepl("sc", Sys.info()[["nodename"]], fixed = TRUE)) {
  "/sc-projects/sc-proj-ukb-cvd"
} else {
  "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
}
print(base_path)

# Project-level directories, all derived from base_path.
project_label <- "22_retina_phewas"
project_path <- glue("{base_path}/results/projects/{project_label}")
figure_path <- glue("{project_path}/figures")
output_path <- glue("{project_path}/data")

# Experiment id; its outputs live in a sub-directory of output_path.
experiment <- "221108"
experiment_path <- glue("{output_path}/{experiment}")

[1] "/sc-projects/sc-proj-ukb-cvd"


In [18]:
# Directory holding the merged baseline dataset and its description table
# (hard-coded to the /sc-projects mount, independent of base_path).
dataset_path <- "/sc-projects/sc-proj-ukb-cvd/data/3_datasets_post/210714_metabolomics"

In [3]:
# Date tag interpolated into the eligibility file name that is read below.
today <- "221109"

In [4]:
# Font sizes shared by the plots in this notebook.
base_size <- 8
title_size <- 10
facet_size <- 10
geom_text_size <- 3

# Install the notebook-wide ggplot2 theme: classic theme with blank facet
# strips, left-aligned titles, thin axis lines/ticks, major grid lines and
# the legend placed at the bottom.
theme_set(
  theme_classic(base_size = base_size) +
    theme(
      strip.background = element_blank(),
      plot.title = element_text(size = title_size, hjust = 0),
      strip.text.x = element_text(size = facet_size),
      axis.title = element_text(size = 10),
      axis.text = element_text(size = 8, color = "black"),
      legend.position = "bottom",
      axis.line = element_line(size = 0.2),
      axis.ticks = element_line(size = 0.2),
      panel.grid.major = element_line()
    )
)

In [5]:
# Colour palette definitions, read from colors.json in the working directory.
colors_dict <- read_json("colors.json")

# Model label -> colour. Every active entry currently points at the same
# palette value (pastel / red / mid).
color_map <- c(
  "Age+Sex" = colors_dict$pastel$red$mid,
  "Retina" = colors_dict$pastel$red$mid,
  "Age+Sex+Retina" = colors_dict$pastel$red$mid,
  "MultiTaskSurvivalTraining_None_InceptionResnetV2+MLP_Identity" = colors_dict$pastel$red$mid
  # Disabled entries kept for reference:
  # "Identity(AgeSex)+MLP" = colors_dict$pastel$red$mid,
  # "Identity(Records)+MLP" = colors_dict$pastel$red$mid,
  # "GNN(Records)+MLP" = colors_dict$pastel$red$mid,
  # "Identity(AgeSex+Records)+MLP" = colors_dict$pastel$red$mid,
  # "GNN(AgeSex+Records)+MLP" = colors_dict$pastel$red$mid
)

In [6]:
# Load the phecode endpoint definitions and sort the rows by endpoint id.
endpoint_defs <- glue("{output_path}/phecode_defs_220306.feather") %>%
  arrow::read_feather() %>%
  arrange(endpoint)

In [7]:
# Lookup from endpoint id to its phecode description.
# Both objects below are reassigned further down in this notebook once the
# shortened display names have been added.
endpoint_map <- setNames(endpoint_defs$phecode_string, endpoint_defs$endpoint)

# Endpoint ids ordered by the numeric value of their phecode.
endpoint_order <- arrange(endpoint_defs, as.numeric(phecode))$endpoint

In [8]:
# Endpoints analysed in this notebook; the order of this vector is later
# reused as the endpoint ordering.
endpoint_selection <- c(
  # Generally very important
  "phecode_202",    # Diabetes mellitus
  "phecode_401",    # Hypertension
  "phecode_404",    # Ischemic heart disease
  "phecode_404-1",  # Myocardial infarction [Heart attack]
  "phecode_431-11", # Cerebral infarction [Ischemic stroke]
  "phecode_424",    # Heart failure

  "phecode_059-1",  # COVID 19
  "phecode_468",    # Pneumonia
  "phecode_474",    # Chronic obstructive pulmonary disease [COPD]

  "phecode_286-2",  # Major depressive disorder
  "phecode_324-11", # Parkinson's disease
  "phecode_328",    # Dementias and cerebral degeneration

  "phecode_164",    # Anemia
  "phecode_726-1",  # Osteoporosis
  "phecode_371",    # Cataract
  "phecode_374-42", # Diabetic retinopathy
  "phecode_374-5",  # Macular degeneration
  "phecode_375-1",  # Glaucoma

  "phecode_103",    # Malignant neoplasm of the skin
  "phecode_101",    # Malignant neoplasm of the digestive organs
  "phecode_102",    # Lung cancer

  "phecode_583",    # Chronic kidney disease
  "phecode_542",    # Chronic liver disease and sequelae
  "OMOP_4306655"    # All-Cause Death

  # Currently disabled candidates:
  #   also generally important and relevant
  #     "phecode_440-3"  Pulmonary embolism
  #     "phecode_468-1"  Viral pneumonia
  #     "phecode_460-2"  Acute lower respiratory infection
  #     "phecode_388"    Blindness and low vision
  #   generally important and fun to check
  #     "phecode_374-3"  Retinal vascular changes and occlusions
  #     "phecode_665"    Psoriasis
  #     "phecode_121"    Leukemia
  #   important for eye
  #     "phecode_705-1"  Rheumatoid arthritis
)

# Subset of common endpoints.
endpoints_common <- c(
  "phecode_164",    # Anemia
  "phecode_705-1",  # Rheumatoid arthritis
  "phecode_328",    # Dementias and cerebral degeneration
  "phecode_328-1",  # Alzheimer's disease
  "phecode_401",    # Hypertension
  "phecode_202",    # Diabetes mellitus
  "phecode_416-21", # Atrial fibrillation
  "phecode_404-1",  # Myocardial infarction [Heart attack]
  "phecode_424",    # Heart failure
  "phecode_468",    # Pneumonia
  "phecode_474",    # Chronic obstructive pulmonary disease [COPD]
  "phecode_583",    # Chronic kidney disease
  "OMOP_4306655"    # All-Cause Death
)
    
# Cardiovascular endpoints (first definition).
# NOTE: endpoints_cardio is assigned again, with a different set of
# endpoints, in a later cell of this notebook.
endpoints_cardio <- c(
  "phecode_438-11", # Abdominal aortic aneurysm
  "phecode_440-3",  # Pulmonary embolism (intervention)
  "phecode_413-21", # Aortic stenosis (intervention)
  "phecode_400"     # Rheumatic fever and chronic rheumatic heart diseases
)

# Eye-related endpoints.
endpoints_eye <- c(
  "phecode_374-5",  # Macular degeneration
  "phecode_374-51", # Age-related macular degeneration
  "phecode_374-42", # Diabetic retinopathy
  "phecode_371",    # Cataract
  "phecode_388",    # Blindness and low vision
  "phecode_367-5",  # Uveitis
  "phecode_389-1"   # Ocular pain
)

In [9]:
# Add a `name` column with display names: a handful of phecode descriptions
# get a shorter label, every other row keeps its phecode_string unchanged.
endpoint_defs <- endpoint_defs %>%
  mutate(
    name = case_when(
      phecode_string == "Myocardial infarction [Heart attack]" ~ "Myocardial infarction",
      phecode_string == "Cerebral infarction [Ischemic stroke]" ~ "Ischemic stroke",
      phecode_string == "Chronic obstructive pulmonary disease [COPD]" ~ "Chronic obstructive pulmonary disease",
      phecode_string == "Mitral valve insufficiency" ~ "Mitral insufficiency",
      phecode_string == "Parkinson's disease (Primary)" ~ "Parkinson's disease",
      phecode_string == "Suicide ideation and attempt or self harm" ~ "Suicide attempt",
      phecode_string == "Ischemic heart disease" ~ "Coronary heart disease",
      # The next entry maps the description onto itself.
      phecode_string == "Chronic kidney disease" ~ "Chronic kidney disease",
      phecode_string == "Rheumatic fever and chronic rheumatic heart diseases" ~ "Rheumatic heart disease",
      # Also an identity mapping.
      phecode_string == "Abdominal aortic aneurysm" ~ "Abdominal aortic aneurysm",
      TRUE ~ phecode_string
    )
  )

# Rebuild the endpoint id -> label lookup from the display names.
endpoint_map <- setNames(endpoint_defs$name, endpoint_defs$endpoint)

# Plot order follows the hand-curated selection rather than the numeric
# phecode order:
# endpoint_order <- (endpoint_defs %>% arrange(as.numeric(phecode)))$endpoint
endpoint_order <- endpoint_selection

In [10]:
# Cardiovascular endpoints; this assignment replaces the endpoints_cardio
# vector defined in an earlier cell.
endpoints_cardio <- c(
  "phecode_431-11", # Cerebral infarction [Ischemic stroke]
  "phecode_404",    # Ischemic heart disease
  "phecode_404-1",  # Myocardial infarction [Heart attack] (intervention)
  "phecode_424",    # Heart failure (intervention)
  "OMOP_4306655",   # All-Cause Death (intervention)
  "phecode_420"     # Cardiac arrest (intervention)
)

## Load data

In [11]:
# Long-format eligibility table restricted to the selected endpoints.
# `endpoint` is converted to character and `eid` to numeric (via character),
# and every remaining row is flagged with included = 1.
eligable_eids <- glue("{output_path}/eligable_eids_long_{today}.feather") %>%
  arrow::read_feather() %>%
  filter(endpoint %in% endpoint_selection) %>%
  mutate(
    endpoint = as.character(endpoint),
    eid = as.numeric(as.character(eid)),
    included = 1
  )

“Coercing dictionary values to R character factor levels”


In [12]:
# Baseline outcomes (long format) for the selected endpoints, with the
# eligibility flag attached; rows without a match in eligable_eids get
# included = NA from the left join.
#
# The original call passed `as_data_frame = FALSE` inside glue() because of a
# misplaced parenthesis. glue() only treats named arguments as substitution
# variables, so the flag never reached read_feather() and has been removed.
# The table is deliberately still read as a data frame: later cells use
# data_outcomes$eid and join this object against tibbles.
data_outcomes <- arrow::read_feather(
  glue("{output_path}/baseline_outcomes_long_220627.feather")
) %>%
  filter(endpoint %in% endpoint_selection) %>%
  left_join(eligable_eids, by = c("eid", "endpoint"))

In [13]:
# Partitions whose test-set predictions are collected, and the crop ratio
# that selects the prediction directory.
partitions <- 0:21
crop_ratio <- "0.66" # 0.3, 0.5, 0.8
partitions

# One test.feather path per partition. glue() is vectorised over
# `partitions`, so no loop (and no repeated c() growth) is needed;
# as.character() keeps `paths` a plain character vector as before.
# Alternative source kept for reference:
#   glue("{experiment_path}/loghs/Identity(Records)+MLP/{p}/test.feather")
paths <- as.character(glue(
  "{experiment_path}/loghs/ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio{crop_ratio}/{partitions}/test.feather"
))

In [14]:
# Read the per-partition prediction files, keep eid plus the selected
# endpoint columns, and stack them in long format with one row per
# (endpoint, eid) holding the predicted value `logh`.
#
# pivot_longer() now selects the columns with all_of(): passing the bare
# external character vector is ambiguous tidyselect usage (it triggered the
# "Using an external vector in selections" message).
# Warnings raised while reading each file are suppressed, as in the original.
predictions <- paths %>%
  map_df(
    ~ suppressWarnings(
      read_feather(., col_select = c("eid", all_of(endpoint_selection)))
    ) %>%
      pivot_longer(
        all_of(endpoint_selection),
        names_to = "endpoint",
        values_to = "logh"
      ) %>%
      mutate(eid = as.integer(as.character(eid))) %>%
      select(endpoint, eid, logh)
  ) # %>% arrange(endpoint, eid)
predictions %>% head()

Note: Using an external vector in selections is ambiguous.
[34mℹ[39m Use `all_of(endpoint_selection)` instead of `endpoint_selection` to silence this message.
[34mℹ[39m See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
[90mThis message is displayed once per session.[39m


endpoint,eid,logh
<chr>,<int>,<dbl>
phecode_202,1475840,-0.41421801
phecode_401,1475840,0.97795552
phecode_404,1475840,-0.3677305
phecode_404-1,1475840,0.04391166
phecode_431-11,1475840,2.29117942
phecode_424,1475840,0.49617302


In [15]:
# Attach outcomes to every prediction row.
# The original passed `on = c(eid, endpoint)`, which is not a dplyr join
# argument: it was swallowed by `...` and the call fell back to a natural
# join on the shared columns (endpoint, eid). The keys are now explicit.
pred_outcomes <- predictions %>%
  left_join(data_outcomes, by = c("endpoint", "eid")) %>%
  as_tibble()

[1m[22mJoining, by = c("endpoint", "eid")


In [26]:
# Keep only rows flagged as included and, within each endpoint, bucket the
# predicted logh into 100 rank-based groups (logh_perc).
logh_inc <- pred_outcomes %>%
  filter(included == 1) %>%
  group_by(endpoint) %>%
  mutate(logh_perc = ntile(logh, 100)) %>%
  ungroup() %>%
  as_tibble()

In [19]:
# Merged baseline dataset, limited to participants that occur in
# data_outcomes.
data <- glue("{dataset_path}/data_merged.feather") %>%
  arrow::read_feather() %>%
  filter(eid %in% unique(data_outcomes$eid))

# Companion description table stored next to the merged dataset.
data_description <- arrow::read_feather(
  glue("{dataset_path}/description.feather")
)

In [20]:
# Participant ids listed in the retina eid file (column `retina_eids`).
eids_with_retina <- read_csv("/sc-projects/sc-proj-ukb-cvd/data/retina/eids.csv")

“Missing column names filled in: 'X1' [1]”

[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
cols(
  X1 = [32mcol_double()[39m,
  retina_eids = [32mcol_double()[39m
)



In [22]:
# Restrict the baseline data to participants listed in eids_with_retina and
# set erectile_dysfunction to FALSE for rows where sex is "Female"; all other
# rows keep their recorded value.
data <- data %>%
  filter(eid %in% eids_with_retina$retina_eids) %>%
  mutate(
    erectile_dysfunction = case_when(
      sex == "Female" ~ FALSE,
      TRUE ~ erectile_dysfunction
    )
  )

In [23]:
# Convert the categorical covariates to factors and move the listed levels
# to the front, in the given order.
# across(all_of(...)) replaces the superseded mutate_at().
data <- data %>%
  mutate(
    across(
      all_of(c(
        "sex", "overall_health_rating", "smoking_status", "ethnic_background"
      )),
      as.factor
    )
  ) %>%
  mutate(
    sex = fct_relevel(sex, c("Male", "Female")),
    overall_health_rating = fct_relevel(
      overall_health_rating,
      c("Excellent", "Good", "Fair", "Poor")
    ),
    smoking_status = fct_relevel(
      smoking_status,
      c("Current", "Previous", "Never")
    )
  )

In [24]:
# Preview the first rows of the baseline data.
head(data)

eid,age_at_recruitment,sex,ethnic_background,townsend_deprivation_index_at_recruitment,date_of_attending_assessment_centre,uk_biobank_assessment_centre,birth_date,overall_health_rating,smoking_status,⋯,death_cvd_comp_event,death_cvd_comp_event_time,SCORE_comp_event,SCORE_comp_event_time,ASCVD_comp_event,ASCVD_comp_event_time,QRISK3_comp_event,QRISK3_comp_event_time,MACE_comp_event,MACE_comp_event_time
<int>,<dbl>,<fct>,<fct>,<dbl>,<date>,<chr>,<date>,<ord>,<ord>,⋯,<int>,<dbl>,<int>,<dbl>,<int>,<dbl>,<int>,<dbl>,<int>,<dbl>
1000128,50,Female,White,2.53193,2010-02-11,Sheffield,1960-02-11,Good,Previous,⋯,0,11.1102,0,11.1102,0,11.1102,0,11.1102,0,11.1102
1000206,43,Male,White,-4.16695,2010-05-22,Sheffield,1967-05-22,Good,Previous,⋯,0,10.83641,0,10.83641,0,10.83641,0,10.83641,0,10.83641
1000212,64,Female,White,-0.0607392,2010-06-23,Sheffield,1946-06-23,Good,Never,⋯,0,10.7488,0,10.7488,0,10.7488,0,10.7488,0,10.7488
1000270,68,Female,White,-2.39726,2010-05-24,Sheffield,1942-05-24,Good,Never,⋯,0,10.83094,0,10.83094,0,10.83094,0,10.83094,0,10.83094
1000355,42,Male,White,-1.18005,2010-02-08,Sheffield,1968-02-08,Good,Never,⋯,0,11.11841,0,11.11841,0,11.11841,0,11.11841,0,11.11841
1000475,52,Female,White,0.142374,2010-06-03,Sheffield,1958-06-03,Fair,Never,⋯,0,10.80356,0,10.80356,0,10.80356,0,10.80356,0,10.80356


In [30]:
# Persist the filtered baseline data for downstream use.
arrow::write_feather(x = data, sink = "./outputs/baseline_data.feather")

In [25]:
# Preview the first rows of the included predictions.
head(logh_inc)

endpoint,eid,logh,prev,event,time,included,logh_perc
<chr>,<dbl>,<dbl>,<lgl>,<lgl>,<dbl>,<dbl>,<int>
phecode_431-11,1475840,2.2911794,False,False,6.817389,1,6
phecode_424,1475840,0.496173,False,True,4.364134,1,5
phecode_059-1,1475840,1.981268,False,False,6.817389,1,4
phecode_468,1475840,-0.174394,False,False,6.817389,1,4
phecode_474,1475840,-0.3750526,False,False,6.817389,1,4
phecode_286-2,1475840,0.472837,False,False,6.817389,1,1


In [28]:
# Persist the included predictions together with their percentile buckets.
arrow::write_feather(logh_inc, "./outputs/logh_inc.feather")

# PLOT THE IMG GRID

In [None]:
# TODO(review): incomplete statement. The right-hand side of this assignment
# is missing, so this (never executed) cell does not parse on its own.
img = 

In [None]:
# Display `img` through image_ggplot() with interpolation turned off.
# NOTE(review): image_ggplot() does not appear to be provided by any package
# attached in this notebook's library cell. Presumably magick::image_ggplot;
# confirm and attach the package before running this cell.
image_ggplot(img, interpolate = FALSE)