In [19]:
%load_ext autoreload
%autoreload 2

import os
from tqdm.auto import tqdm
import pathlib
import datetime
import subprocess
import numpy as np
import pandas as pd
import lifelines

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
# Pick the storage root from the host we are running on. The hostname comes
# from the standard library rather than the `!hostname` shell escape, so the
# cell is plain Python and does not depend on IPython.
hostname = os.uname().nodename
node = [hostname]  # one-element list, kept so existing `node[0]` lookups still work
if "sc" in hostname:  # substring match on the host name, as before
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else:
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

# Project directory layout below the storage root.
project_label = "22_retina_phewas"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

# One sub-directory of the data folder per experiment.
experiment = '221108'
experiment_path = f"{output_path}/{experiment}"

# Create the output directories if they do not exist yet.
for directory in (figure_path, output_path, experiment_path):
    pathlib.Path(directory).mkdir(parents=True, exist_ok=True)

name_dict = {
    "predictions_cropratio0.66": "ConvNextSmall(Retina)+MLP_cropratio0.66",
}

partitions = list(range(22))
partitions

/sc-projects/sc-proj-ukb-cvd


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

In [21]:
# Date stamp embedded in the names of files written by this notebook
# (presumably YYMMDD -- confirm).
today = '221109'

# Read all predictions

In [41]:

# Baseline outcomes table, indexed by `eid`.
data_outcomes = pd.read_feather(f"{output_path}/baseline_outcomes_220627.feather").set_index("eid")

#endpoints_md = pd.read_csv(f"{experiment_path}/endpoints.csv")
#endpoints = sorted(endpoints_md.endpoint.to_list())

# The endpoint list is read from the sibling project `22_retinal_risk`. The path
# is built from `base_path` (instead of a hard-coded /sc-projects prefix) so it
# resolves on either cluster.
endpoints_csv = f"{base_path}/results/projects/22_retinal_risk/data/220602/endpoints.csv"
all_endpoints = sorted(
    name.replace('_prevalent', '') for name in pd.read_csv(endpoints_csv).endpoint.values
)

# Endpoints to leave out; the list is currently empty, so every endpoint is kept.
endpoints_not_overlapping_with_preds = []
endpoints = [c for c in all_endpoints if c not in endpoints_not_overlapping_with_preds]

# Phecode definitions restricted to the selected endpoints, sorted and indexed
# by endpoint id.
endpoint_defs = (
    pd.read_feather(f"{output_path}/phecode_defs_220306.feather")
    .query("endpoint==@endpoints")
    .sort_values("endpoint")
    .set_index("endpoint")
)


In [43]:
# Sanity check: (rows = selected endpoints with a definition, columns of the definition table).
endpoint_defs.shape

(1171, 7)

In [44]:
# phecodes = ['phecode_979', 'phecode_202']
# Endpoint ids to load predictions for: every endpoint in the definition table.
phecodes = endpoint_defs.index.values.tolist()
# Model labels; they appear in the prediction file names and become the
# prediction column names.
models = ['Age+Sex', 'Age+Sex+Retina']

In [45]:
# Collect the `Ft_10` prediction column of every model for every endpoint.
# For one endpoint, the per-partition files of a model are stacked row-wise and
# the models are then joined column-wise on the (eid, endpoint) index.
all_preds = []
for phecode in tqdm(phecodes):
    model_preds = []
    for model in models:
        partition_frames = []
        # Iterate over the configured partitions instead of a second hard-coded range.
        for partition in partitions:
            preds = pd.read_feather(
                f'{experiment_path}/coxph/predictions'
                f'/{phecode}_{model}_ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.66_{partition}.feather'
            )
            # Name the prediction column after the model so the models can sit side by side.
            partition_frames.append(
                preds[['eid', 'endpoint', 'Ft_10']].rename({'Ft_10': model}, axis=1)
            )
        model_preds.append(
            pd.concat(partition_frames, axis=0).set_index(['eid', 'endpoint'])
        )

    # Default (inner) merge on the index keeps only rows present for every model.
    preds = model_preds[0]
    for other_model_preds in model_preds[1:]:
        preds = preds.merge(other_model_preds, left_index=True, right_index=True)
    all_preds.append(preds)

  0%|          | 0/1171 [00:00<?, ?it/s]

In [46]:
# Stack the per-endpoint frames into one long frame.
# NOTE: this rebinds `all_preds` from a list of frames to a single DataFrame,
# so the cell cannot be re-run on its own; re-run the loading loop first.
all_preds = pd.concat(all_preds, axis=0)

In [47]:
# Preview the first rows of the combined predictions.
all_preds.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Age+Sex,Age+Sex+Retina
eid,endpoint,Unnamed: 2_level_1,Unnamed: 3_level_1
1475840,OMOP_4306655,0.115656,0.04905
1475895,OMOP_4306655,0.017572,0.010447
1475910,OMOP_4306655,0.015858,0.009131
1475935,OMOP_4306655,0.029305,0.013193
1476077,OMOP_4306655,0.078034,0.037716


In [48]:
# Relative risk: prediction of the retina-augmented model divided by the
# Age+Sex prediction. A small constant is added to the denominator so that a
# zero baseline prediction does not divide by zero.
epsilon = 1e-6
baseline_risk = all_preds['Age+Sex'].add(epsilon)
all_preds['relative_risks'] = all_preds['Age+Sex+Retina'].div(baseline_risk)

In [49]:
# Reshape to wide format: one row per eid, one column per endpoint.
relative_risks = all_preds.reset_index().pivot(
    index='eid',
    columns='endpoint',
    values='relative_risks',
)

In [54]:
# Sanity check: (number of eids, number of endpoints).
relative_risks.shape

(61256, 1171)

In [52]:
# Persist the wide table. `eid` is moved from the index back into a column
# first, because feather files cannot store a non-default index.
relative_risks.reset_index().to_feather(f'{experiment_path}/relative_risks_{today}.feather')

In [53]:
# Display the full path the relative-risk table is written to.
f'{experiment_path}/relative_risks_{today}.feather'

'/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108/relative_risks_221109.feather'

In [10]:
# Scratch check: load one prediction file (endpoint phecode_979, model Age+Sex,
# partition 0) to inspect its layout.
single_prediction_file = (
    f'{experiment_path}/coxph/predictions'
    f'/phecode_979_Age+Sex_ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.66_0.feather'
)
preds = pd.read_feather(single_prediction_file)


In [13]:
# Preview the first rows of the single prediction file.
preds.head()

Unnamed: 0,eid,endpoint,Age+Sex
0,1475840,phecode_979,0.003617
1,1475895,phecode_979,0.001436
2,1475910,phecode_979,0.001417
3,1475935,phecode_979,0.00153
4,1476077,phecode_979,0.003437


In [14]:
# Dummy frame for trying out the merge: same rows as `preds`, with the
# prediction column swapped for a constant `TEST` column.
x = preds.drop(columns='Age+Sex').assign(TEST=0)

In [18]:
# Scratch check of the index-based merge: join the dummy frame `x` onto the
# predictions via the shared (eid, endpoint) index. The original line ended in
# a dangling `.` (a syntax error); the call is completed to match the displayed
# result, which has the columns `Age+Sex` and `TEST`.
preds.set_index(['eid', 'endpoint']).merge(
    x.set_index(['eid', 'endpoint']), left_index=True, right_index=True
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Age+Sex,TEST
eid,endpoint,Unnamed: 2_level_1,Unnamed: 3_level_1
1475840,phecode_979,0.003617,0
1475895,phecode_979,0.001436,0
1475910,phecode_979,0.001417,0
1475935,phecode_979,0.001530,0
1476077,phecode_979,0.003437,0
...,...,...,...
1917524,phecode_979,0.001611,0
1917552,phecode_979,0.001550,0
1917606,phecode_979,0.001347,0
1917621,phecode_979,0.003025,0


In [5]:
# Reload the baseline outcomes and keep only the columns named
# `<endpoint>_event` whose endpoint is in the selected list.
event_suffix = "_event"
selected_endpoints = set(endpoints)  # set membership instead of scanning the list per column
data_outcomes = pd.read_feather(f"{output_path}/baseline_outcomes_220627.feather").set_index("eid")
data_outcomes = data_outcomes[
    [
        c for c in data_outcomes.columns
        # Require the suffix at the end of the name, matching the slice that strips it.
        if c.endswith(event_suffix) and c[:-len(event_suffix)] in selected_endpoints
    ]
]

In [6]:
# Baseline records table, indexed by `eid`.
data_records = pd.read_feather(f"{output_path}/baseline_records_220627.feather").set_index("eid")

In [7]:
# Keep only the record columns whose name contains "OMOP_".
omop_columns = [c for c in tqdm(data_records.columns.to_list()) if "OMOP_" in c]
data_records = data_records[omop_columns]

  0%|          | 0/73871 [00:00<?, ?it/s]

In [8]:
# Column names of `data_records` as a plain list.
records = data_records.columns.to_list()

In [9]:
# Attach the outcome columns to the records by matching on the `eid` index.
# The left join keeps every row of `data_records`, even without an outcome row.
data_all = data_records.merge(data_outcomes, left_index=True, right_index=True, how="left")

In [10]:
# Eligibility table and a lookup from endpoint to its `eid_list` entry.
eligable_eids = pd.read_feather(f"{output_path}/eligable_eids_2022-07-01.feather")
eids_dict = dict(zip(eligable_eids["endpoint"], eligable_eids["eid_list"]))

In [11]:
# Column sums of the record table, largest first, restricted to records with
# a total of at least 50.
record_totals = data_records.sum().sort_values(ascending=False)
record_freqs = record_totals[record_totals >= 50]
record_freqs

OMOP_4081598    307739
OMOP_4052351    270116
OMOP_4061103    263319
OMOP_4144272    247882
OMOP_4057411    221203
                 ...  
OMOP_4039277        50
OMOP_4116240        50
OMOP_4050692        50
OMOP_4209141        50
OMOP_4171619        50
Length: 15595, dtype: int64

In [4]:
# Endpoint metadata of this experiment. `phecode` is read as character
# (via colClasses) rather than being parsed as a number.
endpoints_md = fread(glue("{experiment_path}/endpoints.csv"), colClasses=c("phecode"="character"))
# Sorted vector of all endpoint ids in the metadata.
endpoints = sort(endpoints_md$endpoint)

In [5]:
# Phecode definitions, sorted by endpoint id.
endpoint_defs = arrow::read_feather(glue("{output_path}/phecode_defs_220306.feather")) %>% arrange(endpoint)

In [6]:
# Endpoint ids used for filtering the tables below. Entries that are commented
# out are not part of the selection.
endpoint_selection = c(
    # generally very important
    "phecode_202",    # Diabetes mellitus
    "phecode_404",    # Ischemic heart disease
    "phecode_404-1",  # Myocardial infarction [Heart attack]
    "phecode_431-11", # Cerebral infarction [Ischemic stroke]
    "phecode_424",    # Heart failure
    "OMOP_4306655",   # All-Cause Death

    # also generally important and relevant
    # "phecode_440-3", # Pulmonary embolism
    # "phecode_468-1", # Viral pneumonia
    "phecode_401",    # Hypertension
    "phecode_460-2",  # Acute lower respiratory infection
    "phecode_468",    # Pneumonia
    "phecode_474",    # Chronic obstructive pulmonary disease [COPD]
    "phecode_542",    # Chronic liver disease and sequelae
    "phecode_583",    # Chronic kidney disease
    "phecode_328",    # Dementias and cerebral degeneration

    # generally important and fun to check
    "phecode_164",    # Anemia
    # "phecode_726-1", # Osteoporosis
    "phecode_286-2",  # Major depressive disorder
    "phecode_103",    # Malignant neoplasm of the skin
    "phecode_101",    # Malignant neoplasm of the digestive organs
    # "phecode_665", # Psoriasis
    "phecode_121",    # Leukemia
    "phecode_705-1",  # Rheumatoid arthritis

    # important for eye
    "phecode_371",    # Cataract
    # "phecode_374-3", # Retinal vascular changes and occlusions
    "phecode_374-42", # Diabetic retinopathy
    "phecode_374-5",  # Macular degeneration
    "phecode_375-1"   # Glaucoma
    # "phecode_388" # Blindness and low vision
)

# Common endpoints.
endpoints_common = c(
    "phecode_164",    # Anemia
    "phecode_705-1",  # Rheumatoid arthritis
    "phecode_328",    # Dementias and cerebral degeneration
    "phecode_328-1",  # Alzheimer's disease
    "phecode_401",    # Hypertension
    "phecode_202",    # Diabetes mellitus
    "phecode_416-21", # Atrial fibrillation
    "phecode_404-1",  # Myocardial infarction [Heart attack]
    "phecode_424",    # Heart failure
    "phecode_468",    # Pneumonia
    "phecode_474",    # Chronic obstructive pulmonary disease [COPD]
    "phecode_583",    # Chronic kidney disease
    "OMOP_4306655"    # All-Cause Death
)

# Cardiovascular endpoints.
endpoints_cardio = c(
    "phecode_438-11", # Abdominal aortic aneurysm
    "phecode_440-3",  # Pulmonary embolism (intervention)
    "phecode_413-21", # Aortic stenosis (intervention)
    "phecode_400"     # Rheumatic fever and chronic rheumatic heart diseases
)

# Eye endpoints.
endpoints_eye = c(
    "phecode_374-5",  # Macular degeneration
    "phecode_374-51", # Age-related macular degeneration
    "phecode_374-42", # Diabetic retinopathy
    "phecode_371",    # Cataract
    "phecode_388",    # Blindness and low vision
    "phecode_367-5",  # Uveitis
    "phecode_389-1"   # Ocular pain
)

In [7]:
# Display name per endpoint: the phecode string, with shorter labels
# substituted for the entries listed below.
endpoint_defs = endpoint_defs %>%
    mutate(
        name = case_when(
            phecode_string == "Myocardial infarction [Heart attack]" ~ "Myocardial infarction",
            phecode_string == "Cerebral infarction [Ischemic stroke]" ~ "Ischemic stroke",
            phecode_string == "Chronic obstructive pulmonary disease [COPD]" ~ "Chronic obstructive pulmonary disease",
            phecode_string == "Mitral valve insufficiency" ~ "Mitral insufficiency",
            phecode_string == "Parkinson's disease (Primary)" ~ "Parkinson's disease",
            phecode_string == "Suicide ideation and attempt or self harm" ~ "Suicide attempt",
            phecode_string == "Ischemic heart disease" ~ "Coronary heart disease",
            phecode_string == "Chronic kidney disease" ~ "Chronic kidney disease",
            phecode_string == "Rheumatic fever and chronic rheumatic heart diseases" ~ "Rheumatic heart disease",
            phecode_string == "Abdominal aortic aneurysm" ~ "Abdominal aortic aneurysm",
            # every other endpoint keeps its phecode string
            TRUE ~ phecode_string
        )
    )

# Named vector: endpoint id -> display name.
endpoint_map = setNames(endpoint_defs$name, endpoint_defs$endpoint)
#endpoint_order = (endpoint_defs %>% arrange(as.numeric(phecode)))$endpoint
# Endpoints are ordered as listed in the selection.
endpoint_order = endpoint_selection

In [8]:
# Display-only preview: the selected endpoint ids with every "-" replaced
# (presumably by ".", confirm against the output). The result is not assigned.
str_replace_all(endpoint_selection, "\\-", "\\.")

In [9]:
# Summary table of the selected endpoints, sorted by `n` in ascending order,
# with display names substituted for endpoint ids and `freq` expressed in percent.
endpoints_md %>%
    filter(endpoint %in% endpoint_selection) %>%
    as_tibble() %>%
    arrange(n) %>%
    mutate(
        endpoint = recode(endpoint, !!!endpoint_map),
        perc = freq * 100
    )

V1,endpoint,eligable,n,freq,phecode,phecode_string,phecode_category,sex,ICD10_only,phecode_top,leaf,perc
<int>,<chr>,<int>,<int>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<int>,<dbl>,<dbl>
86,Leukemia,61142,263,0.004301462,121.0,Leukemia,Neoplasms,Both,0.0,121.0,0.0,0.4301462
998,Rheumatoid arthritis,60634,820,0.013523766,705.1,Rheumatoid arthritis,Musc/Skel,Both,0.0,705.0,0.0,1.3523766
297,Dementias and cerebral degeneration,61233,968,0.015808469,328.0,Dementias and cerebral degeneration,Neuro,Both,0.0,328.0,0.0,1.5808469
568,Ischemic stroke,60849,1157,0.019014281,431.11,Cerebral infarction [Ischemic stroke],Cardio,Both,0.0,431.0,1.0,1.9014281
41,Malignant neoplasm of the digestive organs,60713,1652,0.027209988,101.0,Malignant neoplasm of the digestive organs,Neoplasms,Both,0.0,101.0,0.0,2.7209988
430,Glaucoma,60486,1664,0.027510498,375.1,Glaucoma,Eye,Both,0.0,375.0,0.0,2.7510498
421,Diabetic retinopathy,60400,1760,0.029139073,374.42,Diabetic retinopathy,Eye,Both,0.0,374.0,1.0,2.9139073
505,Myocardial infarction,60213,1800,0.029893877,404.1,Myocardial infarction [Heart attack],Cardio,Both,0.0,404.0,0.0,2.9893877
422,Macular degeneration,60918,1816,0.029810565,374.5,Macular degeneration,Eye,Both,0.0,374.0,0.0,2.9810565
556,Heart failure,60846,1996,0.032804128,424.0,Heart failure,Cardio,Both,0.0,424.0,0.0,3.2804128


In [10]:
#today = substr(Sys.time(), 0, 10) # YYYY-MM-DD
# Fixed date stamp; it is interpolated into the name of the
# `eligable_eids_long_{today}.feather` file that is read afterwards.
today = '220824'

In [11]:
# Eligibility rows for the selected endpoints. `endpoint` is converted to
# character and `eid` to numeric (via character), and every row is flagged
# with included = 1.
eligable_eids = arrow::read_feather(glue("{output_path}/eligable_eids_long_{today}.feather")) %>%
    filter(endpoint %in% endpoint_selection) %>%
    mutate(
        endpoint = as.character(endpoint),
        eid = as.numeric(as.character(eid)),
        included = 1
    )

“Coercing dictionary values to R character factor levels”


In [12]:
# Outcomes of the selected endpoints, left-joined with the eligibility rows on
# (eid, endpoint).
# The original call passed `as_data_frame=FALSE` inside glue(), where named
# arguments only act as temporary interpolation variables. It never reached
# read_feather(), so the file has always been read as a data frame. The stray
# argument is removed to make that explicit; the result is unchanged.
# NOTE(review): if a lazy Arrow table was intended, pass `as_data_frame = FALSE`
# to read_feather() itself and collect() before downstream use.
data_outcomes = arrow::read_feather(glue("{output_path}/baseline_outcomes_long_220627.feather")) %>%
    filter(endpoint %in% endpoint_selection) %>%
    left_join(eligable_eids, by=c("eid", "endpoint"))