In [1]:
%load_ext autoreload
%autoreload 2

import os
from tqdm.auto import tqdm
import pathlib
import datetime
import subprocess
import numpy as np
import pandas as pd
import lifelines

In [2]:
node = !hostname
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: 
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

project_label = "22_retina_phewas"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

pathlib.Path(figure_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

experiment = '221108'
experiment_path = f"{output_path}/{experiment}"
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

name_dict = {
    "predictions_cropratio0.66": "ConvNextSmall(Retina)+MLP_cropratio0.66",
}

partitions = [i for i in range(22)]
partitions

/sc-projects/sc-proj-ukb-cvd


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

In [3]:
today = '221109'

# Read all predictions

In [4]:

data_outcomes = pd.read_feather(f"{output_path}/baseline_outcomes_220627.feather").set_index("eid")

#endpoints_md = pd.read_csv(f"{experiment_path}/endpoints.csv")
#endpoints = sorted(endpoints_md.endpoint.to_list())
all_endpoints = sorted([l.replace('_prevalent', '') for l in list(pd.read_csv('/sc-projects/sc-proj-ukb-cvd/results/projects/22_retinal_risk/data/220602/endpoints.csv').endpoint.values)])
endpoints_not_overlapping_with_preds = []
endpoints = []
for c in all_endpoints:
    if c not in endpoints_not_overlapping_with_preds: 
        endpoints.append(c)

endpoint_defs = pd.read_feather(f"{output_path}/phecode_defs_220306.feather").query("endpoint==@endpoints").sort_values("endpoint").set_index("endpoint")


In [5]:
endpoint_defs.shape

(1171, 7)

In [6]:
# phecodes = ['phecode_979', 'phecode_202']
phecodes = endpoint_defs.index.values.tolist()
models = ['Age+Sex', 'Age+Sex+Retina']

In [7]:
all_preds = []
for phecode in tqdm(phecodes):
    model_preds = []
    for model in models:
        model_preds_endpoint = []
        for partition in range(0, 22):
            preds = pd.read_feather(f'{experiment_path}/coxph/predictions'\
                                    f'/{phecode}_{model}_ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.66_{partition}.feather')
            preds = preds[['eid', 'endpoint', 'Ft_10']].rename({'Ft_10': model}, axis=1)
            model_preds_endpoint.append(preds)
        model_preds_endpoint = pd.concat(model_preds_endpoint, axis=0)
        model_preds.append(model_preds_endpoint)
        
    preds = model_preds[0].set_index(['eid', 'endpoint'])  
    for i in range(1, len(model_preds)):
        preds = preds.merge(model_preds[i].set_index(['eid', 'endpoint']), left_index=True, right_index=True)
    all_preds.append(preds)

  0%|          | 0/1171 [00:00<?, ?it/s]

In [8]:
all_preds = pd.concat(all_preds, axis=0)

In [9]:
all_preds.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Age+Sex,Age+Sex+Retina
eid,endpoint,Unnamed: 2_level_1,Unnamed: 3_level_1
1475840,OMOP_4306655,0.115656,0.04905
1475895,OMOP_4306655,0.017572,0.010447
1475910,OMOP_4306655,0.015858,0.009131
1475935,OMOP_4306655,0.029305,0.013193
1476077,OMOP_4306655,0.078034,0.037716


In [10]:
all_preds.eid.nunique()

AttributeError: 'DataFrame' object has no attribute 'eid'

In [None]:
## calculate relative risk
epsilon = 0.000001
all_preds['relative_risks'] = all_preds['Age+Sex+Retina'] / (all_preds['Age+Sex']+epsilon)

In [None]:
relative_risks = pd.pivot(all_preds.reset_index(), index='eid', columns='endpoint', values='relative_risks')

In [None]:
relative_risks.shape

In [None]:
relative_risks.reset_index().to_feather(f'{experiment_path}/relative_risks_{today}.feather')

In [None]:
f'{experiment_path}/relative_risks_{today}.feather'

# MAKE SELECTION ON SIGNIFICANT ENDPOINTS

In [None]:
relative_risks = pd.read_feather('/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108/relative_risks_221109.feather'
).set_index('eid')

In [None]:
relative_risks.head()

In [None]:
significant_df = pd.read_csv('/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108/SupplTable2_SignificantEndpoints_CropRatio-0.66.csv')

In [None]:
significant_endpoints = significant_df.endpoint.values.tolist()

In [None]:
significant_endpoints

In [None]:
rr_significant = relative_risks[significant_endpoints]

In [None]:
rr_significant.head()

In [None]:
rr_significant.reset_index().to_feather('/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108/relative_risks_significant_endpoints_221109.feather')

In [None]:
retina_eids = rr_significant.index.values.tolist()

### Verify the file contents

In [None]:
rr_significant = pd.read_feather('/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108/relative_risks_significant_endpoints_221109.feather')

In [None]:
rr_significant.isna().sum()

In [None]:
baseline_outcomes = pd.read_feather('/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108/baseline_outcomes_retina_221109.feather')

In [None]:
baseline_outcomes['phecode_325-1_prev'].sum()

## get the excusions for the same endpoints

In [None]:
endpoints_df = pd.read_csv('/sc-projects/sc-proj-ukb-cvd/results/projects/22_retinal_risk/data/220602/endpoints.csv', index_col=0)

In [None]:
endpoints_df.head()

In [None]:
baseline_outcomes = pd.read_feather(f"/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/baseline_outcomes_220627.feather")

In [None]:
baseline_outcomes = baseline_outcomes.query("eid==@retina_eids").set_index('eid')
baseline_outcomes.shape

In [None]:
baseline_outcomes.isna().sum().sum()

In [None]:
drop = [c for c in baseline_outcomes if '_time' in c]
baseline_outcomes = baseline_outcomes.drop(drop, axis=1)

In [None]:
baseline_outcomes.shape

In [None]:
outcomes_overall = pd.DataFrame([], index=baseline_outcomes.index.values)

for endpoint in rr_significant.columns:
    outcomes_overall[endpoint] = ((baseline_outcomes[f'{endpoint}_prev'] + baseline_outcomes[f'{endpoint}_event']) > 0)
    

In [None]:
outcomes_overall.head()

In [None]:
outcomes_overall.reset_index().rename({'index': 'eid'}, axis=1).to_feather('/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108/outcomes_overall_221109.feather')

In [None]:
baseline_outcomes.reset_index().to_feather('/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108/baseline_outcomes_retina_221109.feather')

# write aggregated predictions:

In [None]:
preds = []
for partition in range(0,22):
    p = pd.read_feather(f'/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108/loghs/ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.66/{partition}/test.feather')
    preds.append(p)

In [None]:
len(preds)

In [None]:
preds = pd.concat(preds, axis=0)

In [None]:
preds.shape

In [None]:
preds.reset_index().to_feather('/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108/loghs_retina_221109.feather')

# Export to .tsv

In [None]:
files = [
    'baseline_outcomes_retina_221109.feather', #=> outcomes before baseline ('_prev' suffix) and outcomes after baseline ('_event' suffix)
    'loghs_retina_221109.feather', #=> retinal states
    'outcomes_overall_221109.feather', #=> outcomes overall: any('prev', 'event')
    'relative_risks_significant_endpoints_221109.feather' #=> risk_CPH(Age+Sex+Retina) / risk_CPH(Age+Sex)
]

In [None]:
for f in files:
    path = f'/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108/{f}'
    df = pd.read_feather(path).set_index('eid')
    out_path = f'/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108/{os.path.splitext(f)[0]}.tsv'
    df.to_csv(out_path, sep='\t')

In [None]:
/tmp/retinalrisk//tmp/retinalrisk/baseline_outcomes_retina_221109.feather 

# DO WE HAVE MISSING PHECODES??

In [4]:
asr_risk = pd.read_feather('')

In [6]:
# check if all selected are in the relative risk table:
selection = [
"phecode_164",
"phecode_417-2",
"phecode_665",
"phecode_205",
"phecode_387-2",
"phecode_374-51",
"phecode_367-5",
"phecode_374-42",
"phecode_582",
"phecode_542-1",
"phecode_396-1",
"phecode_005",
"phecode_059-1",
"phecode_287",
"phecode_236",
"phecode_733-6",
"phecode_114",
"phecode_337-8",
"phecode_498",
"phecode_812",
]

In [7]:
for p in selection:
    if not p  in x.columns:
        print(p)