In [None]:
%load_ext autoreload
%autoreload 2
%aimport -raw_data_preprocessing -pandas -numpy

In [None]:
import os
import sys

import pandas as pd
import numpy as np

sys.path.append("../")
sys.path.append("../ukbb_preprocessing/")

from raw_data_preprocessing.raw_data_loader import raw_data_loader
from raw_data_preprocessing.constants import *
from utils import rename_variables, DataRegisterer, percentile, save_results

# Switch between a worked example and the real cohort; the example branch of
# the loading cell is still a TODO, and results are only saved when this is False.
TEST_EXAMPLE = False
# Used by the loading cell to fetch the phenotype data asset (via loader.ws).
loader = raw_data_loader()
# NOTE(review): not referenced anywhere else in this notebook as visible here.
registerer = DataRegisterer()

In [None]:
# Load the raw phenotype table (`df`) and its renamed counterpart (`smart_df`).
# Both are cached as pickles under cache/ so that a Restart & Run All does not
# have to fetch and re-process the data asset every time.
if TEST_EXAMPLE:
    # TODO get example from manuscript, if any
    # Fail here with a clear message instead of a NameError in a later cell:
    # neither `df` nor `smart_df` would be defined on this branch.
    raise NotImplementedError("TEST_EXAMPLE is not implemented for this notebook yet")
else:
    # to_pickle does not create missing directories.
    os.makedirs("cache", exist_ok=True)

    if os.path.exists("cache/df.pkl"):
        print("Loading cached df")
        # Locally written cache; never point this at an untrusted pickle.
        df = pd.read_pickle("cache/df.pkl")
    else:
        data_asset = loader.ws.data.get(name="pheno_recvd", version="2")
        df = pd.read_csv(data_asset.path)
        df = df.rename(columns={"eid": "IID"})
        # Index rows by the integer participant id (IID is also kept as a column).
        df.index = df.IID.astype(int)
        df.to_pickle("cache/df.pkl")

    if os.path.exists("cache/processed_df.pkl"):
        print("Loading cached processed_df")
        smart_df = pd.read_pickle("cache/processed_df.pkl")
    else:
        smart_df = rename_variables(df)
        smart_df.to_pickle("cache/processed_df.pkl")

In [None]:
def match_codes(code_list, diagnosis_list):
    """Return True if any diagnosis starts with any of the given code prefixes.

    Parameters
    ----------
    code_list : iterable of str
        ICD-10 code prefixes, e.g. ``["I20", "I702"]``.
    diagnosis_list : iterable of str, None or float NaN
        Diagnosis codes of one participant. A missing value (None / NaN) is
        treated as "no diagnoses"; non-string entries inside the list are
        ignored.

    Returns
    -------
    bool
        True when at least one diagnosis matches a prefix, False otherwise.
    """
    # A missing cell arrives as None or a float NaN rather than a list.
    if diagnosis_list is None or isinstance(diagnosis_list, float):
        return False

    # str.startswith accepts a tuple of prefixes, replacing the inner loop.
    prefixes = tuple(code_list)
    return any(
        isinstance(diagnosis, str) and diagnosis.startswith(prefixes)
        for diagnosis in diagnosis_list
    )

# from https://academic.oup.com/eurjpc/advance-article/doi/10.1093/eurjpc/zwae352/7849694?login=true
# Study design

# ICD-10 code prefixes that define each prior-disease category. They are fed to
# `match_codes`, which tests `str.startswith`, so a 3-character entry such as
# "G45" already covers its 4-character sub-codes ("G450", ...); the sub-codes
# are listed explicitly but do not change the matching result.

# from bud
cerebrovascular_disease_icd10 = [
    "G45", "G450", "G451", "G452", "G453", "G454", "G458",
    "G459", "I60", "I600", "I601", "I602", "I603", "I604",
    "I605", "I606", "I607", "I608", "I609", "I61", "I610",
    "I611", "I612", "I613", "I614", "I615", "I616", "I618",
    "I619", "I62", "I620", "I621", "I629", "I63", "I630",
    "I631", "I632", "I633", "I634", "I635", "I636", "I638",
    "I639", "I64", "I65", "I650", "I651", "I652", "I653", 
    "I658", "I659", "I66", "I660", "I661", "I662", "I663",
    "I664", "I668", "I669", "I67", "I670", "I671", "I672",
    "I673", "I674", "I675", "I676", "I677", "I678", "I679",
    "I68", "I680", "I681", "I682", "I688", "I69", "I690",
    "I691", "I692", "I693", "I694", "I698"
]
# from our manuscript
coronary_artery_disease_icd10 = ["I20", "I21", "I22", "I23", "I24", "I25"]
# from bud
# Note: "I702" is a 4-character prefix, so other I70 sub-codes do not match.
peripheral_arterial_disease_icd10 = ["I702", "I73"]
abdominal_aortic_aneurysm_icd10 = ["I71"]
    
# Derive one boolean history flag per disease category from each participant's
# icd10_diagnoses list. Columns are created in the order listed here.
_prior_disease_codes = {
    "prior_cerebrovascular_disease": cerebrovascular_disease_icd10,
    "prior_coronary_artery_disease": coronary_artery_disease_icd10,
    "prior_peripheral_arterial_disease": peripheral_arterial_disease_icd10,
    "prior_abdominal_aortic_aneurysm": abdominal_aortic_aneurysm_icd10,
}

for _flag_column, _icd10_prefixes in _prior_disease_codes.items():
    # Bind the prefixes as a default argument so the lambda does not depend on
    # the loop variable after this iteration.
    smart_df[_flag_column] = smart_df["icd10_diagnoses"].apply(
        lambda diagnoses, _prefixes=_icd10_prefixes: match_codes(_prefixes, diagnoses)
    )

# Implementation

We used the following resources:
- The coefficients from the [SMART manuscript](https://heart.bmj.com/content/99/12/866.long), Table 2.

In [None]:
# Model A, SMART risk score
# Columns below: predictor, coefficient, chi-square, p-value, hazard ratio (95% CI)
#  Age in years	−0.0850	62.8*	<0.01*	1.86 (1.59 to 2.19)*
#  Age in years squared	0.0011			
#  Male sex	0.1561	2.2	0.14	1.19 (0.94 to 1.51)
#  Diabetes mellitus	0.2232	5.3	0.02	1.30 (1.05 to 1.63)
#  Current smoking	0.2617	7.6	<0.01	1.33 (1.09 to 1.62)
#  Systolic blood pressure (per 10 mm Hg)	0.0043	4.1	0.04	1.04 (1.00 to 1.09)
#  Total cholesterol (mmol/l)	0.0959	5.6	0.02	1.11 (1.02 to 1.20)
#  HDL-cholesterol (mmol/l)	−0.4256	9.8	<0.01	0.63 (0.47 to 0.85)
#  hs-CRP (mg/dl) log transformed	0.1394	9.8	<0.01	1.24 (1.08 to 1.41)*
#  eGFR (ml/min/1.73 m²)	−0.0532	21.0*	<0.01	0.87 (0.76 to 0.98)*
#  eGFR squared	0.0003			
#  Years since first vascular event	0.0229	7.4	<0.01	1.02 (1.01 to 1.03)
#  History of cerebrovascular disease	0.4058	17.4	<0.01	1.65 (1.31 to 2.08)
#  History of coronary artery disease	0.1401	3.9	0.05	1.28 (1.00 to 1.62)
#  History of abdominal aortic aneurysm	0.5578	21.2	<0.01	1.93 (1.48 to 2.51)
#  History of peripheral arterial disease	0.2832	9.0	<0.01	1.44 (1.14 to 1.81)

# Linear predictor of the SMART model: sum of coefficient * covariate.
# NOTE(review): assumes sex is coded 1 = male and diabetes / smoking are 0/1
# indicators (smoking = current smoker) — confirm against rename_variables.
smart  = -0.0850 * smart_df["age"]
smart +=  0.0011 * smart_df["age"] ** 2
smart +=  0.1561 * smart_df["sex"]
smart +=  0.2232 * smart_df["diabetes"]
smart +=  0.2617 * smart_df["smoking"]
# "per 10 mm Hg" in the table refers to the hazard ratio, not the coefficient:
# exp(0.0043 * 10) ~= 1.04 reproduces the tabulated HR, so 0.0043 is already the
# per-mm Hg coefficient and SBP must not be divided by 10.
# NOTE(review): assumes systolic_blood_pressure is in mm Hg — confirm.
smart +=  0.0043 * smart_df["systolic_blood_pressure"]
smart +=  0.0959 * smart_df["tc"]
smart += -0.4256 * smart_df["hdlc"]
# NOTE(review): the /10 presumably converts mg/L to the mg/dl of the table
# header; confirm the unit of `crp` and the unit used by the published formula.
# A CRP of 0 yields -inf here.
smart +=  0.1394 * np.log(smart_df["crp"] / 10)
smart += -0.0532 * smart_df["egfr_creat_cys"]
smart +=  0.0003 * smart_df["egfr_creat_cys"] ** 2
smart +=  0.0229 * smart_df["years_since_first_event"]
smart +=  0.4058 * smart_df["prior_cerebrovascular_disease"]
smart +=  0.1401 * smart_df["prior_coronary_artery_disease"]
smart +=  0.5578 * smart_df["prior_abdominal_aortic_aneurysm"]
smart +=  0.2832 * smart_df["prior_peripheral_arterial_disease"]

In [None]:
# Convert the linear predictor into a 10-year risk in percent.
# NOTE(review): 0.81066 and 2.099 are taken to be the baseline-survival and
# centering constants of the published SMART formula — verify against the paper.
smart_df["y_score"] = (1 - 0.81066 ** np.exp(smart + 2.099)) * 100

# remove those with no prior arterial atherosclerosis
# i.e. keep the score only for participants with at least one of the four
# prior-disease flags; everyone else gets NaN.
has_prior_disease = (
    smart_df["prior_cerebrovascular_disease"].astype(bool)
    | smart_df["prior_coronary_artery_disease"].astype(bool)
    | smart_df["prior_peripheral_arterial_disease"].astype(bool)
    | smart_df["prior_abdominal_aortic_aneurysm"].astype(bool)
)
excluded = smart_df[~has_prior_disease]

# Assign through the boolean mask so exactly the excluded rows are blanked.
smart_df.loc[~has_prior_disease, "y_score"] = np.nan

score = smart_df[['sex', 'y_score']].copy()

# QC

In [None]:
# compute auc
from sklearn.metrics import roc_auc_score

# Keep labels as floats until after masking: casting NaN to bool yields True,
# which would silently count participants with a missing outcome as events.
# Rows of `df` and `score` are paired by position, as before.
labels = df["event_10yr_label"].astype(float).to_numpy()
y_score = score['y_score'].to_numpy()

# Evaluate only where both the outcome and the score are available.
mask = ~np.isnan(labels) & ~np.isnan(y_score)
y_true = labels[mask].astype(bool)
y_score = y_score[mask]

roc_auc_score(y_true, y_score)

# Save

In [None]:
# compute top quantile by sex as threshold
# (`percentile(0.95)` comes from utils; presumably the 95th percentile aggregator)
thresholds = score[['sex', 'y_score']].groupby("sex").aggregate(percentile(0.95))
thresholds = thresholds.reset_index()
thresholds.columns = ["sex", "threshold"]

# Start from the two base columns so the cell can be re-run: merging onto a
# frame that already carries "threshold" would produce threshold_x/threshold_y
# and break the lookup below.
score = (
    score[['sex', 'y_score']]
    .reset_index()
    .merge(thresholds, on="sex", how="left")
    .set_index('IID')
)
# A NaN score or threshold compares False, so those rows get y_pred = 0.
score["y_pred"] = (score["y_score"] > score["threshold"]).astype(int)

if not TEST_EXAMPLE:
    save_results(score[['y_score', 'y_pred']], "results/smart")