In [None]:
%load_ext autoreload
%autoreload 2
%aimport -raw_data_preprocessing -pandas -numpy

In [None]:
import sys

import pandas as pd
import numpy as np

sys.path.append("../")
sys.path.append("../ukbb_preprocessing/")

from raw_data_preprocessing.raw_data_loader import raw_data_loader
from raw_data_preprocessing.constants import *
from utils import rename_variables, DataRegisterer, percentile, save_results

# Project helper objects (raw_data_loader comes from raw_data_preprocessing,
# DataRegisterer from utils).
loader = raw_data_loader()
# NOTE(review): `registerer` is not referenced again in the visible part of this notebook.
registerer = DataRegisterer()

# Handle to version "6" of the "clin_ascvd" data asset, fetched through the loader's `ws` client.
# Only the commented-out loading cell below reads `data_asset.path`.
data_asset = loader.ws.data.get(name="clin_ascvd", version="6")

In [None]:
# df = pd.read_csv(data_asset.path)
# prevent_df = rename_variables(df)
# df.set_index('IID', inplace=True)

In [None]:
# Worked example from the article (Table S25): a single participant used to
# sanity-check the implementation below.
# Lipids are multiplied by 0.02586 (presumably mg/dL -> mmol/L — confirm).
example_participant = {
    "IID": 1,
    "sex": 0,
    "age": 50,
    "tc": 200 * 0.02586,
    "hdlc": 45 * 0.02586,
    "systolic_blood_pressure": 160,
    "diabetes": 1,
    "smoking": 0,
    "egfr_creat_cys": 90,
    "medications": ["1140860332"],  # arbitrary antihypertensive code
    "hbA1c": np.nan,
}
prevent_df = pd.DataFrame([example_participant]).set_index('IID')

# Matching outcome table: one label per IID.
df = pd.DataFrame(
    {"ascvd_10yr_label": [0]},
    index=pd.Index([1], name="IID"),
)
# Expected: (reference value not recorded here); obtained: 10.35%.
# Probably OK: we use the model with hbA1c, and the term-wise comparison
# produces similar results.

In [None]:
# Medication codes counted as antihypertensive treatment. They are matched
# against the 'medications' column to build the 'b_treatedhyp' indicator.
# NOTE(review): presumably UK Biobank medication codes — confirm provenance.
antihypertensives = [
    1140860332, 1140860334, 1140860336, 1140860338, 1140860340, 1140860342,
    1140860348, 1140860352, 1140860356, 1140860358, 1140860362, 1140860380,
    1140860382, 1140860386, 1140860390, 1140860394, 1140860396, 1140860398,
    1140860402, 1140860404, 1140860406, 1140860410, 1140860418, 1140860422,
    1140860426, 1140860434, 1140860454, 1140860470, 1140860478, 1140860492,
    1140860498, 1140860520, 1140860532, 1140860534, 1140860544, 1140860552,
    1140860558, 1140860562, 1140860564, 1140860580, 1140860590, 1140860610,
    1140860628, 1140860632, 1140860638, 1140860654, 1140860658, 1140860690,
    1140860696, 1140860706, 1140860714, 1140860728, 1140860736, 1140860738,
    1140860750, 1140860752, 1140860758, 1140860764, 1140860776, 1140860784,
    1140860790, 1140860802, 1140860806, 1140860828, 1140860830, 1140860834,
    1140860836, 1140860838, 1140860840, 1140860842, 1140860846, 1140860848,
    1140860862, 1140860878, 1140860882, 1140860892, 1140860904, 1140860912,
    1140860918, 1140860938, 1140860942, 1140860952, 1140860954, 1140860966,
    1140860972, 1140860976, 1140860982, 1140860988, 1140860994, 1140861000,
    1140861002, 1140861008, 1140861010, 1140861016, 1140861022, 1140861024,
    1140861034, 1140861046, 1140861068, 1140861070, 1140861088, 1140861090,
    1140861106, 1140861110, 1140861114, 1140861120, 1140861128, 1140861130,
    1140861136, 1140861138, 1140861166, 1140861176, 1140861190, 1140861194,
    1140861202, 1140861266, 1140861268, 1140861276, 1140861282, 1140861326,
    1140861384, 1140864950, 1140864952, 1140866072, 1140866074, 1140866078,
    1140866084, 1140866086, 1140866090, 1140866092, 1140866094, 1140866096,
    1140866102, 1140866104, 1140866108, 1140866110, 1140866116, 1140866122,
    1140866128, 1140866132, 1140866136, 1140866138, 1140866140, 1140866144,
    1140866146, 1140866156, 1140866158, 1140866162, 1140866164, 1140866168,
    1140866182, 1140866192, 1140866194, 1140866200, 1140866202, 1140866206,
    1140866210, 1140866212, 1140866220, 1140866222, 1140866226, 1140866230,
    1140866232, 1140866236, 1140866244, 1140866248, 1140866262, 1140866280,
    1140866282, 1140866306, 1140866308, 1140866312, 1140866318, 1140866324,
    1140866328, 1140866330, 1140866332, 1140866334, 1140866340, 1140866352,
    1140866354, 1140866356, 1140866360, 1140866388, 1140866390, 1140866396,
    1140866400, 1140866402, 1140866404, 1140866406, 1140866408, 1140866410,
    1140866412, 1140866416, 1140866418, 1140866420, 1140866422, 1140866426,
    1140866438, 1140866440, 1140866442, 1140866444, 1140866446, 1140866448,
    1140866450, 1140866460, 1140866466, 1140866484, 1140866506, 1140866546,
    1140866554, 1140866692, 1140866704, 1140866712, 1140866724, 1140866726,
    1140866738, 1140866756, 1140866758, 1140866764, 1140866766, 1140866778,
    1140866782, 1140866784, 1140866798, 1140866800, 1140866802, 1140866804,
    1140875808, 1140879758, 1140879760, 1140879762, 1140879778, 1140879782,
    1140879786, 1140879794, 1140879798, 1140879802, 1140879806, 1140879810,
    1140879818, 1140879822, 1140879824, 1140879826, 1140879830, 1140879834,
    1140879842, 1140879854, 1140879866, 1140888510, 1140888512, 1140888552,
    1140888556, 1140888560, 1140888578, 1140888582, 1140888586, 1140888646,
    1140888686, 1140888760, 1140888762, 1140909368, 1140911698, 1140916356,
    1140916362, 1140917428, 1140923572, 1140923712, 1140923718, 1140926778,
    1140926780, 1141145658, 1141145660, 1141145668, 1141151016, 1141151018,
    1141151382, 1141152600, 1141152998, 1141153006, 1141153026, 1141153032,
    1141153328, 1141156754, 1141156808, 1141156836, 1141156846, 1141157252,
    1141157254, 1141164148, 1141164154, 1141164276, 1141164280, 1141165470,
    1141165476, 1141166006, 1141167822, 1141167832, 1141171152, 1141171336,
    1141171344, 1141172682, 1141172686, 1141172698, 1141173888, 1141180592,
    1141180598, 1141187788, 1141187790, 1141190160, 1141192064, 1141193282,
    1141193346, 1141194794, 1141194800, 1141194804, 1141194808, 1141194810,
    1141201038, 1141201040]
# Medication codes counted as statin treatment; matched against the
# 'medications' column to build the 'statin' indicator.
statins = [1141146234, 1140888594, 1140888648, 1141192410, 1140861958]

def binarize_column(values, ori_colname, colname, df=None):
    """Add a 0/1 indicator column flagging rows that contain any of `values`.

    Each code in `values` is converted to `str` and tested with the `in`
    operator against every cell of `ori_colname`. The match is therefore
    element membership when the cell is a list and substring search when the
    cell is a string.

    Parameters
    ----------
    values : iterable
        Codes to look for (converted with `str`).
    ori_colname : str
        Column holding, per row, the container/string to search in.
    colname : str
        Name of the indicator column to create (overwritten if it exists).
    df : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level `prevent_df`,
        which preserves the original calling convention.

    Returns
    -------
    None
        The frame is modified in place.
    """
    target = prevent_df if df is None else df

    # Convert the codes once instead of once per (code, row) pair.
    codes = [str(v) for v in values]

    def _contains_any(cell):
        # A missing cell (None / NaN / pd.NA) holds no codes; the original
        # implementation raised TypeError on `str(v) in nan`.
        if cell is None or cell is pd.NA or (isinstance(cell, float) and np.isnan(cell)):
            return False
        return any(code in cell for code in codes)

    # Single pass over the rows; `any` short-circuits on the first hit.
    mask = np.fromiter(
        (_contains_any(cell) for cell in target[ori_colname].values),
        dtype=bool,
        count=len(target),
    )

    target[colname] = 0
    target.loc[mask, colname] = 1

# Derive the 0/1 treatment indicators from the medication codes.
binarize_column(antihypertensives, 'medications', 'b_treatedhyp')
binarize_column(statins, 'medications', 'statin')

# Centre and scale the raw predictors into the units the coefficients expect.
sbp = prevent_df["systolic_blood_pressure"]
egfr = prevent_df["egfr_creat_cys"]

# Age per 10 years (centred at 55); HDL-C per 0.3 mmol/L (centred at 1.3);
# non-HDL-C = total cholesterol - HDL-C, centred at 3.5.
prevent_df["n_age"] = (prevent_df["age"] - 55) / 10
prevent_df["n_hdlc"] = (prevent_df["hdlc"] - 1.3) / 0.3
prevent_df["n_non-hdlc"] = (prevent_df["tc"] - prevent_df["hdlc"]) - 3.5

# Two-segment terms: a different centring is used below / above the knot
# (110 mmHg for SBP, 60 for eGFR). eGFR is expressed per -15 units.
# NOTE(review): only the segment matching the raw value is populated here;
# preventr appears to build both segments with pmin/pmax for every subject —
# confirm against the reference implementation.
prevent_df["n_sbp"] = np.where(sbp < 110, (sbp - 110) / 20, (sbp - 130) / 20)
prevent_df["n_egfr"] = np.where(egfr < 60, (egfr - 60) / -15, (egfr - 90) / -15)

# HbA1c rescaled to percent (presumably from IFCC mmol/mol — confirm), then centred at 5.3.
prevent_df["hba1c_perc"] = (prevent_df["hbA1c"] / 10.929 + 2.15)
prevent_df["n_hba1c"] = prevent_df["hba1c_perc"] - 5.3

# Age interactions, set to zero outside the segment they belong to.
prevent_df["age_x_sbp"] = np.where(sbp >= 110,
                                   prevent_df["n_age"] * prevent_df["n_sbp"],
                                   0)
prevent_df["age_x_egfr"] = np.where(egfr < 60,
                                    prevent_df["n_age"] * prevent_df["n_egfr"],
                                    0)



# Implementation

We used the following resources:
- The coefficients from the [PREVENT manuscript](https://www.ahajournals.org/doi/10.1161/CIRCULATIONAHA.123.067626), Table S12C.
- [preventr](https://github.com/martingmayer/preventr/blob/main/R/prevent_equations.R)'s implementation

## Women

In [None]:
# Linear predictor (log-odds) of the model for women, accumulated term by term
# from the normalised columns of prevent_df. The coefficient applied on each
# line is quoted in the comment above it.
# NOTE(review): the SBP / eGFR terms apply only the segment matching the raw
# value; preventr appears to build both segments with pmin/pmax for every
# subject — confirm against the reference implementation.

# Age, per 10 years: 0.7858178
prevent_w = 0.7858178 * prevent_df["n_age"]

# non-HDL-C per 1 mmol/L: 0.0194438
prevent_w += 0.0194438 * prevent_df["n_non-hdlc"]

# HDL-C per 0.3 mmol/L: -0.1521964
prevent_w += -0.1521964 * prevent_df["n_hdlc"]

# SBP <110 per 20 mmHg: -0.2296681
# SBP ≥110 per 20 mmHg: 0.3465777
prevent_w += np.where(prevent_df["systolic_blood_pressure"] < 110,
                      -0.2296681 * prevent_df["n_sbp"],
                      0.3465777 * prevent_df["n_sbp"])

# Diabetes: 0.5366241
prevent_w += 0.5366241 * prevent_df["diabetes"]

# Current smoking: 0.5411682
prevent_w += 0.5411682 * prevent_df["smoking"]

# eGFR <60, per -15 ml: 0.5931898
# eGFR 60+, per -15 ml: 0.0472458
prevent_w += np.where(prevent_df["egfr_creat_cys"] < 60,
                      0.5931898 * prevent_df["n_egfr"],
                      0.0472458 * prevent_df["n_egfr"])

# Anti-hypertensive use: 0.3158567
prevent_w += 0.3158567 * prevent_df["b_treatedhyp"]

# Statin use: -0.1535174
prevent_w += -0.1535174 * prevent_df["statin"]

# Treated SBP ≥110 mm Hg per 20 mm Hg: -0.0687752
# The comparison must be parenthesised because `&` binds tighter than `>=`.
# Rows that are untreated or below 110 mmHg contribute 0; adding `prevent_w`
# there would double the running sum.
prevent_w += np.where((prevent_df["systolic_blood_pressure"] >= 110) & (prevent_df["b_treatedhyp"] == 1),
                      -0.0687752 * prevent_df["n_sbp"],
                      0)

# Treated non-HDL-C: 0.1054746
prevent_w += 0.1054746 * prevent_df["n_non-hdlc"] * prevent_df["statin"]

# Age per 10yr * non-HDL-C per 1 mmol/L: -0.0761119
prevent_w += -0.0761119 * prevent_df["n_age"] * prevent_df["n_non-hdlc"]

# Age per 10yr * HDL-C per 0.3 mmol/L: 0.0307469
prevent_w += 0.0307469 * prevent_df["n_age"] * prevent_df["n_hdlc"]

# Age per 10yr * SBP ≥110 mm Hg per 20 mmHg: -0.0905966
prevent_w += -0.0905966 * prevent_df["age_x_sbp"]

# Age per 10yr * diabetes: -0.2241857
prevent_w += -0.2241857 * prevent_df["n_age"] * prevent_df["diabetes"]

# Age per 10yr * current smoking: -0.080186
prevent_w += -0.080186 * prevent_df["n_age"] * prevent_df["smoking"]

# Age per 10yr * eGFR <60, per -15 ml: -0.1667286
prevent_w += -0.1667286 * prevent_df["age_x_egfr"]

# HbA1c in DM, per 1%: 0.1338348
# HbA1c no DM, per 1%: 0.1412555
# Missing HbA1c: -0.0142496
# Rows without HbA1c get the fixed "missing" coefficient; np.where only selects
# values, so the NaN in n_hba1c never reaches the sum for those rows.
prevent_w += np.where(prevent_df["hbA1c"].isna(),
                      -0.0142496,
                      np.where(prevent_df["diabetes"],
                               0.1338348 * prevent_df["n_hba1c"],
                               0.1412555 * prevent_df["n_hba1c"]))

# Constant: -3.306162
prevent_w += -3.306162

## Men

In [None]:
# Linear predictor (log-odds) of the model for men, accumulated term by term
# from the normalised columns of prevent_df. The coefficient applied on each
# line is quoted in the comment above it.
# NOTE(review): the SBP / eGFR terms apply only the segment matching the raw
# value; preventr appears to build both segments with pmin/pmax for every
# subject — confirm against the reference implementation.

# Age, per 10 years: 0.7699177
prevent_m = 0.7699177 * prevent_df["n_age"]

# non-HDL-C per 1 mmol/L: 0.0605093
prevent_m += 0.0605093 * prevent_df["n_non-hdlc"]

# HDL-C per 0.3 mmol/L: -0.0888525
prevent_m += -0.0888525 * prevent_df["n_hdlc"]

# SBP <110 per 20 mmHg: -0.417713
# SBP ≥110 per 20 mmHg: 0.3288657
prevent_m += np.where(prevent_df["systolic_blood_pressure"] < 110,
                      -0.417713 * prevent_df["n_sbp"],
                      0.3288657 * prevent_df["n_sbp"])

# Diabetes: 0.4759471
prevent_m += 0.4759471 * prevent_df["diabetes"]

# Current smoking: 0.4385663
prevent_m += 0.4385663 * prevent_df["smoking"]

# eGFR <60, per -15 ml: 0.5334616
# eGFR 60+, per -15 ml: 0.0206431
prevent_m += np.where(prevent_df["egfr_creat_cys"] < 60,
                      0.5334616 * prevent_df["n_egfr"],
                      0.0206431 * prevent_df["n_egfr"])

# Anti-hypertensive use: 0.2917524
prevent_m += 0.2917524 * prevent_df["b_treatedhyp"]

# Statin use: -0.1383313
prevent_m += -0.1383313 * prevent_df["statin"]

# Treated SBP ≥110 mm Hg per 20 mm Hg: -0.0482622
# The comparison must be parenthesised because `&` binds tighter than `>=`.
# Rows that are untreated or below 110 mmHg contribute 0; adding `prevent_m`
# there would double the running sum.
prevent_m += np.where((prevent_df["systolic_blood_pressure"] >= 110) & (prevent_df["b_treatedhyp"] == 1),
                      -0.0482622 * prevent_df["n_sbp"],
                      0)

# Treated non-HDL-C: 0.1393796
prevent_m += 0.1393796 * prevent_df["n_non-hdlc"] * prevent_df["statin"]

# Age per 10yr * non-HDL-C per 1 mmol/L: -0.0463501
prevent_m += -0.0463501 * prevent_df["n_age"] * prevent_df["n_non-hdlc"]

# Age per 10yr * HDL-C per 0.3 mmol/L: 0.0205926
prevent_m += 0.0205926 * prevent_df["n_age"] * prevent_df["n_hdlc"]

# Age per 10yr * SBP ≥110 mm Hg per 20 mmHg: -0.1037717
prevent_m += -0.1037717 * prevent_df["age_x_sbp"]

# Age per 10yr * diabetes: -0.1737697
prevent_m += -0.1737697 * prevent_df["n_age"] * prevent_df["diabetes"]

# Age per 10yr * current smoking: -0.0915839
prevent_m += -0.0915839 * prevent_df["n_age"] * prevent_df["smoking"]

# Age per 10yr * eGFR <60, per -15 ml: -0.1637039
prevent_m += -0.1637039 * prevent_df["age_x_egfr"]

# HbA1c in DM, per 1%: 0.13159
# HbA1c no DM, per 1%: 0.1295185
# Missing HbA1c: -0.0128373
# Rows without HbA1c get the fixed "missing" coefficient; np.where only selects
# values, so the NaN in n_hba1c never reaches the sum for those rows.
prevent_m += np.where(prevent_df["hbA1c"].isna(),
                      -0.0128373,
                      np.where(prevent_df["diabetes"],
                               0.13159 * prevent_df["n_hba1c"],
                               0.1295185 * prevent_df["n_hba1c"]))

# Constant: -3.040901
prevent_m += -3.040901

In [None]:
# Pick the sex-specific linear predictor (sex == 0 -> women's equation,
# otherwise men's) and map it to a probability with the logistic function.
log_odds = np.where(prevent_df["sex"] == 0, prevent_w, prevent_m)
odds = np.exp(log_odds)
prevent_df["y_score"] = odds / (1 + odds)

# Blank the score for rows with any predictor outside the accepted range.
# A missing value compares False on both sides, so it does not trigger exclusion here.
age = prevent_df["age"]
egfr_raw = prevent_df["egfr_creat_cys"]
hdl = prevent_df["hdlc"]
total_chol = prevent_df["tc"]
sbp_raw = prevent_df["systolic_blood_pressure"]
hba1c_pct = prevent_df["hba1c_perc"]

out_of_range = (
    (age < 30) | (age > 79)
    | (egfr_raw < 15) | (egfr_raw > 140)
    | (hdl < 0.52) | (hdl > 2.59)
    | (total_chol < 3.36) | (total_chol > 8.28)
    | (sbp_raw < 90) | (sbp_raw > 180)
    | (hba1c_pct < 4.5) | (hba1c_pct > 15)
)
excluded = prevent_df[out_of_range]

prevent_df.loc[excluded.index, "y_score"] = np.nan

# Keep only what the downstream cells need.
score = prevent_df[['sex', 'y_score']].copy()

# QC

In [None]:
# compute auc
from sklearn.metrics import roc_auc_score

# Pair labels with scores through the shared IID index instead of relying on
# both frames having the same row order. IIDs without a label become NaN and
# are dropped by the mask below.
y_true = df["ascvd_10yr_label"].reindex(score.index).to_numpy(dtype=float)
y_score = score["y_score"].to_numpy(dtype=float)

mask = ~np.isnan(y_true) & ~np.isnan(y_score)
y_true = y_true[mask]
y_score = y_score[mask]

# AUC is undefined unless both classes are present (e.g. with the one-row
# worked example); report NaN instead of letting roc_auc_score raise.
# Previously recorded value: 0.7210585037045294
# NOTE(review): presumably obtained on the full cohort — re-check after any
# change to the score computation.
auc = roc_auc_score(y_true, y_score) if np.unique(y_true).size == 2 else np.nan
auc

# Save

In [None]:
# Per-sex decision threshold: aggregate y_score within each sex using
# percentile(0.95) (helper imported from utils).
thresholds = (
    score[['sex', 'y_score']]
    .groupby("sex")
    .aggregate(percentile(0.95))
    .reset_index()
)
thresholds.columns = ["sex", "threshold"]

# Attach each row's sex-specific threshold; the index is reset for the merge
# and IID is restored as the index afterwards.
score = (
    score
    .reset_index()
    .merge(thresholds, on="sex", how="left")
    .set_index('IID')
)

# Rows with a missing score compare False and therefore get y_pred = 0.
score["y_pred"] = score["y_score"].gt(score["threshold"]).astype(int)

score = score[['y_score', 'y_pred']]
save_results(score, "results/prevent")