In [None]:
%load_ext autoreload
%autoreload 2
%aimport -raw_data_preprocessing -pandas -numpy

In [None]:
import sys

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

sys.path.append("../")
sys.path.append("../ukbb_preprocessing/")

from raw_data_preprocessing.raw_data_loader import raw_data_loader
from raw_data_preprocessing.constants import *
from utils import rename_variables, DataRegisterer, percentile, save_results

# Project helper objects; `loader` is used below for workspace access and for the T2D table.
loader = raw_data_loader()
# NOTE(review): `registerer` is not referenced in the visible cells — confirm it is still needed.
registerer = DataRegisterer()

# Look up the "clin_ascvd" data asset, pinned to version "6"; its `.path` is read
# with pandas in the next cell.
data_asset = loader.ws.data.get(name="clin_ascvd", version="6")

In [None]:
# Read the clinical ASCVD table and index it by integer participant ID (IID).
df = pd.read_csv(data_asset.path)
df.index = df.IID.astype(int)
# NOTE(review): rename_variables presumably maps raw field names to the short
# names used below ('age', 'tc', 'hdlc', ...) — confirm in utils.
score = rename_variables(df)
t2d = loader._load_t2d()

# Left join: every participant in `score` is kept, and those without a row in
# `t2d` get NaN in the 't2d' / 't2d_date' columns.
score = score.merge(t2d, on='IID', how='left')
# NaN.astype(bool) evaluates to True, which would flag every unmatched
# participant as diabetic; mask missing values so they become False instead.
score['t2d'] = score['t2d'].notna() & score['t2d'].astype(bool)
score['t2d_date'] = pd.to_datetime(score['t2d_date'], format='%Y-%m-%d')

In [None]:
# # test example from the paper
# score = pd.DataFrame({
#     'IID': [1, 2, 3, 4],
#     'sr-ethnicity': [1.0, 5.0, 1.0, 5.0],
#     'sex': [0, 0, 1, 1],
#     'age': [55, 55, 55, 55],
#     'tc': [213 / 38.67, 213 / 38.67, 213 / 38.67, 213 / 38.67],
#     'hdlc': [50 / 38.67, 50 / 38.67, 50 / 38.67, 50 / 38.67],
#     'cvd_meds': ['0', '0', '0', '0'],
#     'systolic_blood_pressure': [120, 120, 120, 120],
#     'smoking': [0, 0, 0, 0],
#     'diabetes': [0, 0, 0, 0],
#     't2d': [False, False, False, False],
#     't2d_date': [pd.NaT, pd.NaT, pd.NaT, pd.NaT],
#     'study_date': [pd.to_datetime('2021-01-01'), pd.to_datetime('2021-01-01'), pd.to_datetime('2021-01-01'), pd.to_datetime('2021-01-01')]
# })
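# # expected with sex coded 0 = female, 1 = male: ~0.021, NaN, ~0.054, NaN (the paper's worked example reports 2.1% for white women and 5.3% for white men; non-white rows get NaN because only the white-race models are implemented)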

In [None]:
# Pooled Cohort Equations (PCE): 10-year ASCVD risk from the sex-specific models
# for white participants. Coefficients, mean linear predictors and baseline
# survival are from Table A,
# https://www.ahajournals.org/doi/pdf/10.1161/01.cir.0000437741.48606.98


def _sum_terms(terms):
    """Add the model terms left to right, counting NaN entries as 0.

    `nan=0.0` is passed by keyword on purpose: the second positional argument
    of np.nan_to_num is `copy`, not the replacement value.
    NOTE(review): a missing predictor therefore contributes 0 to the linear
    predictor instead of making the score NaN — confirm this is intended.
    """
    total = 0
    for term in terms:
        total = total + np.nan_to_num(term, nan=0.0)
    return total


# convert mmol / L to mg / dL; kept in locals so that the `score` columns are
# not rescaled in place (an in-place rescale would compound on re-execution)
MMOL_TO_MGDL = 38.67
ln_age  = np.log(score['age'])
ln_tc   = np.log(score['tc'] * MMOL_TO_MGDL)
ln_hdlc = np.log(score['hdlc'] * MMOL_TO_MGDL)
ln_sbp  = np.log(score['systolic_blood_pressure'])

# smoking code 2 is treated as "current smoker"
current_smoker = score["smoking"] == 2

# Prior diabetes: the 'diabetes' column equals 1, OR a T2D record that is either
# undated or dated before the study date. The comparison needs its own
# parentheses because `|` binds tighter than `==`.
# NOTE(review): assumes score['t2d'] is False for participants without a T2D
# record — verify where the column is built.
t2d_before_study = score["t2d"] & (
    score["t2d_date"].isna() | (score["t2d_date"] < score["study_date"])
)
prior_diabetes = (score["diabetes"] == 1) | t2d_before_study

# A '2' in cvd_meds selects the treated-SBP coefficient. na=False makes a
# missing cvd_meds count as untreated; without it the NaN returned by
# str.contains is truthy inside np.where.
on_bp_meds = score['cvd_meds'].str.contains('2', regex=False, na=False)

# this is the model for white people
# male white
pce_mw_age                   =  12.344*ln_age
pce_mw_tc                    =  11.853*ln_tc
pce_mw_age_tc                = - 2.664*ln_age*ln_tc
pce_mw_hdlc                  = - 7.990*ln_hdlc
pce_mw_age_hdlc              =   1.769*ln_age*ln_hdlc
pce_mw_sbp                   = np.where(on_bp_meds, 1.797*ln_sbp, 1.764*ln_sbp)
pce_mw_current_smoker        =   7.837*current_smoker
pce_mw_age_current_smoker    = - 1.795*ln_age*current_smoker
pce_mw_prior_diabetes        =   0.658*prior_diabetes

pce_mw = _sum_terms([pce_mw_age, pce_mw_tc, pce_mw_age_tc, pce_mw_hdlc, pce_mw_age_hdlc,
                     pce_mw_current_smoker, pce_mw_age_current_smoker, pce_mw_prior_diabetes,
                     pce_mw_sbp])

# risk = 1 - S0 ** exp(linear predictor - mean); S0 = 0.9144, mean = 61.18
pce_mw = 1 - 0.9144 ** np.exp(pce_mw - 61.18)

# female white
pce_fw_age                  = -29.799*ln_age
pce_fw_age2                 =   4.884*ln_age**2
pce_fw_tc                   =  13.540*ln_tc
pce_fw_age_tc               = - 3.114*ln_age*ln_tc
pce_fw_hdlc                 = -13.578*ln_hdlc
pce_fw_age_hdlc             =   3.149*ln_age*ln_hdlc
pce_fw_sbp                  = np.where(on_bp_meds, 2.019*ln_sbp, 1.957*ln_sbp)
pce_fw_current_smoker       =   7.574*current_smoker
pce_fw_age_current_smoker   = - 1.665*ln_age*current_smoker
pce_fw_prior_diabetes       =   0.661*prior_diabetes

pce_fw = _sum_terms([pce_fw_age, pce_fw_tc, pce_fw_age_tc, pce_fw_hdlc, pce_fw_age_hdlc,
                     pce_fw_current_smoker, pce_fw_age_current_smoker, pce_fw_prior_diabetes,
                     pce_fw_age2, pce_fw_sbp])

# S0 = 0.9665, mean = -29.18 (hence the "+ 29.18")
pce_fw = 1 - 0.9665 ** np.exp(pce_fw + 29.18)

# sex == 1 selects the male model, everything else the female model
score['y_score'] = np.where(score['sex'] == 1, pce_mw, pce_fw)
# only the white-race models are implemented: blank the score for every
# ethnicity code outside this list
score.loc[~score["sr-ethnicity"].isin([1.0, 1001, 1002, 1003]), 'y_score'] = np.nan

# compute top quantile by sex as threshold
thresholds = score[['sex', 'y_score']].groupby("sex").aggregate(percentile(0.95))
thresholds = thresholds.reset_index()
thresholds.columns = ["sex", "threshold"]

score = score.reset_index().merge(thresholds, on="sex", how="left").set_index('IID')
# NOTE(review): a NaN y_score compares False here, so unscored participants get
# y_pred = 0 rather than a missing prediction — confirm this is intended.
score["y_pred"] = (score["y_score"] > score["threshold"]).astype(int)

score = score[['y_score', 'y_pred']]
save_results(score, "results/pce")