In [None]:
%load_ext autoreload
%autoreload 2
%aimport -raw_data_preprocessing -pandas -numpy

In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

sys.path.append("../")
sys.path.append("../ukbb_preprocessing/")

from raw_data_preprocessing.raw_data_loader import raw_data_loader
from raw_data_preprocessing.constants import *
from utils import rename_variables, DataRegisterer, percentile, save_results

# Toggle: True scores the small hand-built example frame; False loads the cohort data
# and writes the results to disk at the end of the notebook.
TEST_EXAMPLE = False
# Project data-access helper; later cells use it to look up the data asset and the t2d table.
loader = raw_data_loader()
# NOTE(review): `registerer` is not referenced in the visible cells — confirm it is still needed.
registerer = DataRegisterer()

In [None]:
# Build the `score` frame: either a two-row worked example or the cached/loaded cohort.
if TEST_EXAMPLE:
    # Worked example from the article https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8248998/
    # (Supplementary Table 4): one row per sex, otherwise identical risk factors.
    score = pd.DataFrame(
        {
            "IID": [1, 2],
            "sex": [1, 0],
            "age": [50, 50],
            "smoking": [2, 2],
            "systolic_blood_pressure": [140, 140],
            "tc": [6.3, 6.3],
            "hdlc": [1.4, 1.4],
            "t2d": [0, 0],
            "diabetes": [0, 0],
            "t2d_date": [pd.NaT, pd.NaT],
            "study_date": [pd.Timestamp("1970-01-01"), pd.Timestamp("1970-01-01")],
        })
    # expected output: 0.0631 0.0434
else:
    # The pickles below are written on first run; make sure their directory exists so a
    # fresh checkout does not fail inside to_pickle().
    os.makedirs("cache", exist_ok=True)

    # Raw clinical table: read from the local cache when present, otherwise fetch the
    # data asset and cache it.
    if os.path.exists("cache/df.pkl"):
        print("Loading cached df")
        df = pd.read_pickle("cache/df.pkl")
    else:
        data_asset = loader.ws.data.get(name="clin_ascvd", version="6")
        df = pd.read_csv(data_asset.path)
        df.index = df.IID.astype(int)
        df.to_pickle("cache/df.pkl")

    # Renamed/processed table, cached separately from the raw one.
    if os.path.exists("cache/processed_df.pkl"):
        print("Loading cached processed_df")
        score = pd.read_pickle("cache/processed_df.pkl")
    else:
        score = rename_variables(df)
        score.to_pickle("cache/processed_df.pkl")

    # Attach the t2d flag and date; the left merge keeps every row of `score`.
    t2d = loader._load_t2d()
    score = score.merge(t2d, on="IID", how="left")
    # Rows without a match in `t2d` carry NaN after the left merge, and NaN.astype(bool)
    # is True. Mask those rows so "no t2d record" is treated as "no t2d", not as a case.
    t2d_recorded = score["t2d"].notna()
    score["t2d"] = t2d_recorded & score["t2d"].astype(bool)
    score["t2d_date"] = pd.to_datetime(score["t2d_date"], format="%Y-%m-%d")

In [None]:
# Per-sex coefficient tables for the risk model used below (see the article cited in the
# data-loading cell). "main" multiplies the transformed risk factor itself,
# "age_interaction" multiplies risk factor x n_age. "baseline_survival" is the base of
# the uncalibrated risk, and scale1/scale2 are the recalibration constants.
# NOTE(review): the scale constants presumably correspond to one specific risk region —
# confirm they match the cohort being scored.
_SCORE2_PARAMS = {
    "score2_f": {
        "main": {
            "n_age": 0.4648,
            "current_smoker": 0.7744,
            "n_sbp": 0.3131,
            "prior_diabetes": 0.8096,
            "n_tc": 0.1002,
            "n_hdlc": -0.2606,
        },
        "age_interaction": {
            "current_smoker": -0.1088,
            "n_sbp": -0.0277,
            "n_tc": -0.0226,
            "n_hdlc": 0.0613,
            "prior_diabetes": -0.1272,
        },
        "baseline_survival": 0.9605,
        "scale1": -0.7380,
        "scale2": 0.7019,
    },
    "score2_m": {
        "main": {
            "n_age": 0.3742,
            "current_smoker": 0.6012,
            "n_sbp": 0.2777,
            "prior_diabetes": 0.6457,
            "n_tc": 0.1458,
            "n_hdlc": -0.2698,
        },
        "age_interaction": {
            "current_smoker": -0.0755,
            "n_sbp": -0.0255,
            "n_tc": -0.0281,
            "n_hdlc": 0.0426,
            "prior_diabetes": -0.0983,
        },
        "baseline_survival": 0.9776,
        "scale1": -0.5699,
        "scale2": 0.7476,
    },
}


def _score2_risk(features, main, age_interaction, baseline_survival, scale1, scale2):
    """Return the calibrated risk for every row of ``features``.

    Parameters
    ----------
    features : pandas.DataFrame
        Must contain ``n_age`` plus one column per key of ``age_interaction``.
    main : dict
        Coefficient per risk-factor column (including ``n_age``).
    age_interaction : dict
        Coefficient of ``column * n_age`` for every non-age risk factor.
    baseline_survival : float
        Base ``S0`` of the uncalibrated risk ``1 - S0 ** exp(linear predictor)``.
    scale1, scale2 : float
        Intercept and slope applied on the ``log(-log(1 - risk))`` scale.

    Returns
    -------
    pandas.Series
        Calibrated risk, aligned with ``features``.
    """
    n_age = features["n_age"]
    linear_predictor = main["n_age"] * n_age
    for column, interaction in age_interaction.items():
        # beta * x + gamma * x * n_age, factored as (beta + gamma * n_age) * x
        linear_predictor = linear_predictor + (main[column] + interaction * n_age) * features[column]

    # uncalibrated risk
    uncalibrated = 1 - baseline_survival ** np.exp(linear_predictor)
    # calibrated risk
    return 1 - np.exp(-np.exp(scale1 + scale2 * np.log(-np.log(1 - uncalibrated))))


# --- Derived risk factors -------------------------------------------------------------
score["current_smoker"] = score["smoking"] == 2

# Prior diabetes = diabetes flag set, OR a t2d record that is undated or dated before the
# study date. The comparison must be parenthesised: `|` binds tighter than `==`, so the
# unparenthesised form compared `diabetes` against `1 | (...)` and the t2d clause was
# never OR-ed in as intended.
# Assumes score["t2d"] is a clean boolean / 0-1 flag at this point.
has_t2d = score["t2d"].astype(bool)
t2d_not_after_study = score["t2d_date"].isna() | (score["t2d_date"] < score["study_date"])
score["prior_diabetes"] = (score["diabetes"] == 1) | (has_t2d & t2d_not_after_study)

# Centre/scale the continuous inputs the way the coefficients expect.
score["n_age"] = (score["age"] - 60) / 5
score["n_sbp"] = (score["systolic_blood_pressure"] - 120) / 20
score["n_tc"] = score["tc"] - 6
score["n_hdlc"] = (score["hdlc"] - 1.3) / 0.5

# --- Risk per model, then pick the one matching each row's sex --------------------------
for column, params in _SCORE2_PARAMS.items():
    score[column] = _score2_risk(score, **params)

# sex == 1 selects the "_m" model, every other value the "_f" model.
score["y_score"] = np.where(score["sex"] == 1, score["score2_m"], score["score2_f"])
# NOTE(review): 0.5 is a hard-coded decision threshold on the risk — confirm it is intended.
score["y_pred"] = score["y_score"] > 0.5

if not TEST_EXAMPLE:
    save_results(score[['y_score', 'y_pred']], "results/score2")