In [None]:
# Enable IPython's autoreload extension in mode 2, so that edited modules are
# re-imported before each cell runs instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2
# The "-name" form asks autoreload to skip the named module.
# NOTE(review): IPython documents multiple modules as a comma-separated list;
# confirm that this space-separated form is honoured by the installed version.
%aimport -raw_data_preprocessing -pandas -numpy

In [None]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

sys.path.append("../")
sys.path.append("../ukbb_preprocessing/")

from raw_data_preprocessing.raw_data_loader import raw_data_loader
from raw_data_preprocessing.constants import *
from utils import rename_variables, DataRegisterer, percentile

# Project helpers used by the cells below. `registerer` is instantiated here
# but is not referenced anywhere in the visible cells.
loader = raw_data_loader()
registerer = DataRegisterer()

# Pinned name/version of the data asset that holds the input table.
CLIN_ASCVD_ASSET = {"name": "clin_ascvd", "version": "6"}

# Resolve the asset through the loader's workspace handle and read it as CSV.
data_asset = loader.ws.data.get(**CLIN_ASCVD_ASSET)
df = pd.read_csv(data_asset.path)

In [None]:
# Compute a sex-specific linear risk score per participant, flag the top 5%
# within each sex as predicted positives, and save the result.

# Coefficients of the linear predictor, keyed by term. "*_x_age" entries are
# the interactions with the transformed age.
# NOTE(review): the values under SCORE2_COEF_M look like the published SCORE2
# coefficients for women and those under SCORE2_COEF_F like the ones for men.
# The original assignment is preserved here; confirm against the sex coding
# produced by rename_variables (y_score uses the "_m" score where sex == 1).
SCORE2_COEF_M = {
    "age": 0.4648,
    "smoker": 0.7744,
    "sbp": 0.3131,
    "diabetes": 0.8096,
    "tc": 0.1002,
    "hdlc": -0.2606,
    "smoker_x_age": -0.1088,
    "sbp_x_age": -0.0277,
    "tc_x_age": -0.0226,
    "hdlc_x_age": 0.0613,
    "diabetes_x_age": -0.1272,
}
SCORE2_COEF_F = {
    "age": 0.3742,
    "smoker": 0.6012,
    "sbp": 0.2777,
    "diabetes": 0.6457,
    "tc": 0.1458,
    "hdlc": -0.2698,
    "smoker_x_age": -0.0755,
    "sbp_x_age": -0.0255,
    "tc_x_age": -0.0281,
    "hdlc_x_age": 0.0426,
    "diabetes_x_age": -0.0983,
}


def _score2_linear_predictor(frame, coef):
    """Return the linear predictor for every row of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns ``t_age``, ``t_sbp``, ``current_smoker``,
        ``prior_diabetes``, ``tc`` and ``hdlc``.
    coef : dict
        Mapping from term name to coefficient (see ``SCORE2_COEF_M``).

    Returns
    -------
    pandas.Series
        Sum of main effects and age interactions, aligned with ``frame``.
    """
    t_age = frame["t_age"]
    # Terms are summed in the same order as the original hand-written formula
    # so that floating-point results are unchanged.
    return (
        coef["age"] * t_age
        + coef["smoker"] * frame["current_smoker"]
        + coef["sbp"] * frame["t_sbp"]
        + coef["diabetes"] * frame["prior_diabetes"]
        + coef["tc"] * frame["tc"]
        + coef["hdlc"] * frame["hdlc"]
        + coef["smoker_x_age"] * t_age * frame["current_smoker"]
        + coef["sbp_x_age"] * frame["t_sbp"] * t_age
        + coef["tc_x_age"] * frame["tc"] * t_age
        + coef["hdlc_x_age"] * frame["hdlc"] * t_age
        + coef["diabetes_x_age"] * frame["prior_diabetes"] * t_age
    )


score = rename_variables(df)

# Attach the type-2-diabetes table to every participant.
t2d = loader._load_t2d()
score = score.merge(t2d, on="IID", how="left")
# Rows without a match come back from the left merge as NaN, and
# NaN.astype(bool) is True. Treat a missing value as "no T2D" instead.
score["t2d"] = score["t2d"].notna() & score["t2d"].astype(bool)
score["t2d_date"] = pd.to_datetime(score["t2d_date"], format="%Y-%m-%d")

score["current_smoker"] = score["smoking"] == 2
# Diabetes counts as prior if it is recorded directly, or if the T2D record
# has no date or a date before the study date.
# The comparison is parenthesised because `|` binds tighter than `==`;
# without the parentheses this evaluated `diabetes == (1 | (...))`.
t2d_before_study = score["t2d"] & (
    score["t2d_date"].isna() | (score["t2d_date"] < score["study_date"])
)
score["prior_diabetes"] = (score["diabetes"] == 1) | t2d_before_study

# Centre and scale age and systolic blood pressure.
# NOTE(review): tc and hdlc enter the formula untransformed; confirm whether
# rename_variables already centres/scales them as the risk model expects.
score["t_age"] = (score["age"] - 60) / 5
score["t_sbp"] = (score["systolic_blood_pressure"] - 120) / 20

score["score2_m"] = _score2_linear_predictor(score, SCORE2_COEF_M)
score["score2_f"] = _score2_linear_predictor(score, SCORE2_COEF_F)

# Pick the sex-specific score for each row.
score["y_score"] = np.where(score["sex"] == 1, score["score2_m"], score["score2_f"])

# Compute the top quantile by sex as threshold.
# `percentile` comes from utils; presumably percentile(0.95) yields the 95th
# percentile aggregator (verify against its definition).
thresholds = score[["sex", "y_score"]].groupby("sex").aggregate(percentile(0.95))
thresholds = thresholds.reset_index()
thresholds.columns = ["sex", "threshold"]

score = score.reset_index().merge(thresholds, on="sex", how="left").set_index("IID")
score["y_pred"] = score["y_score"] > score["threshold"]

# NOTE(review): save_results is not imported in the visible cells; confirm it
# is provided by the star import from raw_data_preprocessing.constants.
save_results(score, "results/score2")

In [None]:
# Sanity check: compare the freshly computed y_score with the `score2` column
# of a previously exported score table.
old_scores = pd.read_csv('clinical_scores.tsv', sep='\t')
# Copy `eid` into `IID` and index by it so both frames join on the same key.
old_scores = old_scores.assign(IID=old_scores['eid']).set_index('IID')
merged = old_scores.merge(score, on='IID', how='left', suffixes=('_old', '_new'))

# Old score on x, new score on y, coloured by the new y_pred flag.
fig, ax = plt.subplots()
ax.scatter(merged['score2'], merged['y_score'], c=merged["y_pred"])
ax.set_xlabel('old')
ax.set_ylabel('new')
plt.show()

# The two versions must be almost perfectly correlated.
old_new_corr = merged['score2'].corr(merged['y_score'])
assert old_new_corr > 0.99