In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before each cell runs.
%load_ext autoreload
%autoreload 2
# A leading "-" excludes a module from autoreload: these three are never reloaded.
%aimport -raw_data_preprocessing -pandas -numpy

In [None]:
import sys

import pandas as pd
import numpy as np

sys.path.append("../")
sys.path.append("../ukbb_preprocessing/")

from raw_data_preprocessing.raw_data_loader import raw_data_loader
from raw_data_preprocessing.constants import *
from utils import rename_variables, DataRegisterer, percentile, save_results

# Project helper whose `ws` attribute is used below to look up registered data assets.
loader = raw_data_loader()
# NOTE(review): `registerer` is not referenced anywhere else in the visible notebook — confirm it is still needed.
registerer = DataRegisterer()

# The input table is pinned to an explicit asset name and version.
data_asset = loader.ws.data.get(name="clin_ascvd", version="6")

# Read data and preprocessing

In [None]:
# Load the raw table, derive the renamed working frame from it, then index the raw
# frame by IID (in that order: `rename_variables` receives the un-indexed frame).
df = pd.read_csv(data_asset.path)
qrisk_df = rename_variables(df)
df.set_index('IID', inplace=True)

# Variability of systolic blood pressure (SBP): root of the mean squared deviation of the
# two readings p4080_i0_a0 / p4080_i0_a1 from `systolic_blood_pressure`.
# NOTE(review): presumably `systolic_blood_pressure` is the mean of those two readings,
# which would make this their population standard deviation — confirm in rename_variables.
sbp_reference = qrisk_df["systolic_blood_pressure"]
sbp_squared_deviations = [
    (df[reading] - sbp_reference) ** 2
    for reading in ("p4080_i0_a0", "p4080_i0_a1")
]
qrisk_df["sd_sbp"] = np.sqrt((sbp_squared_deviations[0] + sbp_squared_deviations[1]) / 2)

In [None]:
# # test example from the article
# # Table 6 at https://www.bmj.com/content/357/bmj.j2099
# qrisk_df = pd.DataFrame({
#     'IID': [1, 2, 3, 4, 5, 6],
#     # Sex	Male	Male	Male	Female	Female	Male
#     'sex': [1, 1, 1, 0, 0, 1],
#     # Age (years)	44	45	48	55	61	48
#     'age': [44, 45, 48, 55, 61, 48],
#     # Body mass index	27.2	22.4	29.7	24.9	33.7	30
#     'bmi': [27.2, 22.4, 29.7, 24.9, 33.7, 30],
#     # Total cholesterol: HDL cholesterol ratio	6.1	6.3	5	3.2	4.8	4.2
#     'rati': [6.1, 6.3, 5, 3.2, 4.8, 4.2],
#     # Systolic blood pressure	130	115	124	130	155	140
#     'systolic_blood_pressure': [130, 115, 124, 130, 155, 140],
#     # Family history of coronary heart disease	No	No	No	Yes	No	No
#     'sibling_illnesses': ['0', '0', '0', '1', '0', '0'],
#     'mother_illnesses': ['0', '0', '0', '0', '0', '0'],
#     'father_illnesses': ['0', '0', '0', '0', '0', '0'],
#     # Treated hypertension	No 	No 	No 	No 	No 	Yes
#     # Corticosteroid use	No 	No 	Yes	No 	No 	No
#     # Atypical antipsychotic use	No 	No 	No 	No 	Yes 	No
#     # Erectile dysfunction or treatment	No 	Yes	No 	NA	NA	No
#     'medications': [[], [1140869100], [1140853854], [], [1140867420], [1140860332]],
#     # Type 1 diabetes	No	No 	No 	No 	No 	No
#     # Type 2 diabetes	No 	Yes	No 	No 	No 	No
#     # Rheumatoid arthritis	No 	No 	No 	No 	No 	No
#     # Atrial fibrillation	No 	No 	No 	No 	No 	No
#     # Chronic kidney disease (stage 3, 4, or 5)	No 	No 	No 	No 	No 	No
#     # Migraine	Yes	No 	Yes	No 	No 	Yes
#     # Severe mental illness	No 	No 	No 	No 	Yes 	No
#     # Systemic lupus erythematosus	No 	No 	No 	No 	No 	No
#     'icd10_diagnoses': [["G43"], ["E11"], ["G43"], [], ["F20"], ["G43"]],
#     'icd9_diagnoses': [[], [], [], [], [], []],
#     # Standard deviation of systolic blood pressure	6	40	3.1	22	33	No
#     'sd_sbp': [6, 40, 3.1, 22, 33, np.nan],
#     # fields not present in the example
#     'tdi': [0, 0, 0, 0, 0, 0],
# })
# qrisk_df.set_index('IID', inplace=True)

# df = pd.DataFrame({
#     'IID': [1, 2, 3, 4, 5, 6],
#     # Ethnic origin	White	White	White	White	Black African	White
#     'p21000_i0': [1001, 1001, 1001, 1001, 2002, 1001],
#     # Smoking status	Heavy smoker	Non-smoker	Light smoker	Moderate smoker	Former smoker	Non-smoker
#     'p20116_i0': [2, 0, 2, 2, 1, 0],
#     'p3456_i0': [40, 0, 5, 15, 1, 0], # random values within the category
#     # random outcome
#     'ascvd_10yr_label': [1, 0, 0, 1, 0, 0],
# })
# df.set_index('IID', inplace=True)
# # Expected:
# # Model A 10 year predicted risk	9.2	8.3	6.4	11	9.4	9.2
# # Model B 10 year predicted risk	11	9.9	11	10	13	11
# # Model C 10 year predicted risk	11	13	11	11	15	9.5

In [None]:
# smoke_cat
# 0 "Never"
# 1 "Former"
# 2 "Light 1-9/day"
# 3 "Moderate 10-19/day"
# 4 "Heavy >20/day"
smoking_status = df["p20116_i0"]
cigarettes_per_day = df["p3456_i0"]

qrisk_df["smoke_cat"] = np.nan
qrisk_df.loc[smoking_status == 0, "smoke_cat"] = 0

# The remaining rules run in priority order and only fill rows that are still unassigned.
# NOTE(review): the "light" band starts at -10, so every negative code in p3456_i0 from
# -10 upwards is counted as a light smoker — confirm that is intended for all such codes.
for category, rule in (
    (1, smoking_status == 1),
    (2, cigarettes_per_day.between(-10, 9)),
    (3, cigarettes_per_day.between(10, 19)),
    (4, cigarettes_per_day >= 20),
):
    qrisk_df.loc[rule & qrisk_df["smoke_cat"].isna(), "smoke_cat"] = category

In [None]:
# medications
# Code lists per drug class. Each list is passed to `binarize_column` further down, which
# looks for the stringified code inside the `medications` column of `qrisk_df`.
# NOTE(review): presumably UK Biobank treatment/medication codes — confirm the source of each list.

# Codes that set the `b_treatedhyp` flag.
antihypertensives = [
    1140860332, 1140860334, 1140860336, 1140860338, 1140860340, 1140860342,
    1140860348, 1140860352, 1140860356, 1140860358, 1140860362, 1140860380,
    1140860382, 1140860386, 1140860390, 1140860394, 1140860396, 1140860398,
    1140860402, 1140860404, 1140860406, 1140860410, 1140860418, 1140860422,
    1140860426, 1140860434, 1140860454, 1140860470, 1140860478, 1140860492,
    1140860498, 1140860520, 1140860532, 1140860534, 1140860544, 1140860552,
    1140860558, 1140860562, 1140860564, 1140860580, 1140860590, 1140860610,
    1140860628, 1140860632, 1140860638, 1140860654, 1140860658, 1140860690,
    1140860696, 1140860706, 1140860714, 1140860728, 1140860736, 1140860738,
    1140860750, 1140860752, 1140860758, 1140860764, 1140860776, 1140860784,
    1140860790, 1140860802, 1140860806, 1140860828, 1140860830, 1140860834,
    1140860836, 1140860838, 1140860840, 1140860842, 1140860846, 1140860848,
    1140860862, 1140860878, 1140860882, 1140860892, 1140860904, 1140860912,
    1140860918, 1140860938, 1140860942, 1140860952, 1140860954, 1140860966,
    1140860972, 1140860976, 1140860982, 1140860988, 1140860994, 1140861000,
    1140861002, 1140861008, 1140861010, 1140861016, 1140861022, 1140861024,
    1140861034, 1140861046, 1140861068, 1140861070, 1140861088, 1140861090,
    1140861106, 1140861110, 1140861114, 1140861120, 1140861128, 1140861130,
    1140861136, 1140861138, 1140861166, 1140861176, 1140861190, 1140861194,
    1140861202, 1140861266, 1140861268, 1140861276, 1140861282, 1140861326,
    1140861384, 1140864950, 1140864952, 1140866072, 1140866074, 1140866078,
    1140866084, 1140866086, 1140866090, 1140866092, 1140866094, 1140866096,
    1140866102, 1140866104, 1140866108, 1140866110, 1140866116, 1140866122,
    1140866128, 1140866132, 1140866136, 1140866138, 1140866140, 1140866144,
    1140866146, 1140866156, 1140866158, 1140866162, 1140866164, 1140866168,
    1140866182, 1140866192, 1140866194, 1140866200, 1140866202, 1140866206,
    1140866210, 1140866212, 1140866220, 1140866222, 1140866226, 1140866230,
    1140866232, 1140866236, 1140866244, 1140866248, 1140866262, 1140866280,
    1140866282, 1140866306, 1140866308, 1140866312, 1140866318, 1140866324,
    1140866328, 1140866330, 1140866332, 1140866334, 1140866340, 1140866352,
    1140866354, 1140866356, 1140866360, 1140866388, 1140866390, 1140866396,
    1140866400, 1140866402, 1140866404, 1140866406, 1140866408, 1140866410,
    1140866412, 1140866416, 1140866418, 1140866420, 1140866422, 1140866426,
    1140866438, 1140866440, 1140866442, 1140866444, 1140866446, 1140866448,
    1140866450, 1140866460, 1140866466, 1140866484, 1140866506, 1140866546,
    1140866554, 1140866692, 1140866704, 1140866712, 1140866724, 1140866726,
    1140866738, 1140866756, 1140866758, 1140866764, 1140866766, 1140866778,
    1140866782, 1140866784, 1140866798, 1140866800, 1140866802, 1140866804,
    1140875808, 1140879758, 1140879760, 1140879762, 1140879778, 1140879782,
    1140879786, 1140879794, 1140879798, 1140879802, 1140879806, 1140879810,
    1140879818, 1140879822, 1140879824, 1140879826, 1140879830, 1140879834,
    1140879842, 1140879854, 1140879866, 1140888510, 1140888512, 1140888552,
    1140888556, 1140888560, 1140888578, 1140888582, 1140888586, 1140888646,
    1140888686, 1140888760, 1140888762, 1140909368, 1140911698, 1140916356,
    1140916362, 1140917428, 1140923572, 1140923712, 1140923718, 1140926778,
    1140926780, 1141145658, 1141145660, 1141145668, 1141151016, 1141151018,
    1141151382, 1141152600, 1141152998, 1141153006, 1141153026, 1141153032,
    1141153328, 1141156754, 1141156808, 1141156836, 1141156846, 1141157252,
    1141157254, 1141164148, 1141164154, 1141164276, 1141164280, 1141165470,
    1141165476, 1141166006, 1141167822, 1141167832, 1141171152, 1141171336,
    1141171344, 1141172682, 1141172686, 1141172698, 1141173888, 1141180592,
    1141180598, 1141187788, 1141187790, 1141190160, 1141192064, 1141193282,
    1141193346, 1141194794, 1141194800, 1141194804, 1141194808, 1141194810,
    1141201038, 1141201040]
# Codes that set the `b_atypicalantipsy` flag.
antipsychotics = [
    1140867420, 1140867432, 1140867444, 1140927956, 1140927970,
	1140928916, 1141152848, 1141152860, 1141153490, 1141167976,
	1141177762, 1141195974, 1141202024
]
# Codes that set the `erectile_dysfunction` flag.
erectile_dysfunction_drugs = [
    1140869100, 1140883010, 1141168936, 1141168944, 1141168946,
    1141168948, 1141187810, 1141187814, 1141187818, 1141192248, 
    1141192256, 1141192258, 1141192260
]
# Codes that set the `statin` flag. Order matters: the statin at position i is written
# as `statin_category` i + 1 by the loop at the end of this cell.
statins = [1141146234, 1140888594, 1140888648, 1141192410, 1140861958]
# Codes that set the `b_corticosteroids` flag.
steroids = [
    1140853854,	1140854694,	1140854700,	1140854784,	1140854788,	1140854816,
    1140854834,	1140854888,	1140854916,	1140854990,	1140857672,	1140857678,
    1140862572,	1140868364,	1140868370,	1140873620,	1140874790,	1140874792,
    1140874794,	1140874810,	1140874814,	1140874816,	1140874822,	1140874896,
    1140874930,	1140874936,	1140874940,	1140874944,	1140874950,	1140874954,
    1140874956,	1140874976,	1140874978,	1140875668,	1140875684,	1140876032,
    1140876036,	1140876044,	1140876046,	1140876052,	1140876058,	1140876076,
    1140876104,	1140876456,	1140878562,	1140879922,	1140879934,	1140881938,
    1140882152,	1140882622,	1140882624,	1140882626,	1140882630,	1140882694,
    1140882708,	1140882718,	1140882722,	1140882724,	1140882728,	1140882730,
    1140882732,	1140882740,	1140882742,	1140882756,	1140882758,	1140882764,
    1140882766,	1140882768,	1140882774,	1140882776,	1140882778,	1140882780,
    1140882782,	1140882794,	1140882800,	1140882806,	1140882808,	1140882816,
    1140882818,	1140882820,	1140882822,	1140882824,	1140882826,	1140882830,
    1140882832,	1140882836,	1140882840,	1140882842,	1140882844,	1140882846,
    1140882848,	1140882850,	1140882852,	1140882864,	1140882888,	1140882892,
    1140882894,	1140882896,	1140882898,	1140882902,	1140882904,	1140882906,
    1140882908,	1140882910,	1140882914,	1140882916,	1140882918,	1140882920,
    1140882926,	1140882928,	1140882932,	1140882934,	1140882938,	1140883022,
    1140883026,	1140883028,	1140883030,	1140883034,	1140883038,	1140883040,
    1140883044,	1140883048,	1140883052,	1140883054,	1140883056,	1140883058,
    1140883060,	1140883062,	1140883064,	1140884636,	1140884640,	1140884642,
    1140884646,	1140884654,	1140884660,	1140884664,	1140884672,	1140884676,
    1140884696,	1140884700,	1140884704,	1140884716,	1140888074,	1140888092,
    1140888098,	1140888124,	1140888130,	1140888134,	1140888142,	1140888150,
    1140888166,	1140888168,	1140888172,	1140888176,	1140888178,	1140888184,
    1140888194,	1140909786,	1140909894,	1140910424,	1140910634,	1141151424,
    1141157294,	1141157402,	1141157418,	1141162532,	1141164086,	1141167174,
    1141169844,	1141173346,	1141174512,	1141174520,	1141174548,	1141174552,
    1141179072,	1141179982,	1141180342,	1141181062,	1141181554,	1141181610,
    1141189464, 1141191748, 1141194840, 1141195232, 1141195280
]

def binarize_column(values, ori_colname, colname, frame=None):
    """Write a 0/1 flag column marking rows whose entry mentions any of ``values``.

    Parameters
    ----------
    values : iterable
        Codes to look for. Each one is converted with ``str`` before matching.
    ori_colname : str
        Column holding the per-row entry to search.
    colname : str
        Name of the integer (0/1) column to create or overwrite.
    frame : pandas.DataFrame, optional
        Frame to read from and write to. Defaults to the notebook-level
        ``qrisk_df``, which is what existing callers rely on.

    Matching rules per entry
    ------------------------
    * ``str``: substring match, so a code also matches any longer token that
      contains it.
    * list / tuple / set / ndarray: exact membership after stringifying each
      element, so ``[1140869100]`` and ``["1140869100"]`` behave the same.
    * anything else (``None``, ``NaN``, other scalars): treated as "no codes",
      instead of raising ``TypeError`` as the previous implementation did.

    Returns
    -------
    None
        The target frame is modified in place.
    """
    target = qrisk_df if frame is None else frame
    codes = [str(v) for v in values]

    def _mentions_any(entry):
        # One pass per row with short-circuiting, instead of one full pass per code.
        if isinstance(entry, str):
            return any(code in entry for code in codes)
        if isinstance(entry, (list, tuple, set, frozenset, np.ndarray)):
            members = {str(item) for item in entry}
            return any(code in members for code in codes)
        return False

    mask = np.fromiter(
        (_mentions_any(entry) for entry in target[ori_colname].values),
        dtype=bool,
        count=len(target),
    )
    # The mask is positional over `target`, so a positional assignment keeps rows aligned.
    target[colname] = mask.astype("int64")

# One 0/1 flag per drug class, all derived from the `medications` column.
for drug_codes, flag_name in (
    (antihypertensives, 'b_treatedhyp'),
    (antipsychotics, 'b_atypicalantipsy'),
    (erectile_dysfunction_drugs, 'erectile_dysfunction'),
    (statins, 'statin'),
    (steroids, 'b_corticosteroids'),
):
    binarize_column(drug_codes, 'medications', flag_name)

# Statin treatment
# 0 "No Statin"
# 1 "Atorvastatin"
# 2 "Fluvastatin"
# 3 "Pravastatin"
# 4 "Rosuvastatin"
# 5 "Simvastatin"
qrisk_df['statin_category'] = 0

# The category number is the statin's position in `statins` plus one. When an entry
# mentions several statins, the one latest in the list wins because it is written last.
for i, statin in enumerate(statins):
    # Match the stringified code, exactly as `binarize_column` does for the `statin` flag:
    # an int pattern cannot be searched for inside a string entry. `na=False` turns
    # missing entries into "no match" so the result is a clean boolean indexer.
    takes_statin = qrisk_df['medications'].str.contains(str(statin), regex=False, na=False)
    qrisk_df.loc[takes_statin, 'statin_category'] = i + 1

In [None]:
# diseases
# Condition flag -> (ICD-10 codes, ICD-9 codes). Every code is handed to `binarize_column`,
# which searches the `icd10_diagnoses` / `icd9_diagnoses` columns.
# NOTE(review): several 4-digit ICD-9 diabetes codes (e.g. "2500") appear in both the
# type 1 and the type 2 list; if the diagnosis columns are strings, matching is by
# substring and such a code also matches every 5-digit code starting with it — confirm.
DIAGNOSIS_CODES = {
    "b_ra": (["M05"], ["714"]),
    "b_type1": (
        ["E10"],
        ["2500", "25001", "2501", "25011", "25013", "2504",
         "25041", "25043", "2505", "25051", "25053", "2506",
         "25061", "25063", "2507", "25071", "25073", "2509",
         "25091", "25093"],
    ),
    "b_type2": (
        ["E11"],
        ["2500", "25000", "25002", "2501", "25010", "25012",
         "2504", "25040", "25042", "2505", "25050", "25052",
         "2506", "25062", "2507", "25070", "25072", "2509",
         "25090", "25092"],
    ),
    "b_sle": (["M329"], ["710"]),
    "b_AF": (["I48"], ["42731"]),
    "b_renal": (["N183", "N184", "N185"], ["5853", "5854", "5855"]),
    "b_migraine": (["G43"], ["346"]),
    "b_hiv_tochange": (["B20"], ["042", "79571"]),
    "b_impotence2": (["N52"], ["60784"]),
    "b_semi": (
        ["F20", "F23", "F31", "F32", "F33"],
        ["2953", "29581", "29582", "2964", "2965",
         "2967", "2968", "2962", "2963"],
    ),
}

# Per-source flags first (<flag>_icd10, then <flag>_icd9), keeping the column order
# of the previous hand-written version.
for flag, (icd10_codes, _unused_icd9) in DIAGNOSIS_CODES.items():
    binarize_column(icd10_codes, 'icd10_diagnoses', flag + '_icd10')

for flag, (_unused_icd10, icd9_codes) in DIAGNOSIS_CODES.items():
    binarize_column(icd9_codes, 'icd9_diagnoses', flag + '_icd9')

# A condition is present when either coding system reports it.
for flag in DIAGNOSIS_CODES:
    qrisk_df[flag] = (qrisk_df[flag + "_icd9"] + qrisk_df[flag + "_icd10"]).astype(bool).astype(int)

# Erectile dysfunction counts as present on a diagnosis OR on treatment (see the
# "Erectile dysfunction or treatment" row of the test example in this notebook).
# The `erectile_dysfunction` medication flag from the medications cell was previously
# computed but never used.
qrisk_df["b_impotence2"] = (
    qrisk_df["b_impotence2"] + qrisk_df["erectile_dysfunction"]
).astype(bool).astype(int)

In [None]:
# Age & BMI: divide by ten to get the scaled inputs (`dage`, `dbmi`) used by the
# fractional-polynomial terms further down.
for source_col, scaled_col in (("age", "dage"), ("bmi", "dbmi")):
    qrisk_df[scaled_col] = qrisk_df[source_col] / 10

In [None]:
# smoke_cat
# 0 "Never"
# 1 "Former"
# 2 "Light 1-9/day"
# 3 "Moderate 10-19/day"
# 4 "Heavy >20/day"
# `smoke_cat` is already derived by the earlier smoking cell of this notebook; this cell
# used to repeat the exact same statements. Instead of recomputing, show how many rows
# fall into each category (NaN = not assigned) as a sanity check.
qrisk_df["smoke_cat"].value_counts(dropna=False).sort_index()

In [None]:
# Family history flag: 1 when any relative column (sibling, mother, father) mentions
# illness code '1'.
# `na=False` makes a missing entry count as "not reported". Without it, a NaN in one
# relative's column can hide a positive match in another column or make the boolean
# indexer invalid.
# NOTE(review): this is a plain substring match, so '1' also matches inside longer codes
# such as '10' or '-11' if the columns hold raw multi-code strings — confirm the encoding.
family_history = (
    qrisk_df["sibling_illnesses"].str.contains('1', regex=False, na=False)
    | qrisk_df["mother_illnesses"].str.contains('1', regex=False, na=False)
    | qrisk_df["father_illnesses"].str.contains('1', regex=False, na=False)
)
qrisk_df["family_history_cvd"] = family_history.astype("int64")

In [None]:
# ethnicity
# 1 "white"
# 2 "Indian"
# 3 "Pakistani"
# 4 "Bangladeshi"
# 5 "Other Asian"
# 6 "Black Caribbean"
# 7 "Black African"
# 8 "Chinese"
# 9 "Other"
# Model category -> raw p21000_i0 codes.
# Raw code 3003 was previously not mapped at all, so category 4 was never assigned even
# though both the female and the male model carry a coefficient for it.
ETHNICITY_CODES = {
    1: [1.0, 1001, 1002, 1003],
    2: [3001],
    3: [3002],
    4: [3003],
    5: [3.0, 3004, 2003],
    6: [2001, 4001],
    7: [2002, 4002],
    8: [5.0],
    9: [2.0, 2004, 4.0, 6.0, 4003],
}

qrisk_df["ethnicity"] = np.nan
for category, raw_codes in ETHNICITY_CODES.items():
    qrisk_df.loc[df["p21000_i0"].isin(raw_codes), "ethnicity"] = category
# Left unassigned (NaN), as listed by the original author: -1, -3, -4, 1.4, 3.3.
# Rows without a category receive no ethnicity term in the models below, i.e. they are
# scored like category 1.

# Compute QRISK3 female

In [None]:
# Female model inputs: fractional-polynomial terms of scaled age / BMI, with the
# centring constant subtracted in the same step.
scaled_age_f = qrisk_df["dage"]
scaled_bmi_f = qrisk_df["dbmi"]

qrisk_df["age_1_f"] = scaled_age_f**(-2) - 0.053274843841791
qrisk_df["age_2_f"] = scaled_age_f - 4.332503318786621
qrisk_df["bmi_1_f"] = scaled_bmi_f**(-2) - 0.154946178197861
qrisk_df["bmi_2_f"] = scaled_bmi_f**(-2)*np.log(scaled_bmi_f) - 0.144462317228317

# Remaining continuous inputs are only centred: (new column, source column, centre).
for centred_col, source_col, centre in (
    ("rati_f", "rati", 3.476326465606690),
    ("systolic_blood_pressure_f", "systolic_blood_pressure", 123.130012512207030),
    ("sbps5_f", "sd_sbp", 9.002537727355957),
    ("tdi_f", "tdi", 0.392308831214905),
):
    qrisk_df[centred_col] = qrisk_df[source_col] - centre

In [None]:
# ethnicity
# Start the female linear predictor as a float column. It was previously created as the
# integer 0 and then received float coefficients through `.loc[...] +=`, an
# incompatible-dtype assignment that recent pandas versions deprecate.
qrisk_df["qrisk_f"] = 0.0

# Additive term per ethnicity category; category 1 and unassigned rows get none.
FEMALE_ETHNICITY_COEFS = {
    2: 0.28040314332995425,
    3: 0.56298994142075398,
    4: 0.29590000851116516,
    5: 0.072785379877982545,
    6: -0.17072135508857317,
    7: -0.39371043314874971,
    8: -0.32632495283530272,
    9: -0.17127056883241784,
}
for category, coef in FEMALE_ETHNICITY_COEFS.items():
    qrisk_df.loc[qrisk_df["ethnicity"] == category, "qrisk_f"] += coef

In [None]:
# smoke
# Additive term per smoking category; category 0 and unassigned rows get none.
for category, coef in (
    (1, 0.13386833786546262),
    (2, 0.56200858012438537),
    (3, 0.66749593377502547),
    (4, 0.84948177644830847),
):
    qrisk_df.loc[qrisk_df["smoke_cat"] == category, "qrisk_f"] += coef

In [None]:
# continuous variables
# (centred column, coefficient) pairs, added to the linear predictor in this order.
for column, coef in (
    ("age_1_f", -8.1388109247726188),
    ("age_2_f", 0.79733376689699098),
    ("bmi_1_f", 0.29236092275460052),
    ("bmi_2_f", -4.1513300213837665),
    ("rati_f", 0.15338035820802554),
    ("systolic_blood_pressure_f", 0.013131488407103424),
    ("sbps5_f", 0.0078894541014586095),
    ("tdi_f", 0.077223790588590108),
):
    qrisk_df["qrisk_f"] += qrisk_df[column] * coef

In [None]:
# medications and diseases
# (0/1 flag column, coefficient) pairs, added to the linear predictor in this order.
for flag, coef in (
    ("b_AF", 1.5923354969269663),
    ("b_atypicalantipsy", 0.25237642070115557),
    ("b_corticosteroids", 0.59520725304601851),
    ("b_migraine", 0.301267260870345),
    ("b_ra", 0.21364803435181942),
    ("b_renal", 0.65194569493845833),
    ("b_semi", 0.12555308058820178),
    ("b_sle", 0.75880938654267693),
    ("b_treatedhyp", 0.50931593683423004),
    ("b_type1", 1.7267977510537347),
    ("b_type2", 1.0688773244615468),
    ("family_history_cvd", 0.45445319020896213),
):
    qrisk_df["qrisk_f"] += qrisk_df[flag] * coef

In [None]:
# smoking
# Interaction of the first age term with the smoking category: rows in a category get
# age_1_f times that category's coefficient.
for category, coef in (
    (1, -4.7057161785851891000000000),
    (2, -2.7430383403573337000000000),
    (3, -0.8660808882939218200000000),
    (4, 0.9024156236971064800000000),
):
    qrisk_df.loc[qrisk_df["smoke_cat"] == category, "qrisk_f"] += qrisk_df["age_1_f"] * coef

In [None]:
# interaction terms
# Every continuous input in an interaction must be the centred `_f` column, as the male
# section of this notebook does with its `_m` columns. The systolic blood pressure and
# deprivation (tdi) interactions previously multiplied by the raw, uncentred columns.

# age_1_f x covariate
qrisk_df["qrisk_f"] += qrisk_df["age_1_f"] * qrisk_df["b_AF"] * 19.938034889546561
qrisk_df["qrisk_f"] += qrisk_df["age_1_f"] * qrisk_df["b_corticosteroids"] * -0.98408045235936281
qrisk_df["qrisk_f"] += qrisk_df["age_1_f"] * qrisk_df["b_migraine"] * 1.7634979587872999
qrisk_df["qrisk_f"] += qrisk_df["age_1_f"] * qrisk_df["b_renal"] * -3.5874047731694114
qrisk_df["qrisk_f"] += qrisk_df["age_1_f"] * qrisk_df["b_sle"] * 19.690303738638292
qrisk_df["qrisk_f"] += qrisk_df["age_1_f"] * qrisk_df["b_treatedhyp"] * 11.872809733921812
qrisk_df["qrisk_f"] += qrisk_df["age_1_f"] * qrisk_df["b_type1"] * -1.2444332714320747
qrisk_df["qrisk_f"] += qrisk_df["age_1_f"] * qrisk_df["b_type2"] * 6.8652342000009599
qrisk_df["qrisk_f"] += qrisk_df["age_1_f"] * qrisk_df["bmi_1_f"] * 23.802623412141742
qrisk_df["qrisk_f"] += qrisk_df["age_1_f"] * qrisk_df["bmi_2_f"] * -71.184947692087007
qrisk_df["qrisk_f"] += qrisk_df["age_1_f"] * qrisk_df["family_history_cvd"] * 0.99467807940435127
qrisk_df["qrisk_f"] += qrisk_df["age_1_f"] * qrisk_df["systolic_blood_pressure_f"] * 0.034131842338615485
qrisk_df["qrisk_f"] += qrisk_df["age_1_f"] * qrisk_df["tdi_f"] * -1.0301180802035639

# age_2_f x smoking category
qrisk_df.loc[qrisk_df["smoke_cat"] == 1, "qrisk_f"] += qrisk_df["age_2_f"] * -0.075589244643193026
qrisk_df.loc[qrisk_df["smoke_cat"] == 2, "qrisk_f"] += qrisk_df["age_2_f"] * -0.11951192874867074
qrisk_df.loc[qrisk_df["smoke_cat"] == 3, "qrisk_f"] += qrisk_df["age_2_f"] * -0.10366306397571923
qrisk_df.loc[qrisk_df["smoke_cat"] == 4, "qrisk_f"] += qrisk_df["age_2_f"] * -0.13991853591718389

# age_2_f x covariate
qrisk_df["qrisk_f"] += qrisk_df["age_2_f"] * qrisk_df["b_AF"] * -0.076182651011162505
qrisk_df["qrisk_f"] += qrisk_df["age_2_f"] * qrisk_df["b_corticosteroids"] * -0.12005364946742472
qrisk_df["qrisk_f"] += qrisk_df["age_2_f"] * qrisk_df["b_migraine"] * -0.065586917898699859
qrisk_df["qrisk_f"] += qrisk_df["age_2_f"] * qrisk_df["b_renal"] * -0.22688873086442507
qrisk_df["qrisk_f"] += qrisk_df["age_2_f"] * qrisk_df["b_sle"] * 0.077347949679016273
qrisk_df["qrisk_f"] += qrisk_df["age_2_f"] * qrisk_df["b_treatedhyp"] * 0.00096857823588174436
qrisk_df["qrisk_f"] += qrisk_df["age_2_f"] * qrisk_df["b_type1"] * -0.28724064624488949
qrisk_df["qrisk_f"] += qrisk_df["age_2_f"] * qrisk_df["b_type2"] * -0.097112252590695489
qrisk_df["qrisk_f"] += qrisk_df["age_2_f"] * qrisk_df["bmi_1_f"] * 0.52369958933664429
qrisk_df["qrisk_f"] += qrisk_df["age_2_f"] * qrisk_df["bmi_2_f"] * 0.045744190122323759
qrisk_df["qrisk_f"] += qrisk_df["age_2_f"] * qrisk_df["family_history_cvd"] * -0.076885051698423038
qrisk_df["qrisk_f"] += qrisk_df["age_2_f"] * qrisk_df["systolic_blood_pressure_f"] * -0.0015082501423272358
qrisk_df["qrisk_f"] += qrisk_df["age_2_f"] * qrisk_df["tdi_f"] * -0.031593414674962329

In [None]:
# compute risk
# Turn the female linear predictor into a risk: 1 - survivor ** exp(linear predictor).
# The column is overwritten in place, so re-running this cell alone transforms it twice.
survivor = 0.988876402378082
female_linear_predictor = qrisk_df["qrisk_f"]
qrisk_df["qrisk_f"] = 1 - survivor**np.exp(female_linear_predictor)

# Compute QRISK3 male

In [None]:
# Male model inputs: fractional-polynomial terms of scaled age / BMI, with the
# centring constant subtracted in the same step.
scaled_age_m = qrisk_df["dage"]
scaled_bmi_m = qrisk_df["dbmi"]

qrisk_df["age_1_m"] = scaled_age_m**(-1) - 0.234766781330109
qrisk_df["age_2_m"] = scaled_age_m**3 - 77.284080505371094
qrisk_df["bmi_1_m"] = scaled_bmi_m**(-2) - 0.149176135659218
qrisk_df["bmi_2_m"] = scaled_bmi_m**(-2)*np.log(scaled_bmi_m) - 0.141913309693336

# Remaining continuous inputs are only centred: (new column, source column, centre).
for centred_col, source_col, centre in (
    ("rati_m", "rati", 4.300998687744141),
    ("systolic_blood_pressure_m", "systolic_blood_pressure", 128.571578979492190),
    ("sbps5_m", "sd_sbp", 8.756621360778809),
    ("tdi_m", "tdi", 0.526304900646210),
):
    qrisk_df[centred_col] = qrisk_df[source_col] - centre

In [None]:
# ethnicity
# Start the male linear predictor as a float column. It was previously created as the
# integer 0 and then received float coefficients through `.loc[...] +=`, an
# incompatible-dtype assignment that recent pandas versions deprecate.
qrisk_df["qrisk_m"] = 0.0

# Additive term per ethnicity category; category 1 and unassigned rows get none.
MALE_ETHNICITY_COEFS = {
    2: 0.27719248760308279,
    3: 0.47446360714931268,
    4: 0.52961729919689371,
    5: 0.035100159186299017,
    6: -0.35807899669327919,
    7: -0.4005648523216514,
    8: -0.41522792889830173,
    9: -0.26321348134749967,
}
for category, coef in MALE_ETHNICITY_COEFS.items():
    qrisk_df.loc[qrisk_df["ethnicity"] == category, "qrisk_m"] += coef

In [None]:
# smoking
# Additive term per smoking category; category 0 and unassigned rows get none.
for category, coef in (
    (1, 0.19128222863388983),
    (2, 0.55241588192645552),
    (3, 0.63835053027506072),
    (4, 0.78983819881858019),
):
    qrisk_df.loc[qrisk_df["smoke_cat"] == category, "qrisk_m"] += coef

In [None]:
# continuous variables
# (centred column, coefficient) pairs, added to the linear predictor in this order.
for column, coef in (
    ("age_1_m", -17.839781666005575),
    ("age_2_m", 0.0022964880605765492),
    ("bmi_1_m", 2.4562776660536358),
    ("bmi_2_m", -8.3011122314711354),
    ("rati_m", 0.17340196856327111),
    ("systolic_blood_pressure_m", 0.012910126542553305),
    ("sbps5_m", 0.010251914291290456),
    ("tdi_m", 0.033268201277287295),
):
    qrisk_df["qrisk_m"] += qrisk_df[column] * coef

In [None]:
# medications and diseases
# (0/1 flag column, coefficient) pairs, added to the linear predictor in this order.
for flag, coef in (
    ("b_AF", 0.88209236928054657),
    ("b_atypicalantipsy", 0.13046879855173513),
    ("b_corticosteroids", 0.45485399750445543),
    ("b_impotence2", 0.22251859086705383),
    ("b_migraine", 0.25584178074159913),
    ("b_ra", 0.20970658013956567),
    ("b_renal", 0.71853261288274384),
    ("b_semi", 0.12133039882047164),
    ("b_sle", 0.4401572174457522),
    ("b_treatedhyp", 0.51659871082695474),
    ("b_type1", 1.2343425521675175),
    ("b_type2", 0.85942071430932221),
    ("family_history_cvd", 0.54055469009390156),
):
    qrisk_df["qrisk_m"] += qrisk_df[flag] * coef

In [None]:
# interactions
# For each age term: first the smoking-category interactions (applied only to rows in
# that category), then the age x covariate products. Order of additions is unchanged:
# age_1_m smoking, age_1_m covariates, age_2_m smoking, age_2_m covariates.
MALE_INTERACTIONS = (
    (
        "age_1_m",
        (
            (1, -0.21011133933516346),
            (2, 0.7526867644750319),
            (3, 0.9931588755640579),
            (4, 2.1331163414389076),
        ),
        (
            ("b_AF", 3.4896675530623207),
            ("b_corticosteroids", 1.1708133653489108),
            ("b_impotence2", -1.506400985745431),
            ("b_migraine", 2.349115987140244),
            ("b_renal", -0.50656716327223694),
            ("b_treatedhyp", 6.511458109853267),
            ("b_type1", 5.337986487800653),
            ("b_type2", 3.646181740622131),
            ("bmi_1_m", 31.004952956033886),
            ("bmi_2_m", -111.29157184391643),
            ("family_history_cvd", 2.7808628508531887),
            ("systolic_blood_pressure_m", 0.018858524469865853),
            ("tdi_m", -0.1007554870063731),
        ),
    ),
    (
        "age_2_m",
        (
            (1, -0.00049854870275326121),
            (2, -0.00079875633317385414),
            (3, -0.00083706184266251296),
            (4, -0.00078400319155637289),
        ),
        (
            ("b_AF", -0.00034995608340636049),
            ("b_corticosteroids", -0.0002496045095297166),
            ("b_impotence2", -0.0011058218441227373),
            ("b_migraine", 0.00019896446041478631),
            ("b_renal", -0.0018325930166498813),
            ("b_treatedhyp", 0.00063838053104165013),
            ("b_type1", 0.0006409780808752897),
            ("b_type2", -0.00024695695588868315),
            ("bmi_1_m", 0.0050380102356322029),
            ("bmi_2_m", -0.013074483002524319),
            ("family_history_cvd", -0.00024791809907396037),
            ("systolic_blood_pressure_m", -0.00001271874191588457),
            ("tdi_m", -0.000093299642323272888),
        ),
    ),
)

for age_term, smoking_coefs, covariate_coefs in MALE_INTERACTIONS:
    for category, coef in smoking_coefs:
        qrisk_df.loc[qrisk_df["smoke_cat"] == category, "qrisk_m"] += qrisk_df[age_term] * coef
    for covariate, coef in covariate_coefs:
        qrisk_df["qrisk_m"] += qrisk_df[age_term] * qrisk_df[covariate] * coef

In [None]:
# compute risk
# Turn the male linear predictor into a risk: 1 - survivor ** exp(linear predictor).
# The column is overwritten in place, so re-running this cell alone transforms it twice.
survivor = 0.977268040180206
male_linear_predictor = qrisk_df["qrisk_m"]
qrisk_df["qrisk_m"] = 1 - survivor**np.exp(male_linear_predictor)

## Final QRISK3

In [None]:
# Pick the sex-specific risk: rows with sex == 1 take the male score, every other row
# (including rows with a missing sex) takes the female score.
is_male = qrisk_df['sex'] == 1
qrisk_df['y_score'] = np.where(is_male, qrisk_df["qrisk_m"], qrisk_df["qrisk_f"])
# Independent copy holding only what the QC and save steps need.
score = qrisk_df.loc[:, ['sex', 'y_score']].copy()

# QC

In [None]:
# compute auc
from sklearn.metrics import roc_auc_score

# Labels come from the raw frame and scores from `score`; both are taken as plain arrays.
# NOTE(review): the two arrays are paired by position, which assumes `df` and `score`
# list participants in the same order — confirm.
y_true = df["ascvd_10yr_label"].values
y_score = score["y_score"].values

# Keep only rows where both the label and the score are present.
mask = ~(np.isnan(y_true) | np.isnan(y_score))
y_true, y_score = y_true[mask], y_score[mask]

# AUC noted by the author from an earlier run: 0.7330483222312824
roc_auc_score(y_true, y_score)

# Save

In [None]:
# Per-sex decision threshold, aggregated with the project helper `percentile(0.95)`
# (presumably the 95th percentile of y_score within each sex — confirm in utils).
thresholds = (
    score[['sex', 'y_score']]
    .groupby("sex")
    .aggregate(percentile(0.95))
    .reset_index()
)
thresholds.columns = ["sex", "threshold"]

# Attach each row's sex-specific threshold. The merge drops the index, so IID is moved
# to a column first and restored afterwards.
score = (
    score.reset_index()
    .merge(thresholds, on="sex", how="left")
    .set_index('IID')
)
# Binary prediction: 1 when the score is strictly above the threshold.
score["y_pred"] = (score["y_score"] > score["threshold"]).astype(int)

score = score[['y_score', 'y_pred']]
save_results(score, "results/qrisk3")