## **Machine Learning-Based Prediction of Cardiovascular Disease Risk Using Lifestyle Factors**
by Le Ngoc Uyen Phuong (ITDSIU20079)

# **0. Libraries Import**

In [1]:
import os
os.chdir('./DATA/NHANES/')

In [3]:
import pandas as pd
import numpy as np
from tabulate import tabulate

# **1. Data Collection**

## **1.1. Demographics**


In [4]:
# Demographics dataset import
demo_1112 = pd.read_sas('DEMO_G.XPT')
demo_1314 = pd.read_sas('DEMO_H.XPT')
demo_1516 = pd.read_sas('DEMO_I.XPT')
demo_1718 = pd.read_sas('DEMO_J.XPT')
demo_1720 = pd.read_sas('P_DEMO.XPT')
demo = pd.concat([demo_1112, demo_1314, demo_1516, demo_1718, demo_1720], ignore_index=True)
print('Duplicated data points:', demo[demo.duplicated()])
print('\nDemographics dataset:')
demo

Duplicated data points: Empty DataFrame
Columns: [SEQN, SDDSRVYR, RIDSTATR, RIAGENDR, RIDAGEYR, RIDAGEMN, RIDRETH1, RIDRETH3, RIDEXMON, RIDEXAGY, RIDEXAGM, DMQMILIZ, DMQADFC, DMDBORN4, DMDCITZN, DMDYRSUS, DMDEDUC3, DMDEDUC2, DMDMARTL, RIDEXPRG, SIALANG, SIAPROXY, SIAINTRP, FIALANG, FIAPROXY, FIAINTRP, MIALANG, MIAPROXY, MIAINTRP, AIALANGA, WTINT2YR, WTMEC2YR, SDMVPSU, SDMVSTRA, INDHHIN2, INDFMIN2, INDFMPIR, DMDHHSIZ, DMDFMSIZ, DMDHHSZA, DMDHHSZB, DMDHHSZE, DMDHRGND, DMDHRAGE, DMDHRBR4, DMDHREDU, DMDHRMAR, DMDHSEDU, DMDHRAGZ, DMDHREDZ, DMDHRMAZ, DMDHSEDZ, DMDYRUSZ, DMDMARTZ, WTINTPRP, WTMECPRP]
Index: []

[0 rows x 56 columns]

Demographics dataset:


Unnamed: 0,SEQN,SDDSRVYR,RIDSTATR,RIAGENDR,RIDAGEYR,RIDAGEMN,RIDRETH1,RIDRETH3,RIDEXMON,RIDEXAGY,...,DMDHRMAR,DMDHSEDU,DMDHRAGZ,DMDHREDZ,DMDHRMAZ,DMDHSEDZ,DMDYRUSZ,DMDMARTZ,WTINTPRP,WTMECPRP
0,62161.0,7.0,2.0,1.0,22.0,,3.0,3.0,2.0,,...,1.0,5.0,,,,,,,,
1,62162.0,7.0,2.0,2.0,3.0,,1.0,1.0,1.0,3.0,...,6.0,,,,,,,,,
2,62163.0,7.0,2.0,1.0,14.0,,5.0,6.0,2.0,14.0,...,1.0,4.0,,,,,,,,
3,62164.0,7.0,2.0,2.0,44.0,,3.0,3.0,1.0,,...,1.0,4.0,,,,,,,,
4,62165.0,7.0,2.0,2.0,14.0,,4.0,4.0,2.0,14.0,...,77.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54711,124818.0,66.0,2.0,1.0,40.0,,4.0,4.0,1.0,,...,,,,,,,,1.0,21586.596728,21666.889837
54712,124819.0,66.0,2.0,1.0,2.0,,4.0,4.0,2.0,,...,,,,,,,,,1664.919253,1838.169709
54713,124820.0,66.0,2.0,2.0,7.0,,3.0,3.0,2.0,,...,,,,,,,,,14819.783161,16497.806674
54714,124821.0,66.0,2.0,1.0,63.0,,4.0,4.0,1.0,,...,,,,,,,,2.0,4666.817952,4853.430230


## **1.2. Examination**

In [5]:
# Blood Pressure
BPX_G = pd.read_sas('BPX_G.XPT')
BPX_H = pd.read_sas('BPX_H.XPT')
BPX_I = pd.read_sas('BPX_I.XPT')
P_BPXO = pd.read_sas('P_BPXO.XPT')
BPX_J = pd.read_sas('BPX_J.XPT')
BPXO_J = pd.read_sas('BPXO_J.XPT')
BPX_J = pd.merge(BPX_J, BPXO_J, how='left', on='SEQN')

bp = pd.concat([BPX_G, BPX_H, BPX_I, BPX_J, P_BPXO], ignore_index=True)

# Body Measures
bmx_1112= pd.read_sas('BMX_G.XPT')
bmx_1314= pd.read_sas('BMX_H.XPT')
bmx_1516= pd.read_sas('BMX_I.XPT')
bmx_1718= pd.read_sas('BMX_J.XPT')
bmx_1720= pd.read_sas('P_BMX.XPT')
bmx = pd.concat([bmx_1112, bmx_1314, bmx_1516, bmx_1718, bmx_1720], ignore_index=True)

# Examination dataset
exam = pd.merge(bp,bmx,how='right',on='SEQN')
print('Duplicated data points:', exam[exam.duplicated()])
print('\nExamination dataset:')
exam

Duplicated data points: Empty DataFrame
Columns: [SEQN, PEASCST1, PEASCTM1, PEASCCT1, BPXCHR, BPQ150A, BPQ150B, BPQ150C, BPQ150D, BPAARM, BPACSZ, BPXPLS, BPXPULS, BPXPTY, BPXML1, BPXSY1, BPXDI1, BPAEN1, BPXSY2, BPXDI2, BPAEN2, BPXSY3, BPXDI3, BPAEN3, BPXSY4, BPXDI4, BPAEN4, BPAOARM, BPAOCSZ, BPAOMNTS, BPXOSY1, BPXODI1, BPXOSY2, BPXODI2, BPXOSY3, BPXODI3, BPXOPLS1, BPXOPLS2, BPXOPLS3, BMDSTATS, BMXWT, BMIWT, BMXRECUM, BMIRECUM, BMXHEAD, BMIHEAD, BMXHT, BMIHT, BMXBMI, BMDBMIC, BMXLEG, BMILEG, BMXARML, BMIARML, BMXARMC, BMIARMC, BMXWAIST, BMIWAIST, BMXSAD1, BMXSAD2, BMXSAD3, BMXSAD4, BMDAVSAD, BMDSADCM, BMXHIP, BMIHIP]
Index: []

[0 rows x 66 columns]

Examination dataset:


Unnamed: 0,SEQN,PEASCST1,PEASCTM1,PEASCCT1,BPXCHR,BPQ150A,BPQ150B,BPQ150C,BPQ150D,BPAARM,...,BMXWAIST,BMIWAIST,BMXSAD1,BMXSAD2,BMXSAD3,BMXSAD4,BMDAVSAD,BMDSADCM,BMXHIP,BMIHIP
0,62161.0,1.0,596.0,,,2.0,2.0,2.0,2.0,1.0,...,81.0,,17.7,17.9,,,17.8,,,
1,62162.0,1.0,64.0,,100.0,,,,,,...,45.4,,,,,,,,,
2,62163.0,1.0,788.0,,,1.0,2.0,2.0,2.0,1.0,...,64.6,,15.6,15.5,,,15.6,,,
3,62164.0,1.0,527.0,,,1.0,2.0,2.0,2.0,1.0,...,80.1,,18.3,18.5,,,18.4,,,
4,62165.0,1.0,468.0,,,2.0,2.0,2.0,2.0,1.0,...,86.7,,21.0,20.8,,,20.9,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51694,124818.0,,,,,,,,,,...,114.7,,,,,,,,118.0,
51695,124819.0,,,,,,,,,,...,48.4,,,,,,,,,
51696,124820.0,,,,,,,,,,...,57.5,,,,,,,,,
51697,124821.0,,,,,,,,,,...,97.1,,,,,,,,99.8,


## **1.3. Laboratory**

In [6]:
# Fasting Questionnaire
FASTQX_G = pd.read_sas('FASTQX_G.XPT')
FASTQX_H = pd.read_sas('FASTQX_H.XPT')
FASTQX_I = pd.read_sas('FASTQX_I.XPT')
FASTQX_J = pd.read_sas('FASTQX_J.XPT')
P_FASTQX = pd.read_sas('P_FASTQX.XPT')
f = pd.concat([FASTQX_G, FASTQX_H, FASTQX_I, FASTQX_J, P_FASTQX], ignore_index=True)

# Standard Biochemistry Profile
BIOPRO_G = pd.read_sas('BIOPRO_G.XPT')
BIOPRO_H = pd.read_sas('BIOPRO_H.XPT')
BIOPRO_I = pd.read_sas('BIOPRO_I.XPT')
BIOPRO_J = pd.read_sas('BIOPRO_J.XPT')
P_BIOPRO = pd.read_sas('P_BIOPRO.XPT')
b = pd.concat([BIOPRO_G, BIOPRO_H, BIOPRO_I, BIOPRO_J, P_BIOPRO], ignore_index=True)

# Cholesterol - Total
TCHOL_G = pd.read_sas('TCHOL_G.XPT')
TCHOL_H = pd.read_sas('TCHOL_H.XPT')
TCHOL_I = pd.read_sas('TCHOL_I.XPT')
TCHOL_J = pd.read_sas('TCHOL_J.XPT')
P_TCHOL = pd.read_sas('P_TCHOL.XPT')
tchol = pd.concat([TCHOL_G, TCHOL_H, TCHOL_I, TCHOL_J, P_TCHOL], ignore_index=True)

# Cholesterol - HDL
HDL_G = pd.read_sas('HDL_G.XPT')
HDL_H = pd.read_sas('HDL_H.XPT')
HDL_I = pd.read_sas('HDL_I.XPT')
HDL_J = pd.read_sas('HDL_J.XPT')
P_HDL = pd.read_sas('P_HDL.XPT')
hdl = pd.concat([HDL_G, HDL_H, HDL_I, HDL_J, P_HDL], ignore_index=True)

# Cholesterol - LDL & Triglycerides
TRIGLY_G = pd.read_sas('TRIGLY_G.XPT')
TRIGLY_H = pd.read_sas('TRIGLY_H.XPT')
TRIGLY_I = pd.read_sas('TRIGLY_I.XPT')
TRIGLY_J = pd.read_sas('TRIGLY_J.XPT')
P_TRIGLY = pd.read_sas('P_TRIGLY.XPT')
trigly = pd.concat([TRIGLY_G, TRIGLY_H, TRIGLY_I, TRIGLY_J, P_TRIGLY], ignore_index=True)

# Laboratory dataset
lab = pd.merge(pd.merge(pd.merge(pd.merge(tchol,hdl,how='inner',on='SEQN'), trigly, how='left', on='SEQN'), b,how='left', on='SEQN'), f,how='left', on='SEQN')
print('Duplicated data points:', lab[lab.duplicated()])
print('\nLaboratory dataset:')
lab

Duplicated data points: Empty DataFrame
Columns: [SEQN, LBXTC, LBDTCSI, LBDHDD, LBDHDDSI, WTSAF2YR, LBXTR, LBDTRSI, LBDLDL, LBDLDLSI, LBDLDLM, LBDLDMSI, LBDLDLN, LBDLDNSI, WTSAFPRP, LBXSAL, LBDSALSI, LBXSATSI, LBXSASSI, LBXSAPSI, LBXSBU, LBDSBUSI, LBXSCA, LBDSCASI, LBXSCK, LBXSCH, LBDSCHSI, LBXSC3SI, LBXSCR, LBDSCRSI, LBXSGTSI, LBXSGL, LBDSGLSI, LBXSIR, LBDSIRSI, LBXSLDSI, LBXSPH, LBDSPHSI, LBXSTB, LBDSTBSI, LBXSTP, LBDSTPSI, LBXSUA, LBDSUASI, LBXSNASI, LBXSKSI, LBXSCLSI, LBXSOSSI, LBXSGB, LBDSGBSI, LBXSTR, LBDSTRSI, LBDSATLC, LBDSGTLC, LBDSTBLC, PHQ020, PHACOFHR, PHACOFMN, PHQ030, PHAALCHR, PHAALCMN, PHQ040, PHAGUMHR, PHAGUMMN, PHQ050, PHAANTHR, PHAANTMN, PHQ060, PHASUPHR, PHASUPMN, PHAFSTHR, PHAFSTMN, PHDSESN]
Index: []

[0 rows x 73 columns]

Laboratory dataset:


Unnamed: 0,SEQN,LBXTC,LBDTCSI,LBDHDD,LBDHDDSI,WTSAF2YR,LBXTR,LBDTRSI,LBDLDL,LBDLDLSI,...,PHAGUMMN,PHQ050,PHAANTHR,PHAANTMN,PHQ060,PHASUPHR,PHASUPMN,PHAFSTHR,PHAFSTMN,PHDSESN
0,62161.0,168.0,4.34,41.0,1.06,240011.713,84.0,0.948,110.0,2.845,...,,2.0,,,2.0,,,14.0,37.0,5.397605e-79
1,62163.0,154.0,3.98,44.0,1.14,,,,,,...,,2.0,,,2.0,,,17.0,55.0,1.000000e+00
2,62164.0,190.0,4.91,28.0,0.72,288182.780,56.0,0.632,151.0,3.905,...,,2.0,,,2.0,,,11.0,6.0,5.397605e-79
3,62165.0,161.0,4.16,63.0,1.63,32747.025,71.0,0.802,84.0,2.172,...,,2.0,,,2.0,,,12.0,11.0,5.397605e-79
4,62166.0,,,,,,,,,,...,,2.0,,,2.0,,,1.0,10.0,5.397605e-79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43761,124817.0,200.0,5.17,60.0,1.55,,,,,,...,,2.0,,,2.0,,,7.0,40.0,2.000000e+00
43762,124818.0,234.0,6.05,50.0,1.29,,,,,,...,,2.0,,,2.0,,,6.0,4.0,1.000000e+00
43763,124820.0,179.0,4.63,64.0,1.66,,,,,,...,,2.0,,,2.0,,,4.0,42.0,1.000000e+00
43764,124821.0,155.0,4.01,44.0,1.14,,51.0,0.576,101.0,2.612,...,,2.0,,,2.0,,,10.0,8.0,5.397605e-79


## **1.4. Questionnaire**

In [7]:
# Alcohol use
ALQ_G = pd.read_sas('ALQ_G.XPT')
ALQ_H = pd.read_sas('ALQ_H.XPT')
ALQ_I = pd.read_sas('ALQ_I.XPT')
ALQ_J = pd.read_sas('ALQ_J.XPT')
P_ALQ = pd.read_sas('P_ALQ.XPT')
al = pd.concat([ALQ_G, ALQ_H, ALQ_I, ALQ_J, P_ALQ], ignore_index=True)

# Blood Pressure & Cholesterol
BPQ_G = pd.read_sas('BPQ_G.XPT')
BPQ_H = pd.read_sas('BPQ_H.XPT')
BPQ_I = pd.read_sas('BPQ_I.XPT')
BPQ_J = pd.read_sas('BPQ_J.XPT')
P_BPQ = pd.read_sas('P_BPQ.XPT')
bc = pd.concat([BPQ_G, BPQ_H, BPQ_I, BPQ_J, P_BPQ], ignore_index=True)

# Cardiovascular Health
CDQ_G = pd.read_sas('CDQ_G.XPT')
CDQ_H = pd.read_sas('CDQ_H.XPT')
CDQ_I = pd.read_sas('CDQ_I.XPT')
CDQ_J = pd.read_sas('CDQ_J.XPT')
P_CDQ = pd.read_sas('P_CDQ.XPT')
cdq = pd.concat([CDQ_G, CDQ_H, CDQ_I, CDQ_J, P_CDQ], ignore_index=True)

# Medical Conditions
MCQ_G = pd.read_sas('MCQ_G.XPT')
MCQ_H = pd.read_sas('MCQ_H.XPT')
MCQ_I = pd.read_sas('MCQ_I.XPT')
MCQ_J = pd.read_sas('MCQ_J.XPT')
P_MCQ = pd.read_sas('P_MCQ.XPT')
mcq = pd.concat([MCQ_G, MCQ_H, MCQ_I, MCQ_J, P_MCQ], ignore_index=True)

# Questionnaire dataset
ques = pd.merge(pd.merge(pd.merge(mcq, bc, how='left', on='SEQN'), cdq, how='left', on='SEQN'), al, how='left', on='SEQN')
print('Duplicated data points:', ques[ques.duplicated()])
print('\nQuestionnaire dataset:')
ques

Duplicated data points: Empty DataFrame
Columns: [SEQN, MCQ010, MCQ025, MCQ035, MCQ040, MCQ050, MCQ051, MCQ053, MCQ070, MCQ075, MCQ080, MCQ082, MCQ084, MCQ086, MCQ092, MCD093, MCQ140, MCQ149, MCQ160A, MCQ180A, MCQ195, MCQ160N, MCQ180N, MCQ160B, MCQ180B, MCQ160C, MCQ180C, MCQ160D, MCQ180D, MCQ160E, MCQ180E, MCQ160F, MCQ180F, MCQ160G, MCQ180G, MCQ160M, MCQ170M, MCQ180M, MCQ160K, MCQ170K, MCQ180K, MCQ160L, MCQ170L, MCQ180L, MCQ220, MCQ230A, MCQ230B, MCQ230C, MCQ230D, MCQ240A, MCQ240AA, MCQ240B, MCQ240BB, MCQ240C, MCQ240CC, MCQ240D, MCQ240DD, MCQ240DK, MCQ240E, MCQ240F, MCQ240G, MCQ240H, MCQ240I, MCQ240J, MCQ240K, MCQ240L, MCQ240M, MCQ240N, MCQ240O, MCQ240P, MCQ240Q, MCQ240R, MCQ240S, MCQ240T, MCQ240U, MCQ240V, MCQ240W, MCQ240X, MCQ240Y, MCQ240Z, MCQ300A, MCQ300B, MCQ300C, MCQ365A, MCQ365B, MCQ365C, MCQ365D, MCQ370A, MCQ370B, MCQ370C, MCQ370D, MCQ380, AGQ030, MCQ151, MCQ160O, MCQ203, MCQ206, OSQ230, RHD018, MCD180A, ...]
Index: []

[0 rows x 182 columns]

Questionnaire dataset:


Unnamed: 0,SEQN,MCQ010,MCQ025,MCQ035,MCQ040,MCQ050,MCQ051,MCQ053,MCQ070,MCQ075,...,ALQ151,ALQ155,ALQ160,ALQ111,ALQ121,ALQ142,ALQ270,ALQ280,ALQ290,ALQ170
0,62161.0,2.0,,,,,,2.0,2.0,,...,,,,,,,,,,
1,62162.0,2.0,,,,,,2.0,,,...,,,,,,,,,,
2,62163.0,2.0,,,,,,2.0,,,...,,,,,,,,,,
3,62164.0,2.0,,,,,,2.0,2.0,,...,,,,,,,,,,
4,62165.0,2.0,,,,,,2.0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52587,124818.0,2.0,,,,,,2.0,,,...,2.0,,,1.0,9.0,5.397605e-79,,,,5.397605e-79
52588,124819.0,2.0,,,,,,2.0,,,...,,,,,,,,,,
52589,124820.0,2.0,,,,,,2.0,,,...,,,,,,,,,,
52590,124821.0,1.0,58.0,1.0,2.0,2.0,,2.0,,,...,2.0,,,1.0,5.0,7.000000e+00,5.397605e-79,5.397605e-79,,5.397605e-79


## **1.5. Merged dataset**

In [8]:
df = pd.merge(pd.merge(pd.merge(demo, ques, how='right', on='SEQN'),
                       exam, how='left', on='SEQN'), lab, how='left', on='SEQN')
df

Unnamed: 0,SEQN,SDDSRVYR,RIDSTATR,RIAGENDR,RIDAGEYR,RIDAGEMN,RIDRETH1,RIDRETH3,RIDEXMON,RIDEXAGY,...,PHAGUMMN,PHQ050,PHAANTHR,PHAANTMN,PHQ060,PHASUPHR,PHASUPMN,PHAFSTHR,PHAFSTMN,PHDSESN
0,62161.0,7.0,2.0,1.0,22.0,,3.0,3.0,2.0,,...,,2.0,,,2.0,,,14.0,37.0,5.397605e-79
1,62162.0,7.0,2.0,2.0,3.0,,1.0,1.0,1.0,3.0,...,,,,,,,,,,
2,62163.0,7.0,2.0,1.0,14.0,,5.0,6.0,2.0,14.0,...,,2.0,,,2.0,,,17.0,55.0,1.000000e+00
3,62164.0,7.0,2.0,2.0,44.0,,3.0,3.0,1.0,,...,,2.0,,,2.0,,,11.0,6.0,5.397605e-79
4,62165.0,7.0,2.0,2.0,14.0,,4.0,4.0,2.0,14.0,...,,2.0,,,2.0,,,12.0,11.0,5.397605e-79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52587,124818.0,66.0,2.0,1.0,40.0,,4.0,4.0,1.0,,...,,2.0,,,2.0,,,6.0,4.0,1.000000e+00
52588,124819.0,66.0,2.0,1.0,2.0,,4.0,4.0,2.0,,...,,,,,,,,,,
52589,124820.0,66.0,2.0,2.0,7.0,,3.0,3.0,2.0,,...,,2.0,,,2.0,,,4.0,42.0,1.000000e+00
52590,124821.0,66.0,2.0,1.0,63.0,,4.0,4.0,1.0,,...,,2.0,,,2.0,,,10.0,8.0,5.397605e-79


In [8]:
df.to_csv('df_merged.csv', index = False)

In [8]:
df_original = df.copy()
df = df_original.copy()