In [1]:
import pandas as pd

In [2]:
# Load the sample risk-chart workbook (path is relative to the working directory).
raw_workbook = 'riskchartsampledata.xlsx'
cvd_data_raw = pd.read_excel(raw_workbook)

# Background reading (dataset source and related articles):
# https://figshare.com/articles/dataset/CVD_risk_assessment_sample_dataset/5480224
# https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0189389#pone-0189389-t004
# https://www.ahajournals.org/doi/10.1161/CIRCULATIONAHA.107.699579

In [3]:
# Keep only the columns used by the point conversion below.
selected_columns = [
    'Age',
    'Sex',
    'Family history of CVD',
    'Diabetes Mellitus',
    'High WHR',
    'Smoking status',
    'SBP',
    'Tch',
]
cvd_data_trimmed = cvd_data_raw[selected_columns]

In [4]:
# Preview the first ten rows of the trimmed frame.
cvd_data_trimmed.head(n=10)

Unnamed: 0,Age,Sex,Family history of CVD,Diabetes Mellitus,High WHR,Smoking status,SBP,Tch
0,55-64,male,No,Diabetes,No,Smoker,120-139 mmHg,150-200 mg/dL
1,65-74,male,No,Diabetes,No,Non-smoker,140-159 mmHg,<150 mg/dL
2,65-74,female,No,Non-diabetes,Yes,Non-smoker,>=160 mmHg,250-300 mg/dL
3,65-74,male,No,Diabetes,Yes,Smoker,>=160 mmHg,200-250 mg/dL
4,65-74,male,No,Non-diabetes,Yes,Non-smoker,>=160 mmHg,150-200 mg/dL
5,55-64,male,No,Diabetes,No,Smoker,120-139 mmHg,200-250 mg/dL
6,65-74,female,No,Non-diabetes,Yes,Smoker,>=160 mmHg,250-300 mg/dL
7,35-44,male,Yes,Diabetes,Yes,Smoker,120-139 mmHg,>=300 mg/dL
8,55-64,male,Yes,Diabetes,Yes,Smoker,120-139 mmHg,200-250 mg/dL
9,65-74,male,No,Diabetes,No,Non-smoker,>=160 mmHg,150-200 mg/dL


In [5]:
# FRS point conversion
#
# Each categorical reading is translated into Framingham Risk Score (FRS) points.
# The point tables are hard-coded per sex. If a value range in the data doesn't
# match the FRS ranges, the point values were best estimated.
FRS_POINTS = {
    'female': {
        'Age': {
            '35-44': 3,
            '45-54': 6,
            '55-64': 8.5,
            '65-74': 10.5,
            '>=75': 12,
        },
        'Tch': {
            '<150 mg/dL': 0,
            '150-200 mg/dL': 1,
            '200-250 mg/dL': 3,
            '250-300 mg/dL': 4.5,
            '>=300 mg/dL': 5,
        },
        'SBP': {
            '<120 mmHg': -3,
            '120-139 mmHg': 0.5,
            '140-159 mmHg': 3,
            '>=160 mmHg': 5,
        },
        # Points awarded when the status is exactly 'Smoker' / 'Diabetes'.
        'Smoking status': 3,
        'Diabetes Mellitus': 4,
    },
    'male': {
        'Age': {
            '35-44': 3.5,
            '45-54': 7,
            '55-64': 10.5,
            '65-74': 13,
            '>=75': 15,
        },
        'Tch': {
            '<150 mg/dL': 0,
            '150-200 mg/dL': 1,
            '200-250 mg/dL': 2.5,
            '250-300 mg/dL': 3.5,
            '>=300 mg/dL': 4,
        },
        'SBP': {
            '<120 mmHg': -2,
            '120-139 mmHg': 0.5,
            '140-159 mmHg': 2,
            '>=160 mmHg': 3,
        },
        'Smoking status': 4,
        'Diabetes Mellitus': 3,
    },
}

# One entry per input row, in row order; re-created on every run of this cell.
age_converted = []
tch_converted = []
sbp_converted = []
smoking_converted = []
diabetes_converted = []

for idx, sex, age, tch, sbp, smoking, diabetes in zip(
    cvd_data_trimmed.index,
    cvd_data_trimmed['Sex'],
    cvd_data_trimmed['Age'],
    cvd_data_trimmed['Tch'],
    cvd_data_trimmed['SBP'],
    cvd_data_trimmed['Smoking status'],
    cvd_data_trimmed['Diabetes Mellitus'],
):
    if sex not in FRS_POINTS:
        raise ValueError(
            "Row {}: unexpected Sex value {!r}; expected one of {}".format(
                idx, sex, list(FRS_POINTS)
            )
        )
    points = FRS_POINTS[sex]

    # Check the three banded readings before appending anything: an unknown or
    # missing label would otherwise append nothing and leave the five lists
    # with different lengths.
    for column, value in (('Age', age), ('Tch', tch), ('SBP', sbp)):
        if value not in points[column]:
            raise ValueError(
                "Row {}: unexpected {} value {!r}; expected one of {}".format(
                    idx, column, value, list(points[column])
                )
            )

    age_converted.append(points['Age'][age])
    tch_converted.append(points['Tch'][tch])
    sbp_converted.append(points['SBP'][sbp])

    # Any status other than the exact labels below (including missing values)
    # scores zero.
    smoking_converted.append(points['Smoking status'] if smoking == 'Smoker' else 0)
    diabetes_converted.append(points['Diabetes Mellitus'] if diabetes == 'Diabetes' else 0)

In [6]:
# Assemble the per-patient point table: sex first, then one column per risk factor.
points_by_column = dict(
    age_points=age_converted,
    tch_points=tch_converted,
    sbp_points=sbp_converted,
    smoking_points=smoking_converted,
    diabetes_points=diabetes_converted,
)
formatted_df = pd.DataFrame({'sex': cvd_data_trimmed['Sex'], **points_by_column})

In [7]:
# Sum the five component point columns into one score per patient.
# The columns are named explicitly instead of sliced by position so that
# re-running this cell (once 'total_points' or any later column exists on the
# frame) cannot fold those extra columns back into the sum.
POINT_COLUMNS = [
    'age_points',
    'tch_points',
    'sbp_points',
    'smoking_points',
    'diabetes_points',
]
formatted_df['total_points'] = formatted_df[POINT_COLUMNS].sum(axis=1)

In [8]:
# Display the point table, including the summed total_points column.
formatted_df

Unnamed: 0,sex,age_points,tch_points,sbp_points,smoking_points,diabetes_points,total_points
0,male,10.5,1.0,0.5,4,3,19.0
1,male,13.0,0.0,2.0,0,3,18.0
2,female,10.5,4.5,5.0,0,0,20.0
3,male,13.0,2.5,3.0,4,3,25.5
4,male,13.0,1.0,3.0,0,0,17.0
...,...,...,...,...,...,...,...
495,female,8.5,4.5,5.0,0,0,18.0
496,female,6.0,3.0,-3.0,0,0,6.0
497,male,7.0,0.0,0.5,0,0,7.5
498,female,3.0,4.5,-3.0,0,0,4.5


In [9]:
# Bucket each patient into a risk category from their total point score.
def categorize(row):
    """Return the risk label for one row of the point table.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row['sex']`` and ``row['total_points']``.

    Returns
    -------
    str or None
        ``'low_risk'``, ``'medium_risk'`` or ``'high_risk'``. The cut-offs
        are sex-specific: up to 5 / up to 13 points for ``'female'`` and
        up to 3 / up to 10 points for ``'male'`` (both bounds inclusive).
        ``None`` is returned when ``sex`` is neither of those two labels.
    """
    sex = row['sex']

    # Pick the (low ceiling, medium ceiling) pair for this sex.
    if sex == 'female':
        low_ceiling, medium_ceiling = 5, 13
    elif sex == 'male':
        low_ceiling, medium_ceiling = 3, 10
    else:
        return None

    score = row['total_points']
    if score <= low_ceiling:
        return 'low_risk'
    if score <= medium_ceiling:
        return 'medium_risk'
    return 'high_risk'

In [10]:
# Label every patient row with its risk category.
formatted_df['risk'] = formatted_df.apply(categorize, axis=1)

In [11]:
# Flatten the point table into an edge list: for every patient, one edge per
# point column, carrying that column's point value.
# The target columns are named explicitly rather than sliced by position, so
# adding or reordering columns on formatted_df cannot change which values are
# exported as edges.
EDGE_TARGET_COLUMNS = [
    'age_points',
    'tch_points',
    'sbp_points',
    'smoking_points',
    'diabetes_points',
]

source_id_list = []
source_name_list = []
source_type_list = []
target_id_list = []
target_name_list = []
target_type_list = []
edge_value_list = []

for idx, row in formatted_df.iterrows():
    # The same label serves as both the source id and the source name.
    patient_label = 'patient_' + str(idx)

    for target in EDGE_TARGET_COLUMNS:
        source_id_list.append(patient_label)
        source_name_list.append(patient_label)
        source_type_list.append(row['sex'])
        target_id_list.append(target)
        target_name_list.append(target)
        target_type_list.append('target')

        edge_value_list.append(row[target])

In [12]:
# Column name -> values mapping for the edge table (order sets the column order).
data = dict(
    source_id=source_id_list,
    source_name=source_name_list,
    source_type=source_type_list,
    target_id=target_id_list,
    target_name=target_name_list,
    target_type=target_type_list,
    edge_value=edge_value_list,
)

In [13]:
# Turn the edge lists into a DataFrame and display it.
formatted_edges_df = pd.DataFrame(data=data)

formatted_edges_df

Unnamed: 0,source_id,source_name,source_type,target_id,target_name,target_type,edge_value
0,patient_0,patient_0,male,age_points,age_points,target,10.5
1,patient_0,patient_0,male,tch_points,tch_points,target,1.0
2,patient_0,patient_0,male,sbp_points,sbp_points,target,0.5
3,patient_0,patient_0,male,smoking_points,smoking_points,target,4.0
4,patient_0,patient_0,male,diabetes_points,diabetes_points,target,3.0
...,...,...,...,...,...,...,...
2495,patient_499,patient_499,male,age_points,age_points,target,10.5
2496,patient_499,patient_499,male,tch_points,tch_points,target,1.0
2497,patient_499,patient_499,male,sbp_points,sbp_points,target,0.5
2498,patient_499,patient_499,male,smoking_points,smoking_points,target,4.0


In [14]:
# Write the edge table to a CSV file (path is relative to the working directory).
edges_csv_path = 'cvd_cv_data_formatted.csv'
formatted_edges_df.to_csv(edges_csv_path)