In [None]:
import numpy
import os
import glob
import random
import pandas
import math 
import pprint
import datetime

from framingham10yr.framingham10yr import framingham_10year_risk

from calculate_framingham_risk_score import calculate_framingham_risk_score
# My Utility Scripts
# from printd import printd
# from plots import plot_curves
# from plots import plot_boxes

In [None]:
# Import config
# `data` maps a dataset name to its loaded DataFrame; `paths` maps the same name
# to the CSV file it was read from.
data = dict()
paths = dict()

# Base names of the Synthea CSV exports to load. Each name is listed once so that
# no file is read (and printed) twice.
datasets = ['patients','observations','medications']

# Directory holding the CSV exports. Paths are built by plain string
# concatenation, so the trailing slash is required.
data_location = 'synthea/output/csv/'

In [None]:
# datasets = ['patients']

# Import csv data
# Load every configured CSV export into `data`, keyed by dataset name, with the
# column names lower-cased so later cells can rely on a single spelling.
# dict.fromkeys() removes repeated names while keeping the configured order, so a
# dataset that is listed more than once is still read only once.
for d in dict.fromkeys(datasets):
    paths[d] = data_location + d + '.csv'
    data[d] = pandas.read_csv(paths[d]).rename(str.lower, axis='columns')
    print('\nDataset ' + d)
    print('\nColumns')
    print(data[d].columns.values)

# Observation descriptions (after stripping the unit/suffix part) that are kept
# as model features. Order is irrelevant for the membership test that uses it.
features = [
    # blood pressure
    'Systolic Blood Pressure', 'Diastolic Blood Pressure',
    # lifestyle and body composition
    'Tobacco smoking status NHIS', 'Body Mass Index',
    # blood chemistry
    'Glucose', 'Triglycerides',
    # cholesterol panel
    'High Density Lipoprotein Cholesterol', 'Total Cholesterol',
    'Low Density Lipoprotein Cholesterol',
]



In [None]:
# Data Cleaning OBSERVATIONS data
# Remove observations that have no description. The frame is re-assigned because
# calling dropna(inplace=True) on a selected column does not remove any rows from
# the frame it was selected from.
data['observations'] = data['observations'].dropna(subset=['description'])

# The feature name is the text that precedes the first '-' or '[...]' in the
# description; column 0 of the split holds that leading text. The pattern is a raw
# string so that the backslashes reach the regex engine unchanged.
new_feature = data['observations']['description'].str.split(r"(\-|\[.*\])", n=1, expand=True)

data['observations'] = data['observations'].assign(feature=new_feature[0].str.strip())

# Keep only the observations whose feature name is one of the selected `features`.
data['obs features'] = \
    data['observations'].loc[data['observations']['feature'].isin(features),
                             ['date','patient','encounter','feature','value']]

# Process numerical features
# Values are matched as text: a missing or non-string value would otherwise give
# NaN in the mask, and a mask containing NaN cannot be used for boolean indexing.
is_numeric = (data['obs features']['value']
              .astype(str)
              .str.contains(r'^[\d\.]+$'))

data['obs floats'] = data['obs features'][is_numeric]

data['obs floats'] = data['obs floats'].assign(value=data['obs floats']['value']
                                                     .astype(float))

In [None]:
# Process categorical features
# Gather examples of smokers: one row per smoking-status observation, identified
# by encounter and patient.
is_smoking_status = (data['obs features']['feature']
                     .str.contains('Tobacco smoking status NHIS'))
smoking_status = data['obs features'].loc[is_smoking_status,
                                          ['encounter','patient','value']]

# Map the recorded status text to a boolean flag; statuses that are not listed
# here are left as they are.
smoker_flags = {'Former smoker':True,
                'Never smoker':False,
                'Current every day smoker':True}

data['smokers'] = (smoking_status
                   .replace(smoker_flags)
                   .rename(columns={'value':'smoker'}))

# Data Cleaning MEDICATIONS data
# Gather examples of medication for 'Hypertension' or high blood pressure
# NOTE(review): this drops every medication row that has a missing value in ANY
# column, which presumably includes prescriptions without a `stop` value - confirm
# that discarding those is intended.
data['medications'].dropna(inplace=True)

treats_hypertension = (data['medications']['reasondescription']
                       .str.contains('Hypertension'))

# Keep the prescription period per patient and mark every such row as treatment.
data['hypertension'] = (data['medications']
                        .loc[treats_hypertension, ['start','stop','patient']]
                        .assign(blood_pressure_med_treatment=True))

In [None]:
# Process PATIENTS data
# Keep only the columns needed downstream and give them the names used by the
# other frames ('patient' as the join key, 'sex' instead of 'gender').
patient_column_names = {'id':'patient', 'gender':'sex'}

data['patients'] = (data['patients']
                    .loc[:, ['id','birthdate','gender']]
                    .rename(columns=patient_column_names))

In [None]:
# Reshape from long to wide: one row per (date, encounter, patient) and one column
# per feature holding its value. pivot_table averages values that share the same
# key and feature (its default aggregation).
observation_key = ['date','encounter','patient']

wide_observations = data['obs floats'].pivot_table(values='value',
                                                   index=observation_key,
                                                   columns='feature')

# Turn the key back into ordinary columns and keep only rows in which every
# feature column has a value.
data['features'] = wide_observations.reset_index().dropna().copy()

In [None]:
# Combine patient and observation data
# Left join: every observation row is kept and receives the columns of its patient.
data['features'] = data['features'].merge(data['patients'],
                                          on='patient', how='left')

In [None]:
# Calculate age at time of observation
# Age in years = whole days between birth and observation, divided by 365.25.
observed_on = pandas.to_datetime(data['features']['date'])
born_on = pandas.to_datetime(data['features']['birthdate'])

data['features'] = data['features'].assign(age=(observed_on - born_on).dt.days / 365.25)

In [None]:
# Join on smoker status
# Left join on encounter and patient: observations without a smoking-status record
# for that encounter are kept and get a missing `smoker` value.
merged = (data['features']
          .merge(data['smokers'], on=['encounter','patient'], how='left')
          .copy())

In [None]:
# Join on hypertension medication status
# Every observation row is kept exactly once and flagged with whether one of the
# patient's hypertension prescriptions covers the observation date.
#
# Filtering the joined rows on "no prescription OR prescription covers the date"
# would silently discard all observations of patients whose prescriptions exist but
# do not cover the date, and would repeat an observation once per overlapping
# prescription. Both are avoided here by flagging first and de-duplicating second.
#
# The join result is stored under its own name so that `merged` is left untouched
# and the cell can be re-run.
with_prescriptions = pandas.merge(merged.assign(observation_row=range(len(merged))),
                                  data['hypertension'],
                                  how='left', on='patient')

# Comparisons against a missing start/stop evaluate to False, so observations of
# patients without any prescription are flagged as untreated.
on_treatment = ((with_prescriptions['start'] <= with_prescriptions['date']) &
                (with_prescriptions['stop'] >= with_prescriptions['date']))

# start/stop are only meaningful for the prescription that covers the observation.
with_prescriptions = with_prescriptions.assign(
    blood_pressure_med_treatment=on_treatment,
    start=with_prescriptions['start'].where(on_treatment),
    stop=with_prescriptions['stop'].where(on_treatment))

# Stable sort puts a covering prescription (True) first, so drop_duplicates keeps it
# whenever one exists; afterwards the original observation order is restored.
data['features'] = (with_prescriptions
                    .sort_values('blood_pressure_med_treatment',
                                 ascending=False, kind='mergesort')
                    .drop_duplicates(subset='observation_row')
                    .sort_values('observation_row')
                    .drop(columns='observation_row'))

In [None]:
# Clean feature names
# Short snake_case replacements for the long observation names. Columns that are
# not listed here are only lower-cased by the second rename below.
new_feature_names = {
    'Body Mass Index':                      'bmi',
    'Total Cholesterol':                    'total_cholesterol',
    'High Density Lipoprotein Cholesterol': 'hdl_cholesterol',
    'Low Density Lipoprotein Cholesterol':  'ldl_cholesterol',
    'Systolic Blood Pressure':              'systolic_blood_pressure',
    'Diastolic Blood Pressure':             'diastolic_blood_pressure',
}

renamed_features = data['features'].rename(columns=new_feature_names)
data['features'] = renamed_features.rename(columns=str.lower)

# Columns that identify one observation row.
new_key = ['date', 'encounter', 'patient']

# Numerical columns, using the cleaned (snake_case, lower-cased) names.
new_num_features = [
    'bmi',
    'diastolic_blood_pressure', 'systolic_blood_pressure',
    'glucose',
    'hdl_cholesterol', 'ldl_cholesterol', 'total_cholesterol', 'triglycerides',
    'age',
    'framingham',
]

# Categorical / boolean columns.
new_cat_features = ['sex', 'smoker', 'blood_pressure_med_treatment']


In [None]:
# Framingham score
# The project helper receives the feature frame and the name 'framingham'
# (presumably the column it adds - confirm in calculate_framingham_risk_score).
data['features'] = calculate_framingham_risk_score(data['features'], 'framingham')

# Selecting the full column list doubles as a check that every expected column
# exists; the resulting names are displayed as the cell output.
selected_columns = new_key + new_cat_features + new_num_features
data['features'][selected_columns].columns.values

In [None]:
# Randomly sample from key 'patients' to create unique entry for each patient
# NOTE(review): no definition or import of `sample_from_key_to_unique` is visible in
# this notebook, so on a fresh kernel the call raised NameError. The helper below
# implements what the comment above describes - confirm it matches the original
# helper if one exists elsewhere.
def sample_from_key_to_unique(data, key, random_state=None):
    """Return one randomly chosen row of `data` for every distinct value of `key`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that may contain several rows per key value.
    key : str or list of str
        Column name(s) whose values must be unique in the result.
    random_state : optional
        Passed to ``DataFrame.sample``; set it to make the selection reproducible.

    Returns
    -------
    pandas.DataFrame
        A subset of the rows of `data`, in their original order, with exactly one
        row per distinct `key` value.
    """
    # Shuffling all rows and keeping the first occurrence of each key picks a
    # uniformly random row per key.
    shuffled = data.sample(frac=1, random_state=random_state)
    return shuffled.drop_duplicates(subset=key).sort_index()


data['unique_features'] = sample_from_key_to_unique(data=data['features'], key='patient')

In [None]:
print('New features')
print(data['features'][new_key+new_cat_features+new_num_features].columns.values)
# Can save at this point
# `datetime` and `os` come from the import cell at the top of the notebook.
target_dir = 'data/'
# The number in the file name is the row count of the frame that is actually
# written (data['unique_features']), followed by today's date as YYYYMMDD.
target_file_name = 'california_' + str(len(data['unique_features'])) + '_' + datetime.date.today().strftime("%Y%m%d")
print(target_dir + target_file_name)

# to_pickle does not create missing directories, so make sure the target exists.
os.makedirs(target_dir, exist_ok=True)
data['unique_features'].to_pickle(target_dir + target_file_name)

In [None]:
# Call the packaged helper and display describe() of what it returns.
# Only the function is imported: a plain module import under the same name would be
# rebound by this line immediately.
from process_synthea_patient_data import process_synthea_patient_data

data_location = 'synthea/output/csv/'
target_dir = 'data/'
# File name: fixed '50000' tag followed by today's date as YYYYMMDD.
target_file_name = 'california_50000_' + datetime.date.today().strftime("%Y%m%d")

process_synthea_patient_data(data_dir=data_location,
                             data_save_dir=target_dir,
                             data_save_name=target_file_name).describe()